# This is a perl script

# Program which takes a set of files from the mul11 
# (credit_suisse) eci corpus, processing the texts by gunzipping,
# converting to iso-latin-1 character set and removing markup and
# control codes.
# Concatenates all the files, separated by <div0 type=file>
# and prints to the standard output
#
# This is not perfect but it provides most of the text with the
# minimum of garbage.

# Slurp mode: with the input record separator undefined, a single <FH>
# read returns the entire stream.
# The old `$* = 1` (global multi-line matching) has been dropped: the
# variable was removed from Perl and its use is a fatal error in current
# releases.  None of the visible patterns applied to the slurped text is
# anchored with ^ or $, so matching is unaffected.
undef($/);

# Each file's text is wrapped in <div0 type=file n="FILE"> ... </div0>

# Main loop: for every file named on the command line, decompress it,
# run it through the ebdc2iso converter, strip corpus markup and control
# codes, wrap the result in a <div0 type=file> element and print it to
# standard output.
#
# NOTE(review): several patterns below look empty or contain "[^]".
# They appear to hold raw control / 8-bit bytes that do not display in
# most viewers -- keep those lines byte-for-byte when editing, and keep
# the substitutions in this order, since later ones see the output of
# earlier ones.
while($file=shift){
    # Character convert to ISO-LATIN-1
    die "Must set ECI_ROOT" unless  ($basedir = $ENV{"ECI_ROOT"});
    $basedir =~ s|/$||;		# Remove final / if any
    # NOTE(review): $file and $basedir are interpolated into a shell
    # command unquoted; names containing spaces or shell metacharacters
    # will break the pipeline.
    # A failed piped open means the fork failed; the reason is in $!
    # ($? is only set once the child has been waited for).
    die "Couldn't access $file: $!" unless
	open(IN,"zcat $file | $basedir/bin/unix/ebdc2iso |");
    $_ = <IN>;			# whole converted file (slurp mode)
    # Closing a pipe waits for the child and sets $?; report a failing
    # pipeline instead of silently emitting a truncated or empty <div0>.
    close(IN) || warn "Pipeline for $file exited with status $?\n";
    # Tabs and newlines: drop the ones already present, then map the
    # (non-displaying) codes in the next three patterns to newline / tab.
    s/\t//g;
    s/\n//g;
    s//\n/g;
    s//\t/g;
    s//\n/g;
    s/\240/ /g; # non-breaking space
    # Wrap the text in its per-file element; n= records the file name
    # exactly as given on the command line.
    $_ = "<div0 type=file n=\"$file\">\n<p>\n".$_."\n\n</div0>\n";
    # Special codes
    s#\213M(([^\213]|\213[^M]|\213M[^])*)\213M##g;
    s#(([^Y]|Y[^])*)Y##g;
    s#M\n##g;
    s#M##g;
    s#J\n##g;
    s###g;
    s###g;
    s#\213J##g;
    s###g;
    s###g;
    s###g;
    s#([^]*)##g;
    s#(([^]|[^])*)#\n\n#g;
    s#[^]*ݡ##g;
    s#[^]*##g;
    s#[]*##g;
    s###g;
    s#\213M(([^\213]|\213[^M]|\213M[^])*)\213+M##g;
    s###g;
    # Once per file: delete everything between the opening <div0 ...><p>
    # header and the first "}" (inclusive) -- presumably a leading header
    # block in the source text; verify against the corpus.  $1 is the
    # correct way to refer to a capture on the replacement side.
    s#(<div0 [^>]*>\n<p>\n)[^}]*}#$1#;
    # Final cleanup
    # Strip remaining control characters, keeping tab (\011) and
    # newline (\012) ...
    s/[\000-\010\013-\037]//g;
    # ... and every byte in the range \200-\237.
    s/[\200-\237]//g;
    print;
}

