Advertisement
Guest User

Subkorpora

a guest
Mar 7th, 2015
369
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #/usr/bin/perl -w
  2.  
  3. #Dieses Modul erstellt Subkorpora
  4.  
  # Path of the complete corpus file that is split into texts below.
  my $input  = "../land_der_traeume_gesamtkorpus_CWB_fertig.vrt";
  # Path of the statistics file. It is created/truncated here, but nothing in
  # this script writes any content to it.
  my $output = '../satzanzahl_schnitt_uber5.txt';

  # Read the whole corpus in a single slurp. The earlier
  # `while (<INPUT>) { join("", <INPUT>) }` consumed the first line of the file
  # in the loop condition and discarded it, and never set @array at all when
  # the file was empty. The first line is now part of the first chunk.
  open(my $in_fh, '<', $input)
      or die "Cannot open '$input' for reading: $!\n";
  my $alles = do { local $/; <$in_fh> };   # $/ undefined => read to end of file
  close($in_fh);
  $alles = '' unless defined $alles;

  # Keep the original side effect of creating (or emptying) the statistics
  # file. A failure here is only reported, because the rest of the script does
  # not depend on this file.
  if (open(my $stats_fh, '>', $output)) {
      close($stats_fh);
  }
  else {
      warn "Cannot open '$output' for writing: $!\n";
  }

  # Cut the corpus at every closing "</text" tag: one element per text, plus
  # whatever follows the last closing tag.
  my @array = split(/<\/text/, $alles);
  15.  
  # Go through all text chunks. For every chunk that carries a drug="..."
  # attribute, remember the drug name and two size figures for the chunk.
  for my $text (@array) {
      # Chunks without a drug attribute are skipped entirely.
      next unless $text =~ m/drug="(.*?)"/;
      my $name = $1;
      push(@drugarray, $name);

      # Pieces obtained by cutting the chunk at newlines.
      my @lines = split(/[\n]/, $text);
      # Pieces obtained by cutting the chunk at every literal "$.".
      my @sentences = split(/\$\./, $text);

      my $line_count     = scalar(@lines);
      my $sentence_count = scalar(@sentences);

      # One "name<TAB>count" record per chunk in each list.
      push(@zaehlarray,      "$name\t$line_count\n");
      push(@satzlaengearray, "$name\t$sentence_count\n");
  }
  28.  
  # del_double(LIST)
  #
  # Returns the distinct values of LIST, each exactly once, in the order in
  # which they first occur. In scalar context the number of distinct values is
  # returned.
  #
  # The earlier version ignored its arguments and read the global @drugarray
  # instead, and it returned the keys of a hash, whose order is undefined.
  sub del_double {
      my %seen;
      # A value passes the filter only the first time it is encountered.
      return grep { !$seen{$_}++ } @_;
  }
  34.  
  # Reduce the collected drug names to one entry per drug.
  @drugarray = del_double(@drugarray);
  # NOTE(review): the first entry of the de-duplicated list is discarded, so no
  # subcorpus is written for that drug. Kept as in the original; confirm that
  # this is really intended.
  shift(@drugarray);

  # Write one plain-text subcorpus file per remaining drug name.
  foreach my $droge (@drugarray) {
      my $path = '../subkorpora/subkorpus' . $droge . ".txt";
      open(my $out_fh, '>', $path)
          or die "Cannot open '$path' for writing: $!\n";

      foreach my $text (@array) {
          # \Q...\E makes the drug name match literally, even if it contains
          # characters that are special in a regular expression.
          next unless $text =~ m/drug="\Q$droge\E"/;

          # Work on a copy so the chunks in @array are not altered.
          my $plain = $text;

          # Replace every line that has at least two tabs by whatever follows
          # the second tab, and join the results with single spaces.
          $plain =~ s/.*?\t.*?\t(.*?)\n/$1 /g;
          # Transliterate the lowercase German umlauts and sharp s to ASCII.
          $plain =~ s/ä/ae/g;
          $plain =~ s/ö/oe/g;
          $plain =~ s/ü/ue/g;
          $plain =~ s/ß/ss/g;
          # Drop the opening <text ...> tag and the literal placeholder @card@.
          $plain =~ s/<text.*?">//g;
          $plain =~ s/\@card\@//g;

          print {$out_fh} $plain;
      }

      # Buffered write errors only surface when the handle is closed.
      close($out_fh) or die "Cannot close '$path': $!\n";
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement