SHARE
TWEET

Satzlängendurchschnitt

a guest Mar 7th, 2015 61 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #!/usr/bin/perl -w
  # (The original line read "#/usr/bin/perl -w" without the "!", which made it
  # an ordinary comment instead of an interpreter line.)

  # This script determines, for every text of the corpus, how long it is and
  # how many sentence pieces it has, as the basis for per-drug-type averages.
  #
  # This first part reads the corpus and fills three package globals that the
  # remainder of the script consumes:
  #   @drugarray       - the drug="..." label of every matching text
  #                      (duplicates included)
  #   @zaehlarray      - one "<drug>\t<line count of the text chunk>\n" per text
  #   @satzlaengearray - one "<drug>\t<piece count after splitting on '$.'>\n"
  #                      per text
  # The names and the string format are kept as they are because later code
  # in this file reads them; for the same reason `use strict` is not switched
  # on here.

  $input  = "../land_der_traeume_gesamtkorpus_CWB_fertig.vrt";
  $output = '../satzanzahl_schnitt_uber5.txt';

  # Three-argument open with explicit checks, so that a missing corpus or an
  # unwritable target aborts with a message instead of silently producing an
  # empty report. OUTPUT stays a bareword handle because the reporting code
  # further down prints to it.
  open(my $in_fh, '<', $input)
      or die "Cannot open '$input' for reading: $!\n";
  open(OUTPUT, '>', $output)
      or die "Cannot open '$output' for writing: $!\n";

  # my %hash;

  # Slurp the complete corpus in one read. The former
  # `while (<INPUT>) { $alles = join("", <INPUT>); ... }` consumed the first
  # line of the file in the loop condition and discarded it, so a <text ...>
  # header on that line was lost.
  $alles = do { local $/; <$in_fh> };
  $alles = '' unless defined $alles;
  close($in_fh);

  # Every element after the first starts right behind a "<text" marker.
  @array = split(/<text/, $alles);

  foreach my $chunk (@array) {
      # Chunks without a drug attribute (e.g. whatever precedes the first
      # "<text") are ignored.
      next unless $chunk =~ m/drug="(.*?)"/;
      $drug = $1;
      push(@drugarray, $drug);

      # Number of newline-separated lines in the chunk; this includes any
      # markup lines, not only token lines.
      my @lines = split(/[\n]/, $chunk);

      # Number of pieces obtained by cutting the chunk at every literal "$."
      # (presumably the POS tag of sentence-final punctuation - TODO confirm).
      # NOTE(review): this is a piece count, i.e. one more than the number of
      # "$." occurrences whenever text follows the last one - verify that this
      # is the intended sentence count.
      my @pieces = split(/\$\./, $chunk);

      $laenge     = @lines;
      $satzlaenge = @pieces;
      push(@zaehlarray,      "$drug\t$laenge\n");
      push(@satzlaengearray, "$drug\t$satzlaenge\n");
  }
  30.  
  # Returns the distinct values of a list, in order of first appearance.
  #
  # Arguments: the list to de-duplicate. When called without any arguments
  #            the package global @drugarray is used instead, which is what
  #            the earlier implementation always read, no matter what it was
  #            passed.
  # Returns:   the unique values (in scalar context: how many there are).
  #
  # The earlier version returned `keys` of a hash, whose order is unspecified
  # and may differ from run to run; keeping first-seen order makes the result
  # reproducible.
  sub del_double {
          my @values = @_ ? @_ : @main::drugarray;
          my %seen;
          return grep { !$seen{$_}++ } @values;
  }
  36.  
  # Aggregates the per-text figures by drug label and writes the report.
  #
  # Reads the package globals @drugarray, @zaehlarray and @satzlaengearray
  # (entries of the form "<drug>\t<number>\n") and prints to the already
  # opened OUTPUT handle. Only drug labels with more than five texts are
  # reported.

  # Reduce the list of drug labels to its distinct values.
  @drugarray = del_double(@drugarray);

  # The former `shift(@drugarray);` at this point has been removed: it dropped
  # one drug label from the statistics for no visible reason (and, when the
  # list comes back in hash-key order, a different one on each run).

  # Sum up the figures per drug label in a single pass over each list. The
  # label is taken literally from the entry instead of being interpolated
  # into a regex, so labels containing characters such as "+", "(" or "."
  # are matched correctly and can no longer end in a division by zero.
  my (%line_total, %piece_total, %text_count);

  foreach my $entry (@zaehlarray) {
      next unless $entry =~ m/\A(.*)\t(\d+)\n\z/;
      $line_total{$1} += $2;
  }

  foreach my $entry (@satzlaengearray) {
      next unless $entry =~ m/\A(.*)\t(\d+)\n\z/;
      $piece_total{$1} += $2;
      $text_count{$1}++;
  }

  # Per-drug averages: lines per text, sentence pieces per text, and lines
  # per sentence piece.
  %hash_wort       = ();
  %hash_satz       = ();
  %hash_satzlaenge = ();

  foreach my $droge (@drugarray) {
      my $counter = $text_count{$droge} || 0;

      # Labels with five or fewer texts are left out of the report.
      next if $counter <= 5;

      my $schnittwoerter = ($line_total{$droge}  || 0) / $counter;
      my $schnittsaetze  = ($piece_total{$droge} || 0) / $counter;

      $hash_wort{$droge}       = $schnittwoerter;
      $hash_satz{$droge}       = $schnittsaetze;
      # Guard against a zero average so that this ratio can never abort the
      # run.
      $hash_satzlaenge{$droge} = $schnittsaetze
                               ? $schnittwoerter / $schnittsaetze
                               : 0;
  }

  print OUTPUT "Droge\tSatzanzahlsdurchschnitt\n";

  # foreach my $key (sort {$hash_wort{$b} <=> $hash_wort{$a}} keys %hash_wort) {
  #       print "$key\t$hash_wort{$key}\n";
  # }

  # Highest average first; equal averages are ordered by label so that the
  # report is identical from run to run.
  foreach my $key (sort { $hash_satz{$b} <=> $hash_satz{$a} or $a cmp $b }
                   keys %hash_satz) {
      print OUTPUT "$key\t$hash_satz{$key}\n";
  }

  # Buffered write errors only surface when the handle is closed.
  close(OUTPUT) or die "Error while closing the report file: $!\n";
RAW Paste Data
Top