Advertisement
Guest User

Textlängendurchschnitt

a guest
Mar 7th, 2015
190
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 1.33 KB | None | 0 0
  #!/usr/bin/perl
  #
  # Determines the length of every post and the average post length per drug.
  #
  # The input file is read completely and cut at every occurrence of `<text`.
  # Every resulting chunk that carries a drug="..." attribute counts as one
  # post of that drug. The length of a post is the number of newline-separated
  # lines in its chunk (trailing empty lines are not counted, because `split`
  # drops trailing empty fields). Only drugs with MORE than $min_posts posts
  # are reported. The report is tab-separated and sorted by descending average.

  use strict;
  use warnings;

  # No `use utf8` on purpose: the header literal printed below is emitted as
  # the raw bytes stored in this file, so the report stays byte-identical.

  my $input     = "../land_der_traeume_gesamtkorpus_CWB_fertig.vrt";
  my $output    = '../durschnittl_textlaenge_uber5.txt';
  my $min_posts = 5;    # a drug needs more than this many posts to be listed

  # del_double(@items)
  # Returns the distinct values of @items, keeping first-seen order.
  # (Works on its arguments; it no longer reads a global array, and the order
  # of the result is deterministic.)
  sub del_double {
      my %seen;
      return grep { !$seen{$_}++ } @_;
  }

  # post_lengths($corpus)
  # Splits $corpus at every `<text` and returns one [ $drug, $line_count ]
  # pair per chunk that contains a drug="..." attribute. Chunks without the
  # attribute (e.g. anything in front of the first `<text`) are skipped.
  sub post_lengths {
      my ($corpus) = @_;
      my @posts;
      for my $chunk (split /<text/, $corpus) {
          next unless $chunk =~ m/drug="(.*?)"/;
          my $drug  = $1;
          my @lines = split /\n/, $chunk;
          push @posts, [ $drug, scalar @lines ];
      }
      return @posts;
  }

  # average_lengths($threshold, @posts)
  # Takes the pairs produced by post_lengths() and returns a hash that maps
  # each drug with more than $threshold posts to its average post length.
  # Totals are accumulated per drug in a single pass, so drug names are never
  # interpolated into a regex and the divisor can never be zero.
  sub average_lengths {
      my ($threshold, @posts) = @_;
      my (%total, %count);
      for my $post (@posts) {
          my ($drug, $length) = @{$post};
          $total{$drug} += $length;
          $count{$drug}++;
      }

      my %average_of;
      for my $drug (del_double(map { $_->[0] } @posts)) {
          next if $count{$drug} <= $threshold;
          $average_of{$drug} = $total{$drug} / $count{$drug};
      }
      return %average_of;
  }

  # Slurp the whole corpus. Reading it in one go (instead of inside a
  # `while (<FH>)` loop) keeps the first line of the file, which may be the
  # opening tag of the first post.
  open(my $in, '<', $input) or die "Cannot read '$input': $!\n";
  my $corpus = do { local $/; <$in> };
  close($in) or die "Cannot close '$input': $!\n";
  $corpus = '' unless defined $corpus;

  my %average_of = average_lengths($min_posts, post_lengths($corpus));

  open(my $out, '>', $output) or die "Cannot write '$output': $!\n";
  print {$out} "Droge\tTextlängendurchschnitt\n";

  # Highest average first; equal averages are ordered by drug name so that
  # repeated runs produce the same file.
  for my $drug (sort { $average_of{$b} <=> $average_of{$a} or $a cmp $b }
                keys %average_of) {
      print {$out} "$drug\t$average_of{$drug}\n";
  }

  # Buffered write errors only surface on close, so check it.
  close($out) or die "Cannot close '$output': $!\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement