Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
# This script determines, for every text in the corpus, its length and its
# number of sentences, and from these the averages per drug type.
#
# This first section reads the corpus file and fills three package globals
# that the rest of the script consumes:
#   @drugarray       - the drug label of every text (duplicates included)
#   @zaehlarray      - one "<drug>\t<number of lines>\n" entry per text
#   @satzlaengearray - one "<drug>\t<number of '$.' fields>\n" entry per text
# It also opens the OUTPUT handle that the report section prints to.
# These names stay package globals (no "use strict" here) because the later
# sections of the script refer to them unqualified.
$input  = "../land_der_traeume_gesamtkorpus_CWB_fertig.vrt";
$output = '../satzanzahl_schnitt_uber5.txt';

# Three-argument open with explicit checks: a missing corpus or an unwritable
# target must stop the run instead of silently producing an empty report.
open(my $in, '<', $input)  or die "Cannot open '$input' for reading: $!";
open(OUTPUT, '>', $output) or die "Cannot open '$output' for writing: $!";

# Slurp the whole file in one read. The previous "while (<INPUT>)" wrapper
# consumed the first line before the slurp and threw it away, which cut the
# header off the first text.
$alles = do { local $/; <$in> };
close($in);

# Every chunk after the first starts right behind a "<text" tag.
@array = split(/<text/, $alles);

foreach my $text (@array) {
    # Chunks without a drug attribute (for instance whatever precedes the
    # first "<text") are not counted.
    next unless $text =~ m/drug="(.*?)"/;
    my $drug = $1;
    push(@drugarray, $drug);

    # Text length is measured in newline-separated lines of the chunk;
    # presumably one token or tag per line in the .vrt format - TODO confirm.
    my @lines = split(/\n/, $text);

    # Sentence count is the number of fields obtained by splitting on the
    # literal "$."; presumably the sentence-final punctuation tag - TODO confirm.
    my @sentence_fields = split(/\$\./, $text);

    push(@zaehlarray,      "$drug\t" . scalar(@lines) . "\n");
    push(@satzlaengearray, "$drug\t" . scalar(@sentence_fields) . "\n");
}
# Return the distinct values of a list, each kept once, in order of first
# appearance (so the result is the same on every run).
#
# Arguments: the list to de-duplicate. When called without any arguments the
#            sub falls back to the package global @drugarray, which is the
#            only input it read before it honoured its arguments.
# Returns:   the de-duplicated list (its element count in scalar context).
sub del_double {
    our @drugarray;    # package global used as the no-argument fallback
    my @values = @_ ? @_ : @drugarray;

    my %seen;
    return grep { !$seen{$_}++ } @values;
}
# Aggregate the per-text counts into per-drug averages and write the report.
#
# Reads the package globals @drugarray, @zaehlarray and @satzlaengearray
# (entries of the form "<drug>\t<count>\n") and prints to the OUTPUT handle,
# all of which are set up earlier in the script.

# Reduce the drug list to its distinct labels. No element is shifted off
# afterwards: every entry is a real drug label, so dropping one would remove
# an arbitrary drug from the report.
@drugarray = del_double(@drugarray);

# Sum the counts per drug in a single pass. Keying a hash by the exact label
# avoids interpolating the label into a regex, where characters such as "+",
# "." or "(" would be read as pattern syntax.
my (%lines_total, %sentences_total, %text_count);
foreach my $entry (@zaehlarray) {
    next unless $entry =~ m/\A(.*)\t(\d+)\n\z/s;
    $lines_total{$1} += $2;
}
foreach my $entry (@satzlaengearray) {
    next unless $entry =~ m/\A(.*)\t(\d+)\n\z/s;
    $sentences_total{$1} += $2;
    $text_count{$1}++;
}

my (%hash_wort, %hash_satz, %hash_satzlaenge);
foreach my $droge (@drugarray) {
    my $counter = $text_count{$droge} || 0;

    # Only drugs with more than five texts are reported. Checking this before
    # dividing also rules out a division by zero.
    next unless $counter > 5;

    my $schnittwoerter = ($lines_total{$droge}     || 0) / $counter;
    my $schnittsaetze  = ($sentences_total{$droge} || 0) / $counter;

    $hash_wort{$droge}       = $schnittwoerter;    # average text length
    $hash_satz{$droge}       = $schnittsaetze;     # average sentence count
    $hash_satzlaenge{$droge} =                     # average sentence length
        $schnittsaetze ? $schnittwoerter / $schnittsaetze : 0;
}

print OUTPUT "Droge\tSatzanzahlsdurchschnitt\n";

# Alternative report, ordered by average text length instead:
# foreach my $key (sort {$hash_wort{$b} <=> $hash_wort{$a}} keys %hash_wort) {
#     print "$key\t$hash_wort{$key}\n";
# }

# Highest average sentence count first; equal averages are ordered by label
# so that the report is identical on every run.
foreach my $key (sort { $hash_satz{$b} <=> $hash_satz{$a} or $a cmp $b } keys %hash_satz) {
    print OUTPUT "$key\t$hash_satz{$key}\n";
}

# Buffered write errors only surface when the handle is closed.
close(OUTPUT) or die "Cannot close '$output': $!";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement