Advertisement
Guest User

people extract

a guest
Jun 29th, 2018
1,792
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.52 KB | None | 0 0
  # Scrapes a people directory from a lynx text dump and prints one
  # pipe-delimited row per person on STDOUT.
  #
  # $count tracks how many columns of the current row have been emitted
  # so far (0 = between records).  A complete row has 7 columns:
  #   1 name | 2 position | 3 email | 4 phone | 5 location | 6 supervisor | 7 group
  # (columns 4-7 are named after the patterns that select them below;
  # the original notes only spell out name|position|email.)
  use strict;
  use warnings;

  # Page to dump; placeholder value kept from the original paste.
  my $url = '[url redacted]';

  # Office-location markers (presumably campus building codes -- confirm).
  my $location_re = qr/PAS|TT|EV1|NH/;

  # Lines that close a record by naming the person's group.
  my $group_re = qr/Faculty|Graduate|Staff|Emerit(?:a|us)/;

  # Returns $text with everything up to and including the first ']'
  # removed, i.e. without a leading "[12]" link marker.  Text that has
  # no ']' is returned unchanged (index() yields -1, so the offset is 0).
  sub strip_link_marker {
      my ($text) = @_;
      return substr($text, index($text, ']') + 1);
  }

  # List-form pipe open: the URL is handed to lynx as a single argument
  # and never passes through a shell.
  open(my $lynx, '-|', '/usr/bin/lynx', '-dump', $url)
      or die "Cannot run /usr/bin/lynx: $!\n";
  my @lines = <$lynx>;
  close($lynx)
      or warn "lynx did not exit cleanly (status $?)\n";

  my $count = 0;

  LINE:
  foreach my $line (@lines) {
      chomp($line);

      # Everything from this marker onwards is ignored.
      last LINE if $line =~ /Waterloo Arts/;

      # An unindented "[n]..." line starts a new record with the name.
      if ($line =~ /^\[/ && $count == 0) {
          print strip_link_marker($line), '|';
          $count = 1;
          next LINE;
      }

      # Same, but the previous record never got its group line:
      # close that row first, then start the new one.
      if ($line =~ /^\[/ && ($count == 5 || $count == 6)) {
          print "|\n", strip_link_marker($line), '|';
          $count = 1;
          next LINE;
      }

      # Removing the leading white space (see http://perlmaven.com/trim).
      $line =~ s/^\s+//;

      if ($line =~ /[Hh]ead shot/) {
          next LINE;
      }
      elsif ($count == 1 && $line =~ /\@/) {
          # E-mail directly after the name: emit an empty position
          # column first so the remaining columns stay aligned.
          print '|', strip_link_marker($line), '|';
          $count = 3;
      }
      elsif ($count == 1 && $line =~ /[a-zA-Z]/) {
          # First textual line after the name is the position.
          print "$line|";
          $count = 2;
      }
      elsif ($count == 2 && $line =~ /\@/) {
          print strip_link_marker($line), '|';
          $count = 3;
      }
      elsif ($count == 3 && $line =~ /[1-9]/ && $line !~ $location_re) {
          # A line with digits that is not a location is the phone number.
          print "$line|";
          $count = 4;
      }
      elsif (($count == 3 || $count == 4) && $line =~ $location_re) {
          # No phone was seen: leave that column empty.
          print '|' if $count == 3;
          print "$line|";
          $count = 5;
      }
      elsif ($count == 4 && $line =~ /Location/) {
          next LINE;
      }
      # At this point, after location, no field with [xx] will be
      # included, nor will bullet points.  Also a hard-coded exception:
      # Rita has a blurb we aren't including.
      elsif (   $line =~ /^\[/
             || $line =~ /^\*/
             || $line =~ /^\+/
             || $line =~ /Items of Interest/)
      {
          next LINE;
      }
      elsif ($count == 5 && $line =~ /Supervisor/) {
          print "$line|";
          $count = 6;
      }
      elsif ($count >= 2 && $line =~ $group_re) {
          # Collapse any "... Emeritus" wording to the bare group name.
          $line = 'Emeritus' if $line =~ /Emeritus/;

          # Pad the columns that were never seen, then finish the row.
          # $count is between 2 and 6 here, so this prints 0 to 4 pipes.
          print '|' x (6 - $count);
          print "$line\n";
          $count = 0;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement