Advertisement
Guest User

Untitled

a guest
Jul 20th, 2017
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.33 KB | None | 0 0
  1. #!/usr/bin/perl -w
  2.  
  3. use strict;
  4. #use lib '/opt/funnelback/lib/perl/';
  5. use File::Find;
  6. #use Funnelback::Config;
  7. #use Funnelback::NumbersTextTimes;
  8. use Data::Dumper;
  9.  
  10. #if ( ( $#ARGV + 1 ) < 2 ) {
  11.     my $numArgs = $#ARGV + 1;
  12.     print $numArgs;
  13.     print "\nGoldsmiths College Pre-processing Script\n";
  14.     print "Usage: $0 <collection config file> <live or offline>\n\n";
  15. #    exit 1;
  16. #}
  17.  
  18. my %courses = ();
  19. my $data_path = $ARGV[0];
  20. print $ARGV[0];
  21. # Recurse through the data directory and process each file
  22. find(\&pre_process_file, $data_path);
  23.  
  24. print '()<?xml version="1.0" encoding="UTF-8" standalone="no" ?>' . "\n<courses>\n";
  25. #print Dumper(\%courses);
  26.  
  27. die Dumper(%courses);
  28. foreach my $k (keys %courses){
  29. #   my $values = $courses{$k};
  30. #   my %values = %{$courses{$k}};
  31.     die Dumper($courses{$k});
  32.  
  33. #   print $values->{'title'};
  34.     print "<course>\n";
  35. #   print $values{'title'};
  36.     print "</course>\n";
  37. }
  38. print "</courses>\n";
  39.  
  40. sub pre_process_file {
  41.     my $file = $_;
  42.  
  43.     return if (not defined $file);
  44.  
  45.     # Skip directories and binary documents
  46.     if (-d $file) {
  47.         return;
  48.     }
  49.     my $path = $File::Find::name;
  50.     if($path){ #=~ /ugrad\/study\/subject.php/i
  51.       # Open file
  52.       if (not open(DATA,  "<", $file)) {
  53.          warn "Goldsmiths College Pre-processing: Unable to read file $file ($!)";
  54.          return;
  55.       }
  56.  
  57.        # Read the file contents into memory
  58.       my @content = <DATA>;
  59.       close(DATA);
  60.       my $content = join("", @content);
  61.       my %course = ();
  62.       my $title;
  63.    
  64.       if($content =~ /<div\sclass="box">\s*<.{2}>(.*?)<.{3}>/msi){
  65.         $title = $1;
  66.     #   print $title;
  67.         $course{"title"} = $title;
  68.         if($content =~ /BASE HREF="(.*?)"/i){
  69.             $course{"url"} = $1;
  70.         }
  71.         if($content =~ /<b>Course Length:<\/b><br(\s| )\/>\s*(.*?)<\/d/msi){
  72.             $course{"course_length"} = $2;
  73.         }
  74.         if($content =~ /<b>find out more:<\/b><br(\s| )\/>\s*(.*?)<\/d/msi){
  75.             my $fom = $2;
  76.             if($fom =~/<a\shref="(.*?)"\son/msi){
  77.                 $course{"bookletUrl"} = $1;
  78.             }
  79.         }
  80.         if($content =~ /<div\sclass="two_col_r_prospectus\sfloatleft">\s*<p>(.*?)<\/p>\s*<h3/msi){
  81.             $course{"info"} = $1;
  82.         }
  83.         if($content =~ /<b>UCAS:<\/b><br(\s| )\/>\s*(.*?)<\/d/msi){
  84.             $course{"ucas"} = $2;
  85.         }
  86.  
  87.     $courses{$title} = \%course; # only fill this if you have a title
  88.       }
  89.    
  90.  
  91. }
  92. }
  93.  
  94. # Done
  95. exit 0;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement