Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
use strict;
use warnings;
#use lib '/opt/funnelback/lib/perl/';
use File::Find;
#use Funnelback::Config;
#use Funnelback::NumbersTextTimes;
use Data::Dumper;

# Goldsmiths College pre-processing script.
#
# Walks the directory given as the first command-line argument, lets
# pre_process_file() collect one record per course page into %courses
# (keyed by course title), and then prints all records to STDOUT as a
# single <courses> XML document.
#
# Any arguments after the first are accepted and ignored.

# Refuse to run without a directory to scan; diagnostics go to STDERR so
# that STDOUT carries nothing but the XML document.
if (@ARGV < 1) {
    print STDERR "\nGoldsmiths College Pre-processing Script\n";
    print STDERR "Usage: $0 <data directory>\n\n";
    exit 1;
}

# Course records collected by pre_process_file(): title => hash ref of fields.
my %courses = ();
my $data_path = $ARGV[0];

# Recurse through the data directory and process each file
find(\&pre_process_file, $data_path);

# The XML declaration has to be the very first thing in the output.
print '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>' . "\n<courses>\n";

# Fields are written in a fixed order, and courses in sorted title order,
# so that repeated runs over the same data give identical output.
my @course_fields = qw(title url course_length bookletUrl info ucas);
foreach my $k (sort keys %courses) {
    my $values = $courses{$k};
    print "<course>\n";
    foreach my $field (@course_fields) {
        # A field is only present when its pattern matched on the page.
        next if (not defined $values->{$field});
        print "<$field>" . _xml_escape($values->{$field}) . "</$field>\n";
    }
    print "</course>\n";
}
print "</courses>\n";

# Escape the characters that are not allowed verbatim in XML element
# content. The scraped values are raw HTML fragments, so any markup or
# entities they contain are carried through as literal text.
sub _xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must run first so the entities below survive
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
# File::Find callback: scrape one course page and record it in %courses.
#
# find() calls this once per directory entry with the entry's name in $_.
# Directories and unreadable files are skipped (the latter with a warning).
# A page is recorded only if a title can be extracted from it; the record
# is a hash ref stored under that title, holding "title" plus whichever of
# "url", "course_length", "bookletUrl", "info" and "ucas" were found.
# The return value is not used.
sub pre_process_file {
    my $file = $_;
    return if (not defined $file);

    # Skip directories; every other entry is treated as a candidate page.
    return if (-d $file);

    # NOTE: a filter on $File::Find::name (/ugrad\/study\/subject.php/i)
    # was commented out in the original, so every file is scanned.

    # Use a lexical handle rather than the bareword DATA, which is Perl's
    # reserved handle for the __DATA__ / __END__ section of a script.
    my $fh;
    if (not open($fh, "<", $file)) {
        warn "Goldsmiths College Pre-processing: Unable to read file $file ($!)";
        return;
    }

    # Read the file contents into memory in one go
    my $content = do { local $/; <$fh> };
    close($fh);
    $content = "" if (not defined $content);

    # Only fill %courses if the page has a title
    return if ($content !~ /<div\sclass="box">\s*<.{2}>(.*?)<.{3}>/msi);
    my $title = $1;
    my %course = ("title" => $title);

    if ($content =~ /BASE HREF="(.*?)"/i) {
        $course{"url"} = $1;
    }

    my $course_length = _labelled_field($content, 'Course Length');
    if (defined $course_length) {
        $course{"course_length"} = $course_length;
    }

    # The booklet link lives inside the "find out more" section.
    my $find_out_more = _labelled_field($content, 'find out more');
    if (defined $find_out_more
        and $find_out_more =~ /<a\shref="(.*?)"\son/msi) {
        $course{"bookletUrl"} = $1;
    }

    if ($content =~ /<div\sclass="two_col_r_prospectus\sfloatleft">\s*<p>(.*?)<\/p>\s*<h3/msi) {
        $course{"info"} = $1;
    }

    my $ucas = _labelled_field($content, 'UCAS');
    if (defined $ucas) {
        $course{"ucas"} = $ucas;
    }

    $courses{$title} = \%course;
    return;
}

# Return the text that follows a "<b>LABEL:</b><br />" heading, up to the
# next "</d", or undef when the page has no such heading. The label is
# matched case-insensitively.
sub _labelled_field {
    my ($content, $label) = @_;

    # NOTE(review): the "(?:\s| )" alternation is redundant as written; the
    # second branch may originally have been a non-breaking space - confirm.
    if ($content =~ /<b>\Q$label\E:<\/b><br(?:\s| )\/>\s*(.*?)<\/d/msi) {
        return $1;
    }
    return;
}
- # Done: end the script with exit status 0 (success).
- exit 0;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement