icenine

extract M06 20131216 MERGE

Dec 17th, 2013
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 17.94 KB | None | 0 0
  1. #!/usr/bin/perl -w
  2.  
  3. use strict;
  4. use 5.010;
  5.  
  6. # Required for parsing NRLS data
  7. use Excel::Reader::XLSX;
  8. use DateTime::Format::Excel;
  9.  
  10. # Required for inputing data into Neo4j
  11. use REST::Neo4p;
  12.  
  13. use Data::Dumper;
  14. use Getopt::Long;
  15.  
  16. use Try::Tiny;
  17.  
  18. $| = 1;
  19.  
  20. my $input_file;
  21. my $neo_uri = 'http://larva.med.ic.ac.uk:7474';
  22.  
  23. GetOptions (
  24.     "input_file=s"  => \$input_file,
  25.     "neo_uri=s"     => \$neo_uri,);
  26.  
  27. if (!defined($input_file)) {
  28.     die "Usage: $0 --input_file <xlsx file>";
  29. }
  30.  
  31. # Define required column headers
  32. my @required_columns = (
  33.     "IN07",             # Incident description
  34.     "PD09",             # Indident degree of harm
  35.     "RP07",             # NHS Trust
  36.     "IN05_lvl1",        # Incident category
  37.     "PD05_lvl1",        # Speciality
  38.     "PD02",             # Patient sex
  39.     "IN01",             # Incident date
  40.     "IN02_A_01",        # Incident hour
  41.     "Age_at_Incident",  # Patient age
  42.     "PD11",             # Patient ethnicity
  43.     "ST01_LVL1",        # Recorder staff type
  44.     "IN03_lvl1",        # Incident location
  45.     "incidentID",       # Unique incident ID
  46. );
  47.  
  48. my $reader = Excel::Reader::XLSX->new();
  49.  
  50. my $workbook = $reader->read_file($input_file) or
  51.     die ("Could not read $input_file.");
  52.  
  53. my $worksheet = ($workbook->worksheets())[0] or
  54.     die ("Failed to assign worksheet from workbook.");
  55.  
  56. my $column_header_row = $worksheet->next_row() or
  57.     die ("Could not get column header row.");
  58.  
  59. my %header_map;
  60.  
  61. # Find column positions for required columns
  62. while (my $cell = $column_header_row->next_cell()) {
  63.     my $cell_value = $cell->value();
  64.  
  65.     if (grep /$cell_value/, @required_columns) {
  66.         $header_map{$cell_value} = $cell->col();
  67.     }
  68. }
  69.  
  70. # Create connection to Neo4j
  71. REST::Neo4p->connect($neo_uri) or
  72.     die ("Could not connect to $neo_uri.");
  73.  
  74. my (%nodes, %relationships, %created_nhs_trust_location);
  75.  
  76. my $created_nhs_trust_node;
  77.  
  78. while (my $row = $worksheet->next_row) {
  79.     my @values = $row->values();
  80.     if (@values == 0) {
  81.         warn ("Row at ", $row->row_number() , " contains no values.");
  82.     }
  83.  
  84.     @values = map {
  85.         if ($_ =~ /^ ?$/ or !defined($_)) {
  86.             $_ = "NOT_DEFINED";
  87.         } else {
  88.             $_;
  89.         }
  90.     } @values;
  91.  
  92.     my ($nhs_trust, $nhs_trust_location, $degree_of_harm,
  93.         $patient_age, $patient_sex, $patient_ethnicity,
  94.         $incident_id, $incident_date,
  95.         $incident_hour, $incident_description,
  96.         $incident_category, $incident_reporter,
  97.         $incident_speciality);
  98.  
  99.     my ($last_known_query, $neo4p_query);
  100.  
  101.     try {
  102.         $nhs_trust =            $values[$header_map{"RP07"}];
  103.         $nhs_trust_location =   $values[$header_map{"IN03_lvl1"}];
  104.         $degree_of_harm =       $values[$header_map{"PD09"}];
  105.         $patient_age =          $values[$header_map{"Age_at_Incident"}];
  106.         $patient_sex =          $values[$header_map{"PD02"}];
  107.         $patient_ethnicity =    $values[$header_map{"PD11"}];
  108.         $incident_id =          $values[$header_map{"incidentID"}];
  109.         $incident_date =        $values[$header_map{"IN01"}];
  110.         $incident_hour =        $values[$header_map{"IN02_A_01"}];
  111.         $incident_description = $values[$header_map{"IN07"}];
  112.         $incident_category =    $values[$header_map{"IN05_lvl1"}];
  113.         $incident_reporter =    $values[$header_map{"ST01_LVL1"}];
  114.         $incident_speciality =  $values[$header_map{"PD05_lvl1"}];
  115.  
  116.         # Normalize double quotes in incident description
  117.         $incident_description =~ s/"/\\"/g;
  118.  
  119.         # Assign unique trust id just to ensure uniqueness on set
  120.         my $trust_incident_id = $incident_id . "_" . $nhs_trust;
  121.  
  122.         # Normalize incident hour
  123.         if ($incident_hour eq 'NOT_DEFINED') {
  124.             $incident_hour = '00';
  125.         } else {
  126.             $incident_hour = substr($incident_hour, 0, 2);
  127.         }
  128.  
  129.         # Normalize date
  130.         my $incident_timestamp = DateTime::Format::Excel
  131.             ->parse_datetime($incident_date)->ymd . "T" .
  132.             $incident_hour . ":00:00";
  133.  
  134.         # Build nodes
  135.  
  136.         # Create nodes
  137.        
  138.         # Create unique NHS Trust node
  139.         if (!defined($created_nhs_trust_node)) {
  140.             $last_known_query  =
  141.                 'MATCH (ndt:NRLS_DATA_TYPE)'
  142.                 . ' WHERE'
  143.                 . ' ndt.code = {code}'
  144.                 . ' CREATE UNIQUE'
  145.                 . ' (n:NHS_TRUST { name : {name} })-[:HAS_NRLS_DATA_TYPE]->(ndt)'
  146.                 . ' RETURN n';
  147.                      
  148.             $created_nhs_trust_node = 1;
  149.  
  150.             $neo4p_query = REST::Neo4p::Query->new($last_known_query
  151.                 , { "code" => "RP07", "name" => $nhs_trust }) or
  152.                 die ("Could not create neo4p query from $last_known_query.");
  153.      
  154.             $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  155.            
  156.             destroy_query ($neo4p_query);
  157.         }
  158.  
  159.         if (!defined($created_nhs_trust_location{$nhs_trust_location})) {
  160.             # Create unique NHS Trust Location node
  161.             $last_known_query =
  162.                 'MERGE (ntl:NHS_TRUST_LOCATION'
  163.                 . ' { location_level_01 : { location_level_01 } }) RETURN ntl';
  164.      
  165.             $neo4p_query = REST::Neo4p::Query->new($last_known_query
  166.                 , { "location_level_01" => $nhs_trust_location }) or
  167.                 die ("Could not create neo4p query from $last_known_query.");
  168.      
  169.             $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  170.            
  171.             destroy_query ($neo4p_query);
  172.            
  173.             # Create relationship between NHS Trust/Location    
  174.             $last_known_query =
  175.                 'MATCH (nt:NHS_TRUST), (ntl:NHS_TRUST_LOCATION)'
  176.                 . ' WHERE nt.name = { name }'
  177.                 . ' AND ntl.location_level_01 = { location_level_01 }'
  178.                 . ' CREATE UNIQUE'
  179.                 . ' (ntl)<-[hntl:HAS_NHS_TRUST_LOCATION]-nt RETURN hntl';
  180.      
  181.             $neo4p_query = REST::Neo4p::Query->new($last_known_query
  182.                 , { "name" => $nhs_trust, "location_level_01" => $nhs_trust_location }) or
  183.                 die ("Could not create neo4p query from $last_known_query.");
  184.      
  185.             $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  186.            
  187.             destroy_query ($neo4p_query);
  188.            
  189.             $created_nhs_trust_location{$nhs_trust_location} = 1;
  190.         }
  191.        
  192.         # Create unique Incident node (IN07) ${nhs_trust}
  193.         $last_known_query =
  194.             'MERGE (i:INCIDENT { incident_id : { incident_id } })'
  195.             . ' ON CREATE SET i.incident_timestamp = { incident_timestamp },'
  196.             . ' i.incident_description = { incident_description }'
  197.             . ' RETURN i';
  198.        
  199.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  200.             , { "incident_id" => $incident_id
  201.             , "incident_timestamp" => $incident_timestamp
  202.             , "incident_description" => $incident_description }) or
  203.             die ("Could not create neo4p query from $last_known_query.");
  204.  
  205.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  206.  
  207.         destroy_query ($neo4p_query);
  208.  
  209.         # Create unique Incident node NRLS_DATA_TYPE relationship
  210.         $last_known_query =
  211.             'MATCH (i:INCIDENT), (ndt:NRLS_DATA_TYPE) '
  212.             . ' WHERE i.incident_id = { incident_id } '
  213.             . ' AND ndt.code = { code } '
  214.             . ' CREATE UNIQUE'
  215.             . ' (i)-[hndt:HAS_NRLS_DATA_TYPE]->ndt RETURN hndt';
  216.  
  217.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  218.             , { "incident_id" => $incident_id, "code" => "IN07" }) or
  219.             die ("Could not create neo4p query from $last_known_query.");
  220.  
  221.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  222.  
  223.         destroy_query ($neo4p_query);
  224.        
  225.         # Create relationship to NHS Trust ${nhs_trust} ${incident_id}
  226.         $last_known_query =
  227.             'MATCH (n:NHS_TRUST), (i:INCIDENT)'
  228.             . ' WHERE n.name = { name }'
  229.             . ' AND i.incident_id = { incident_id }'
  230.             . ' CREATE UNIQUE '
  231.             . ' i-[r:IS_NHS_TRUST_INCIDENT]->n '
  232.             . ' RETURN r';
  233.  
  234.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  235.             , { "name" => $nhs_trust, "incident_id" => $incident_id }) or
  236.             die ("Could not create neo4p query from $last_known_query.");
  237.    
  238.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  239.  
  240.         undef ($neo4p_query);
  241.        
  242.         # Create relationship to NHS Trust Location ${nhs_trust_location} ${incident_id}
  243.         $last_known_query =
  244.             'MATCH (ntl:NHS_TRUST_LOCATION), (i:INCIDENT)'
  245.             . ' WHERE ntl.location_level_01 = { location_level_01 }'
  246.             . ' AND i.incident_id = { incident_id }'
  247.             . ' CREATE UNIQUE '
  248.             . ' i-[r:IS_NHS_TRUST_LOCATION_INCIDENT]->ntl '
  249.             . ' RETURN r';
  250.  
  251.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  252.             , { "location_level_01" => $nhs_trust_location
  253.             , "incident_id" => $incident_id }) or
  254.             die ("Could not create neo4p query from $last_known_query.");
  255.    
  256.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  257.        
  258.         destroy_query ($neo4p_query);
  259.        
  260.         # Create unique Incident Category node
  261.         $last_known_query =
  262.             'MATCH (ndt:NRLS_DATA_TYPE) WHERE ndt.code = '
  263.             . ' { code } CREATE UNIQUE'
  264.             . ' (ic:INCIDENT_CATEGORY { category_level_01 : { category_level_01 } '
  265.             . ' })-[:HAS_NRLS_DATA_TYPE]->ndt RETURN ic';
  266.  
  267.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  268.             , { "code" => "IN05_lvl1", "category_level_01" => $incident_category }) or
  269.             die ("Could not create neo4p query from $last_known_query.");
  270.  
  271.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  272.  
  273.         destroy_query ($neo4p_query);
  274.        
  275.         # Create unique Incident Category relationship
  276.         $last_known_query =
  277.             'MATCH (i:INCIDENT), (ic:INCIDENT_CATEGORY)'
  278.             . ' WHERE i.incident_id = { incident_id }'
  279.             . ' AND ic.category_level_01 = { category_level_01 }'
  280.             . ' CREATE UNIQUE'
  281.             . ' ic<-[r:HAS_INCIDENT_CATEGORY]-i RETURN r';
  282.  
  283.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  284.             , { "incident_id" => $incident_id
  285.             , "category_level_01" => $incident_category }) or
  286.             die ("Could not create neo4p query from $last_known_query.");
  287.    
  288.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  289.        
  290.         destroy_query ($neo4p_query);
  291.        
  292.         # Create unique Incident Speciality node PD05_lvl1
  293.         $last_known_query =
  294.             'MATCH (ndt:NRLS_DATA_TYPE) WHERE ndt.code = { code }'
  295.             . ' CREATE UNIQUE'
  296.             . ' (is:INCIDENT_SPECIALITY { speciality_level_01 : { speciality_level_01 }'
  297.             . ' })-[:HAS_NRLS_DATA_TYPE]->ndt RETURN is';
  298.  
  299.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  300.             , { "code" => "PD05_lvl1", "speciality_level_01" => $incident_speciality }) or
  301.             die ("Could not create neo4p query from $last_known_query.");
  302.  
  303.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  304.  
  305.         destroy_query ($neo4p_query);
  306.        
  307.         # Create unique Incident Speciality relationship incident_id incident_speciality
  308.         $last_known_query =
  309.             'MATCH (i:INCIDENT), (is:INCIDENT_SPECIALITY)'
  310.             . ' WHERE i.incident_id = { incident_id }'
  311.             . ' AND is.speciality_level_01 = { speciality_level_01 }'
  312.             . ' CREATE UNIQUE '
  313.             . ' is<-[r:HAS_INCIDENT_SPECIALITY]-i RETURN r';
  314.  
  315.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  316.             , { "incident_id" => $incident_id
  317.             , "speciality_level_01" => $incident_speciality }) or
  318.             die ("Could not create neo4p query from $last_known_query.");
  319.    
  320.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  321.        
  322.         destroy_query ($neo4p_query);
  323.        
  324.         # Create unique Incident Reporter node ST01_LVL1
  325.         $last_known_query =
  326.             'MATCH (ndt:NRLS_DATA_TYPE) WHERE ndt.code ='
  327.             . ' { code } CREATE UNIQUE '
  328.             . ' (ir:INCIDENT_REPORTER { reporter_level_01 : { reporter_level_01 }'
  329.             . ' })-[:HAS_NRLS_DATA_TYPE]->ndt RETURN ir';
  330.  
  331.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  332.             , { "code" => "ST01_LVL1"
  333.             , "reporter_level_01" => $incident_reporter }) or
  334.             die ("Could not create neo4p query from $last_known_query.");
  335.  
  336.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  337.  
  338.         destroy_query ($neo4p_query);
  339.        
  340.         # Create unique Incident Reporter relationship
  341.         $last_known_query =
  342.             'MATCH (i:INCIDENT), (ir:INCIDENT_REPORTER)'
  343.             . ' WHERE i.incident_id = { incident_id }'
  344.             . ' AND ir.reporter_level_01 = { incident_reporter }'
  345.             . ' CREATE UNIQUE '
  346.             . ' ir<-[r:HAS_INCIDENT_REPORTER]-i RETURN r';
  347.  
  348.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  349.             , { "incident_id" => $incident_id
  350.             , "incident_reporter" => $incident_reporter }) or
  351.             die ("Could not create neo4p query from $last_known_query.");
  352.    
  353.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  354.  
  355.         destroy_query ($neo4p_query);
  356.        
  357.         # Create unique Patient node
  358.         $last_known_query =
  359.             'MATCH (ndt:NRLS_DATA_TYPE) WHERE ndt.code = '
  360.             . "'Age_at_Incident' CREATE UNIQUE "
  361.             . '(p:PATIENT { patient_age : "'
  362.             . $patient_age
  363.             . '", patient_sex : "'
  364.             . $patient_sex
  365.             . '", patient_ethnicity : "'
  366.             . $patient_ethnicity
  367.             . '" })-[:HAS_NRLS_DATA_TYPE]->ndt RETURN p';
  368.  
  369.         $neo4p_query = REST::Neo4p::Query->new($last_known_query) or
  370.             die ("Could not create neo4p query from $last_known_query.");
  371.    
  372.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  373.  
  374.         destroy_query ($neo4p_query);
  375.        
  376.         # Create unique Patient relationship
  377.         $last_known_query =
  378.             'MATCH (i:INCIDENT), (p:PATIENT) '
  379.             . ' WHERE i.incident_id = { incident_id }'
  380.             . ' AND p.patient_age = { patient_age }'
  381.             . ' AND p.patient_sex = { patient_sex }'
  382.             . ' AND p.patient_ethnicity = { patient_ethnicity }'
  383.             . ' CREATE UNIQUE '
  384.             . ' p<-[r:HAS_INCIDENT_PATIENT]-i RETURN r';
  385.  
  386.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  387.             , { "incident_id" => $incident_id
  388.             , "patient_age" => $patient_age
  389.             , "patient_sex" => $patient_sex
  390.             , "patient_ethnicity" => $patient_ethnicity }) or
  391.             die ("Could not create neo4p query from $last_known_query.");
  392.    
  393.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  394.  
  395.         destroy_query ($neo4p_query);
  396.        
  397.         # Create unique Degree Of Harm node
  398.         $last_known_query =
  399.             'MATCH (ndt:NRLS_DATA_TYPE) WHERE ndt.code = { code }'
  400.             . ' CREATE UNIQUE'
  401.             . ' (doh:DEGREE_OF_HARM { degree_of_harm : { degree_of_harm }'
  402.             . ' })-[:HAS_NRLS_DATA_TYPE]->ndt RETURN doh';
  403.  
  404.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  405.             , { "code" => "PD09"
  406.             , "degree_of_harm" => $degree_of_harm }) or
  407.             die ("Could not create neo4p query from $last_known_query.");
  408.    
  409.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  410.  
  411.         destroy_query ($neo4p_query);
  412.        
  413.         # Create unique Degree Of Harm relationship
  414.         $last_known_query =
  415.             'MATCH (i:INCIDENT), (doh:DEGREE_OF_HARM)'
  416.             . ' WHERE i.incident_id = { incident_id }'
  417.             . ' AND doh.degree_of_harm = { degree_of_harm }'
  418.             . ' CREATE UNIQUE '
  419.             . ' doh<-[r:HAS_INCIDENT_DEGREE_OF_HARM]-i RETURN r';
  420.  
  421.         $neo4p_query = REST::Neo4p::Query->new($last_known_query
  422.             , { "incident_id" => $incident_id
  423.             , "degree_of_harm" => $degree_of_harm}) or
  424.             die ("Could not create neo4p query from $last_known_query.");
  425.    
  426.         $neo4p_query->execute() or die ("Could not execute query $last_known_query.");
  427.  
  428.         destroy_query ($neo4p_query);        
  429.        
  430.     } catch {
  431.         my $current_row_number = $row->row_number() + 1;
  432.         say STDERR "Failure at row number $current_row_number in $input_file : $_.";
  433.         say STDERR "Last known query $last_known_query."
  434.             if (defined($last_known_query));
  435.         if (defined($neo4p_query) and defined($neo4p_query->errstr)) {
  436.             say STDERR "Last known neo4p query error: " , $neo4p_query->errstr;
  437.         }
  438.  
  439.         foreach my $required_column (@required_columns) {
  440.             if (defined($header_map{$required_column}) and
  441.                 defined($values[$header_map{$required_column}])) {
  442.                 say STDERR "Column $required_column : " , $values[$header_map{$required_column}];
  443.             }
  444.         }
  445.     };
  446.  
  447.     unless ($row->row_number() % 500) {
  448.         say "Processed " , $row->row_number() , " rows. " , scalar localtime;
  449.     }
  450. }
  451.  
  452. sub destroy_query {
  453.     my $query = shift;
  454.    
  455.     if (defined($query)) {
  456.         while (my $response = $query->fetch()) {
  457.             # Who cares, throw it out
  458.         }
  459.         undef $query; # Destroy it
  460.     }
  461. }
Advertisement
Add Comment
Please, Sign In to add comment