Advertisement
Guest User

Untitled

a guest
Jan 9th, 2020
206
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 187.27 KB | None | 0 0
  1.  #!/usr/bin/perl
  2.  
  3. #**************************************************************************************************************
  4. #file: deid.pl   Original author: M. Douglass 2004 
  5. #Last revised:    Apr 2009  DeID 1.1
  6. #                
  7. #
  8. #_______________________________________________________________________________
  9. #
  10. #deid.pl: De-identification algorithm -- scrubs PHI from free-text medical records
  11. #(e.g. Discharge Summaries and Nursing Notes)
  12. #
  13. #Copyright (C) 2004-2007 Margaret Douglas and  Ishna Neamatullah
  14. #
  15. #This code is free software; you can redistribute it and/or modify it under
  16. #the terms of the GNU Library General Public License as published by the Free
  17. #Software Foundation; either version 2 of the License, or (at your option) any
  18. #later version.
  19. #
  20. #This library is distributed in the hope that it will be useful, but WITHOUT ANY
  21. #WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  22. #PARTICULAR PURPOSE.  See the GNU Library General Public License for more
  23. #details.
  24. #
  25. #You should have received a copy of the GNU Library General Public License along
  26. #with this code; if not, write to the Free Software Foundation, Inc., 59
  27. #Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  28. #
  29. #You may contact the author by e-mail or postal mail
  30. #(MIT Room E25-505, Cambridge, MA 02139 USA).  For updates to this software,
  31. #please visit PhysioNet (http://www.physionet.org/).
  32. #_______________________________________________________________________________
  33. #
  34. # De-identification Algorithm: Scrubs PHI from free-text medical records
  35. #(e.g. Discharge Summaries and Nursing Notes)
  36. # Original version written by:
  37. #   Margaret Douglass (douglass AT alum DOT mit DOT edu)
  38. #   William Long (wjl AT mit DOT edu)
  39. # Modified by:
  40. #   Ishna Neamatullah (ishna AT alum DOT mit DOT edu) in Sept 5, 2006
  41. # Last modified by:
  42. #   Li-wei Lehman (lilehman AT alum DOT mit DOT edu) in April, 2009
  43. #
  44. # Command to run the software:
  45. # perl deid.pl <filename> <config_filename>
  46. #
  47. # Input arguments:
  48. # filename (without extension, where extension must be .text): file to be de-identified
  49. # config_filename: configuration file
  50. #
  51. # Required library file: stat.pm
  52. #
  53. # Output files:
  54. # filename.res: de-identified file
  55. # filename.phi: file containing all PHI locations, used in calculating performance statistics
  56. # filename.info: file containing information useful for debugging
  57. # code performance statistics printed on screen if Gold Standard available for nursing notes (details in README)
  58. #**************************************************************************************************************
  59.  
  60. #use Stat;
  61.  
  62. # Declaring some variables for algorithm run configuration
  63.  
  64. # Variables to switch on/off filter functions
  65. $allfilters = "";
  66. $ssnfilter = "";
  67. $urlfilter = "";
  68. $emailfilter = "";
  69. $telfilter = "";
  70. $unitfilter = "";
  71. $agefilter = "";
  72. $locfilter = "";
  73. $datefilter = "";
  74. $namefilter = "";
  75. $us_state_filter = "";
  76. $ds_specific_filter = ""; #filter for discharge summmary specific patterns
  77. $gs_specific_filter = ""; #filter for gold std specific patterns
  78.  
  79. my $offset; # positive date shift in number of days
  80. my $comparison = ""; # 1=comparison with gold standard, 0=no comparison with gold standard
  81.  
  82. # Variables to switch on/off dictionaries/lists
  83. $alllists = "";
  84. $pid_patientname_list = "";
  85. $pid_dateshift_list = "";
  86. $country_list = "";
  87. $company_list = "";
  88. $ethnicity_list = "";
  89. $hospital_list = "";
  90. $doctor_list = "";
  91. $location_list = "";
  92. $local_list = "";
  93. $state_list = "";
  94.  
  95. use Time::Local;
  96. my ($mday,$mon,$year) = (localtime(time))[3,4,5];
  97. my $shortyear = substr($year,1,4);
  98. my $longyear = '20'.$shortyear;
  99.  
  100. # Nursing note date as retrieved from the header
  101. my $nn_year;
  102.  
  103. # Sets whether a de-identified version should be output: 0 = no new version of text output, 1 = fully de-identified version of text output
  104. # Note: Generally keep output_deid_text = 1
  105. $output_deid_text = 1;
  106.  
  107. #Default date used to de-identify the Gold Std if no default date is specified in the
  108. #config file.  You can change the default date by setting the "Default date" variable to
  109. #some other dates (in MM/DD/YYYY format).
  110. $DEFAULT_DATE = "01/01/2000";
  111.  
  112. #The "Two Digit Year Threshold" is used to determine whether
  113. #to interpret the year as a year in the 1900's or 2000's.
  114. #Must be a 1- or 2-digit number.
  115. #Two digit years > Threshold are  interepreted as in the 1900's
  116. #Two digit years <=  Threshold are interpreted as in the 2000's
  117. $TWO_DIGIT_YEAR_THRESHOLD = 30;#change this default by setting "Two Digit Year Threshold" in config file.
  118.  
  119. # "Valid Year Low" and "Valid Year High" (must be 4-digit numbers) are
  120. # used in certain date pattern checking routines to determine if a
  121. # four digit number that appear in a potential
  122. # date pattern is a year or not -- it is a valid year if it is
  123. # in the range of [Valid Year Low, Valid Year High].  
  124. $VALID_YEAR_LOW = 1900;
  125. $VALID_YEAR_HIGH = 2030;
  126.  
  127.  
  128.  
  129. my @known_phi;
  130. my @known_first_name;
  131. my @known_last_name;
  132.  
  133. # Hash that stores PHI information from de-identification
  134. # KEY = (patient number), VALUE = array with each element = (%HASH with KEY = start-end, VALUE = array of types of PHI for the note number/index of the array)
  135. %all_phi;
  136. my %ID;
  137.  
  138. # Declares some global variables
  139. %lhash = ();
  140. @typename = ();
  141. $ntype = 1;
  142. @extend = ('phrases');
  143.  
  144.  
  145. ##########################################################################################
  146. # Sets paths to lists and dictionaries in working directory that will be used in this algorithm
  147. $date_shift_file = "shift.txt"; # contains mapping of PID to date offset
  148. $countries_file = "lists/countries_unambig.txt";
  149. $ethnicities_unambig_file = "lists/ethnicities_unambig.txt";
  150. $companies_file = "lists/company_names_unambig.txt";
  151. $companies_ambig_file = "lists/company_names_ambig.txt";
  152. $common_words_file = "dict/common_words.txt";
  153. $medical_words_file = "dict/sno_edited.txt";
  154. $very_common_words_file = "dict/commonest_words.txt";
  155. $female_unambig_file = "lists/female_names_unambig.txt";
  156. $female_ambig_file = "lists/female_names_ambig.txt";
  157. $female_popular_file = "lists/female_names_popular.txt";
  158. $male_unambig_file = "lists/male_names_unambig.txt";
  159. $male_ambig_file = "lists/male_names_ambig.txt";
  160. $male_popular_file = "lists/male_names_popular.txt";
  161. $last_unambig_file = "lists/last_names_unambig.txt";
  162. $last_ambig_file = "lists/last_names_ambig.txt";
  163. $last_popular_file = "lists/last_names_popular.txt";
  164. #$last_name_prefixes_file = "lists/last_name_prefixes.txt";
  165. $doctor_first_unambig_file = "lists/doctor_first_names.txt";
  166. $doctor_last_unambig_file = "lists/doctor_last_names.txt";
  167. $prefixes_unambig_file = "lists/prefixes_unambig.txt";
  168. $locations_unambig_file = "lists/locations_unambig.txt";
  169. $locations_ambig_file = "lists/locations_ambig.txt";
  170. $local_places_ambig_file = "lists/local_places_ambig.txt";
  171. $local_places_unambig_file = "lists/local_places_unambig.txt";
  172. $hospital_file = "lists/stripped_hospitals.txt";
  173. $last_name_prefix_file = "lists/last_name_prefixes.txt";
  174. $patient_file = "lists/pid_patientname.txt"; # contains mapping of PID to patient name
  175. $us_states_file = "lists/us_states.txt";
  176. $us_states_abbre_file = "lists/us_states_abbre.txt";
  177. $more_us_states_abbre_file = "lists/more_us_state_abbreviations.txt";
  178. $us_area_code_file = "lists/us_area_code.txt";
  179. $medical_phrases_file = "dict/medical_phrases.txt";
  180. $unambig_common_words_file = "dict/notes_common.txt";
  181.  
  182. ############################################################################################################
  183. # Declares some arrays of context words that can be used to identify PHI
  184. # Days of the month
  185. @days = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday");
  186.  
  187. # Titles that precede last names (ignore .'s)
  188. @titles = ("MISTER", "DOCTOR", "DOCTORS", "MISS", "PROF", "PROFESSOR", "REV", "RABBI", "NURSE", "MD", "PRINCESS", "PRINCE", "DEACON", "DEACONESS", "CAREGIVER", "PRACTITIONER", "MR", "MS");
  189.  
  190. @strict_titles = ("Dr", "DRS", "Mrs");  #treat words after these strict_titles as PHI
  191.  
  192. %titles = ();
  193. foreach $title (@titles){
  194.     $titles{$title} = 1;
  195. }
  196.  
  197. # Name indicators that precede or follow names
  198. @name_indicators = ("problem","problem:", "proxy", "daughter","daughters", "dtr", "son", "brother","sister", "mother", "mom", "father", "dad", "wife", "husband", "neice", "nephew", "spouse", "partner", "cousin", "aunt", "uncle", "granddaughter", "grandson", "grandmother", "grandmom", "grandfather", "granddad", "relative", "friend", "neighbor", "visitor", "family member", "lawyer", "priest", "rabbi", "coworker", "co-worker", "boyfriend", "girlfriend", "name is", "named", "rrt", "significant other", "jr", "caregiver", "proxys", "friends", "sons", "brothers", "sisters", "sister-in-law", "brother-in-law", "mother-in-law", "father-in-law", "son-in-law", "daughter-in-law", "dtr-in-law", "surname will be", "name will be", "name at discharge will be", "name at discharge is");
  199.  
  200.  
  201. # Phrases that precede locations
  202. @location_indicators = ("lives in", "resident of", "lived in", "lives at", "comes from", "called from", "visited from", "arrived from", "returned to");
  203.  
  204. @employment_indicators_pre = ("employee of", "employed by", "employed at", "CEO of", "manager at", "manager for", "manager of", "works at", "business");
  205.  
  206.  
  207. # Hospital indicators that follow hospital names
  208. @hospital_indicators = ("Hospital", "General Hospital", "Gen Hospital", "gen hosp", "hosp", "Medical Center", "Med Center", "Med Ctr", "Rehab", "Clinic", "Rehabilitation", "Campus", "health center", "cancer center", "development center", "community health center", "health and rehabilitation", "Medical", "transferred to", "transferred from", "transfered to", "transfered from");
  209.  
  210. # Location indicators that follow locations
  211. @loc_indicators_suff = ("city", "town", "beach", "valley","county", "harbor", "ville", "creek", "springs", "mountain", "island", "lake", "lakes", "shore", "garden", "haven", "village", "grove", "hills", "hill", "shire", "cove", "coast", "alley", "street", "terrace", "boulevard", "parkway", "highway", "university", "college", "tower");
  212.  
  213. # Location indicators that are most likely preceded by locations
  214. @loc_ind_suff_c = ("town", "ville", "harbor", "tower");
  215.  
  216. # Location indicators that precede locations
  217. #@loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "great","east","west","north","south");
  218. @loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "east","west","north","south");
  219.  
  220.  
  221. @apt_indicators = ("apt", "suite"); #only check these after the street address is found
  222. @street_add_suff = ("park", "drive", "street", "road", "lane", "boulevard", "blvd", "avenue", "highway", "circle","ave", "place", "rd", "st");
  223.  
  224. #Strict street address suffix: case-sensitive match on the following,
  225. #and will be marked as PHI regardless of ambiguity (common words)
  226. @strict_street_add_suff = ("Park", "Drive", "Street", "Road", "Lane", "Boulevard", "Blvd", "Avenue", "Highway","Ave",,"Rd", "PARK", "DRIVE", "STREET", "ROAD", "LANE", "BOULEVARD", "BLVD", "AVENUE", "HIGHWAY","AVE", "RD");
  227.  
  228. # Age indicators that follow ages
  229. @age_indicators_suff = ("year old", "y\. o\.", "y\.o\.", "yo", "y", "years old", "year-old", "-year-old", "years-old", "-years-old", "years of age", "yrs of age");
  230.  
  231. # Age indicators that precede ages
  232. @age_indicators_pre = ("age", "he is", "she is", "patient is");
  233.  
  234. # Digits, used in identifying ages
  235. @digits = ("one","two","three","four","five","six","seven","eight","nine", "");
  236.  
  237. # Different variations of the 12 months
  238. @months = ("January", "Jan", "February", "Feb", "March", "Mar", "April", "Apr", "May", "June", "Jun", "July", "Jul", "August", "Aug", "September", "Sept", "Sep", "October", "Oct", "November", "Nov", "December", "Dec");
  239.  
  240. ######################################################################################
  241. # If the correct number of input argument is provided, sets the input and output filenames.
  242. if ($#ARGV == 1) {
  243.     $data_file = "$ARGV[0].text";      # data_file: input file
  244.     $output_file = "$ARGV[0].phi";     # output_file: file containing PHI locations
  245.     $debug_file = "$ARGV[0].info";     # debug_file: file used for debugging, contains PHI and non-PHI locations
  246.     $deid_text_file = "$ARGV[0].res";  # deid_text_file: de-identified text file
  247.     $gs_file = "$ARGV[0].deid";        # gs_file: Gold Standard of the input file
  248.  
  249.     print "\n*******************************************************************************************************************\n";
  250.     print "De-Identification Algorithm: Identifies Protected Health Information (PHI) in Discharge Summaries and Nursing Notes";
  251.     print "\n*******************************************************************************************************************\n";
  252.  
  253.  
  254.    
  255.     $config_file = $ARGV[1];
  256.     open CF, $config_file or die "Cannot open $config_file";
  257.     while ($cfline = <CF>) {
  258.     chomp $cfline;
  259.     if ($cfline =~ /\A[\#]+/){
  260.        next;
  261.     }
  262.     if ($cfline =~ /\bGold\s+standard\s+comparison\s*\=\s*([0-9])/ig) {
  263.         $comparison = ($1);
  264.     }
  265.     #Date default expects MM/DD/YYYY
  266.     if ($cfline =~ /\bDate\s+default\s*\=\s*(\d\d)\/(\d\d)\/(\d\d\d\d)/ig) {
  267.        
  268.         my $mm = $1; $dd = $2; $yyyy = $3;
  269.         $DEFAULT_DATE = "$mm/$dd/$yyyy";
  270.         #print "Default date is $DEFAULT_DATE\n";
  271.     }
  272.  
  273.     #The "Two Digit Year Threshold" is used to determine whether
  274.         #to interpret the year as a year in the 1900's or 2000's
  275.     if ($cfline =~ /\bTwo\s+Digit\s+Year\s+Threshold\s*=\s*(\d{1,2})/ig) {
  276.         $TWO_DIGIT_YEAR_THRESHOLD = "$1";
  277.         #print "Two Digit Year Threshold is $TWO_DIGIT_YEAR_THRESHOLD\n";
  278.     }
  279.  
  280.     if ($cfline =~ /\bDate\s+offset\s*\=\s*([0-9]+)/ig) {
  281.         $offset = ($1);
  282.         #print "Date offset is $1\n";
  283.     }
  284.     if ($cfline =~ /\bSSN\s+filter\s*\=\s*([a-z])/ig) {
  285.         $ssnfilter = ($1);     
  286.     }
  287.     if ($cfline =~ /\bURL\s+filter\s*\=\s*([a-z])/ig) {
  288.         $urlfilter = ($1);
  289.     }
  290.     if ($cfline =~ /\bEmail\s+filter\s*\=\s*([a-z])/ig) {
  291.         $emailfilter = ($1);
  292.     }
  293.     if ($cfline =~ /\bTelephone\s+filter\s*\=\s*([a-z])/ig) {
  294.         $telfilter = ($1);
  295.     }
  296.     if ($cfline =~ /\bUnit\s+number\s+filter\s*\=\s*([a-z])/ig) {
  297.         $unitfilter = ($1);
  298.     }
  299.     if ($cfline =~ /\bAge\s+filter\s*\=\s*([a-z])/ig) {
  300.         $agefilter = ($1);
  301.     }
  302.     if ($cfline =~ /\bLocation\s+filter\s*\=\s*([a-z])/ig) {
  303.         $locfilter = ($1);
  304.     }
  305.     if ($cfline =~ /\bDate\s+filter\s*\=\s*([a-z])/ig) {
  306.         $datefilter = ($1);
  307.     }
  308.     if ($cfline =~ /\bName\s+filter\s*\=\s*([a-z])/ig) {
  309.         $namefilter = ($1);
  310.     }
  311.  
  312.     if ($cfline =~ /\bState\s+filter\s*\=\s*([a-z])/ig) {
  313.         $us_state_filter = ($1);
  314.  
  315.     }
  316.  
  317.     if ($cfline =~ /\bDS\s+filter\s*\=\s*([a-z])/ig) {
  318.         $ds_specific_filter = ($1);
  319.  
  320.     }
  321.  
  322.     if ($cfline =~ /\bGS\s+filter\s*\=\s*([a-z])/ig) {
  323.         $gs_specific_filter = ($1);
  324.  
  325.     }
  326.  
  327.     ######################################################
  328.     #get the config info for dictionaries loading
  329.     if ($cfline =~ /\bPID\s+to\s+patient\s+name\s+mapping\s*\=\s*([a-z])/ig) {
  330.         $pid_patientname_list = ($1);
  331.  
  332.     }  
  333.     if ($cfline =~ /\bPID\s+to\s+date\s+offset\s+mapping\s*\=\s*([a-z])/ig) {
  334.         $pid_dateshift_list = ($1);
  335.     }
  336.     if ($cfline =~ /\bCountry\s+names\s*\=\s*([a-z])/ig) {
  337.         $country_list = ($1);
  338.     }
  339.     if ($cfline =~ /\bCompany\s+names\s*\=\s*([a-z])/ig) {
  340.         $company_list = ($1);
  341.     }
  342.     if ($cfline =~ /\bEthnicities\s*\=\s*([a-z])/ig) {
  343.         $ethnicity_list = ($1);
  344.     }
  345.     if ($cfline =~ /\bHospital\s+names\s*\=\s*([a-z])/ig) {
  346.         $hospital_list = ($1);
  347.     }
  348.     if ($cfline =~ /\bLocation\s+names\s*\=\s*([a-z])/ig) {
  349.         $location_list = ($1);
  350.  
  351.     }  
  352.     if ($cfline =~ /\bDoctor\s+names\s*\=\s*([a-z])/ig) {
  353.         $doctor_list = ($1);
  354.  
  355.     }
  356.  
  357.     if ($cfline =~ /\bLocalPlaces\s+names\s*\=\s*([a-z])/ig) {
  358.         $local_list = ($1);
  359.     }
  360.  
  361.     if ($cfline =~ /\bState\s+names\s*\=\s*([a-z])/ig) {
  362.         $state_list = ($1);
  363.     }
  364.  
  365.     }
  366.  
  367. }
  368.  
  369. # Prints an error message on the screen if number of arguments is incorrect
  370. else {
  371.     print "\n===========================================================================================";
  372.     print "\nError: Wrong number of arguments entered";
  373.     print "\nThe algorithm takes 2 arguments:";
  374.     print "\n  1. filename (the filename of medical notes, without extension, where extension must be .text)";
  375.     print "\n  2. config_filename (the configuration filename)";
  376.     print "\nExample (for Gold Standard Comparison): perl deid.pl id deid.config";
  377.     print "\nExample (for output mode using Gold Standard): perl deid.pl id deid-output.config";
  378.     print "\nFor further documentation, please consult the README.txt file";
  379.     print "\n===========================================================================================\n";
  380.     exit;
  381.  
  382. }
  383.  
  384. # After setting file names and configuring the run, indicates that de-identification has commenced
  385. print "\n\nStarting de-identification (version 1.1) ...\n\n";
  386.  
  387.  
  388. #check if we can open the .phi file
  389. open F, ">$output_file" or die "Cannot open $output_file";
  390. close F;
  391.  
  392. # Calls setup to create some lookup lists in memory
  393. setup();
  394.  
  395.  
  396. if ($comparison==1) {
  397.     print "Running deid in performance comparison mode.\n";    
  398.     print "Using PHI locations in $gs_file as comparison. Output files will be:\n";
  399.     print "$output_file: the PHI locations found by the code.\n";
  400.     print "$debug_file: debug info about the PHI locations.\n";
  401.     #check if the gold std file exists
  402.     open GS, $gs_file or die "Cannot open $gs_file. Make sure that the gold standard file exists!\n";   # GS = Gold Standard file
  403.     close GS;
  404. }
  405. else {
  406.     #check if we can open the .res file
  407.     open F, ">$deid_text_file" or die "Cannot open $deid_text_file";
  408.     close F;
  409.     print "Running deid in output mode. Output files will be: \n";
  410.     print "$output_file: the PHI locations found by the code.\n";
  411.     print "$deid_text_file: the scrubbed text.\n";
  412.     print "$debug_file: debug info about the PHI locations.\n";
  413.      
  414. }
  415.  
  416. deid();
  417.  
  418. # Calls function stat() to calculate code performance statistics, if comparison mode = 1
  419. if ($comparison==1) {
  420.     require "stat.pm";
  421.     &stat($gs_file, $output_file);
  422.  
  423. }
  424.  
  425. # End of top level of code
  426. #***********************************************************************************************************
  427. #***********************************************************************************************************
  428. #***********************************************************************************************************
  429. sub numerically {
  430.               #print "a is $a  b is $b\n";
  431.               $a <=> $b;
  432.  }
  433. #***********************************************************************************************************
  434. #***********************************************************************************************************
  435. #***********************************************************************************************************
  436. # Reads in a file and pushes each line onto an array. Returns the array.
  437.  
  438. sub preload {
  439.     my ($file) = @_;
  440.     my @res = ();
  441.     open FILE, $file or die "Cannot open file $file";
  442.     while ($line = <FILE>) {
  443.     chomp $line;
  444.     push(@res, uc($line));
  445.     }
  446.     close FILE;
  447.     return @res;
  448. }
  449. # End of preload()
  450. #***********************************************************************************************************
  451. #***********************************************************************************************************
  452. #***********************************************************************************************************
  453. # Reads in a file and pushes each line onto an array. Returns the array.
  454.  
  455. sub preload_uc {
  456.     my ($file) = @_;
  457.     my @res = ();
  458.     open FILE, $file or die "Cannot open file $file";
  459.     while ($line = <FILE>) {
  460.     chomp $line;
  461.     push(@res, uc($line));
  462.     }
  463.     close FILE;
  464.     return @res;
  465. }
  466. # End of preload_uc()
  467. #***********************************************************************************************************
  468. #***********************************************************************************************************
  469. #***********************************************************************************************************
  470. # Reads in a file and creates a dictionary that records each line in that file under the given association, by mapping the line to a '1' in the dictionary of the association
  471.  
  472. sub preload_assoc {
  473.     my ($file,$assoc) = @_;
  474.     open FILE, $file or die "Cannot open file $file";
  475.  
  476.     while ($line = <FILE>) {
  477.     chomp $line;
  478.     $$assoc{uc($line)}=1;
  479.     }
  480.     close FILE;
  481. }
  482. # End of preload_assoc()
  483. #***********************************************************************************************************
  484. #***********************************************************************************************************
  485. #***********************************************************************************************************
  486. # Reads in a file and calls setup_hash on each line
  487.  
  488. sub setup_hash {
  489.     my ($file, $tname) = @_;
  490.     my @entry;
  491.     $typename[$ntype]= $tname;
  492.     open FILE, $file or die "Cannot open file $file";
  493.     while ($line = <FILE>) {
  494.     chomp $line;
  495.     &setup_item($ntype,$line);
  496.     }
  497.     $ntype++;
  498.     close FILE;
  499. }
  500. # End of setup_hash()
  501. #***********************************************************************************************************
  502. #***********************************************************************************************************
  503. #***********************************************************************************************************
  504. # Reads in a file and calls setup_hash on each line
  505. sub setup_lst_hash {
  506.     my ($tname, @hlst) = @_;
  507.     $typename[$ntype]= $tname;
  508.     foreach $line (@hlst) {
  509.     &setup_item($ntype,$line);
  510.     }
  511.     $ntype++;
  512. }
  513. # End of setup_lst_hash()
  514. #***********************************************************************************************************
  515. #***********************************************************************************************************
  516. #***********************************************************************************************************
  517. sub setup_item {
  518.     my ($type,$line) = @_;
  519.     my ($head, @lst) = split (/([^a-zA-Z0-9_\']+)/,uc($line));
  520.     my $ix = $type;
  521.  
  522.     if(@lst){
  523.  
  524.     push @extend, [@lst];
  525.     $ix = "$type,$#extend";
  526.     }
  527.     my $entry = $lhash{$head};
  528.     if ($entry){
  529.     $lhash{$head} .= "|" . $ix;
  530.     }
  531.     else{$lhash{$head}=$ix;
  532.      }
  533. }
  534.  
  535. # End of setup_item()
  536. #***********************************************************************************************************
  537. #***********************************************************************************************************
  538. #***********************************************************************************************************
  539. sub typename {
  540.     my ($num) = @_;
  541.     return($typename[$num]);
  542. }
  543. # End of typename()
  544. #***********************************************************************************************************
  545. #***********************************************************************************************************
  546. #***********************************************************************************************************
  547. # Takes in an array of words and compares them with hashes of known PHI. Recognizes and adds PHI using addType().
  548.  
  549. sub lookhash {
  550.     my @txtlst = @_;
  551.     my $pos = 0;
  552.     my $npos = 0;
  553.     my $tl,$ty;
  554.     my $txt = 1; # item is text not separator
  555.     my $item;
  556.  
  557.     while (@txtlst) {
  558.     $item = shift(@txtlst);
  559.  
  560.     $npos = $pos + length($item);
  561.     if ($item =~ /([a-zA-Z\']+)/ig) {
  562.         if($item){
  563.         $item = uc($item);
  564.         $tl = $lhash{$item};
  565.  
  566.         #if the term ends with 's, remove it and see if
  567.                 #there is a match in PHI hash
  568.         #if (!($tl) && $item =~/([a-zA-Z\'\-]+)\'s$/ig) {
  569.         if (!($tl) && $item =~/([a-zA-Z\']+)\'s$/ig) {
  570.             $tl = $lhash {$1};
  571.         }
  572.  
  573.         if($tl){
  574.  
  575.             # Compares with known PHI by calling bestmatch()
  576.             ($xpos, $done, @types)=&bestmatch($tl,@txtlst);
  577.             splice(@txtlst,0,$#txtlst-$done);
  578.             $npos += $xpos;
  579.  
  580.             foreach $typ (@types){
  581.             #print "item $item, adding type $type for position $pos-$npos\n";
  582.             #print "type is $typ , key is $pos - $npos  \n";
  583.             addType("$pos-$npos",$typ);    
  584.             #print "positions are $pos-$npos, typ is $typ\n";
  585.             }
  586.         }
  587.         }  #end if $item
  588.         $txt = 0;
  589.     }  #end if $txt
  590.     else {
  591.         $txt = 1;
  592.     }
  593.     $pos = $npos;
  594.    }
  595. }
  596. # End of lookhash()
  597. #***********************************************************************************************************
  598. #***********************************************************************************************************
  599. #***********************************************************************************************************
  600. sub bestmatch {
  601.    my ($tl,@txtlst) = @_;
  602.    my $pos = 0;
  603.    my $bestpos = 0;
  604.    my $bestrest = $#txtlst;
  605.    my @besttyp = ();
  606.    my ($type, $rest, $xpos, @nlst);
  607.  
  608.    foreach $ck (split '\|',$tl) {
  609.     ($type, $rest) = split ',',$ck;
  610.  
  611.     if($rest) {
  612.         ($xpos , @nlst) = &matchrest($rest,@txtlst);
  613.         if ($xpos) {
  614.         if($xpos > $bestpos) {
  615.  
  616.             $bestrest = $#nlst; $bestpos = $xpos;
  617.             @besttyp = ($typename[$type]);
  618.         }
  619.         elsif($xpos == $bestpos) {
  620.             push @besttyp,$typename[$type];
  621.         }
  622.         }
  623.     }
  624.     elsif ($bestpos == 0) {
  625.         push @besttyp,$typename[$type];
  626.     }
  627.    }
  628.    return ($bestpos, $bestrest, @besttyp);
  629. }
  630. # End of bestmatch()
  631. #***********************************************************************************************************
  632. #***********************************************************************************************************
  633. #***********************************************************************************************************
  634. sub matchrest {
  635.    my ($rest, @txtlst) = @_;
  636.    my @mlst = @{$extend[$rest]};
  637.    my $item;
  638.    my $pos = 0;
  639.    my $pm = 0;
  640.  
  641.    my $tmppos = 0;
  642.  
  643.    if ($mlst[0] eq '|') {
  644.     $pm = 1; shift(@mlst);
  645.    }
  646.    foreach $i (@mlst) {
  647.     if ($i !~ /([a-zA-Z\'\-]+)$/ig){
  648.         next;
  649.     }
  650.  
  651.     $item = shift(@txtlst);
  652.     $pos += length($item);
  653.  
  654.  
  655.     while ( $#txtlst >= 0 && ($item !~ /([a-zA-Z\'\-]+)/ig)){
  656.         $item = shift(@txtlst);
  657.         $pos += length ($item);
  658.         #print "item is $item, len is $#txtlst";
  659.     }
  660.  
  661.     if ($pm ? (uc($item) !~ /$i/) : ($i ne uc($item))) {
  662.         #print "pm is $pm, returning zero\n";
  663.         return 0;
  664.     }
  665.    }
  666.    return ($pos, @txtlst);
  667. }
  668. # End of matchrest()
  669. #***********************************************************************************************************
  670. #***********************************************************************************************************
  671. #***********************************************************************************************************
  672. # Function: setup()
  673. # Arguments: None
  674. # Returns: None
  675. # Called by: Topmost level of code
  676. # Description: Creates some lookup lists to have in memory
  677. sub setup {
  678.    
  679.    # This part is not necessary
  680.    open LP, $last_name_prefix_file or die "Cannot open $last_name_prefix_file";
  681.    while ($line = <LP>) {
  682.     chomp $line;
  683.     $prefixes{uc($line)} = 1;
  684.    }
  685.    close LP;
  686.  
  687.    #Added to reduce false positives
  688.    &setup_hash($medical_phrases_file,"MedicalPhrase");
  689.  
  690.    # Sets up hashes of some PHI lists for direct identification
  691.    if ($namefilter =~ /y/) {
  692.     &setup_hash($female_unambig_file,"Female First Name (un)");
  693.     &setup_hash($female_ambig_file,"Female First Name (ambig)");
  694.     &setup_hash($male_unambig_file,"Male First Name (un)");
  695.     &setup_hash($male_ambig_file,"Male First Name (ambig)");
  696.     &setup_hash($last_unambig_file,"Last Name (un)");
  697.     &setup_hash($last_popular_file,"Last Name (popular/ambig)");
  698.     &setup_hash($last_ambig_file,"Last Name (ambig)");
  699.     &setup_hash($female_popular_file, "Female First Name (popular/ambig)");
  700.     &setup_hash($male_popular_file, "Male First Name (popular/ambig)");
  701.    
  702.     if ($doctor_list =~ /y/) {
  703.         &setup_hash($doctor_first_unambig_file, "Doctor First Name");
  704.         &setup_hash($doctor_last_unambig_file, "Doctor Last Name");
  705.     }
  706.    
  707.    }
  708.  
  709.    if ($locfilter =~ /y/) {
  710.     if ($location_list =~ /y/) {
  711.         &setup_hash($locations_ambig_file,"Location (ambig)");
  712.         &setup_hash($locations_unambig_file,"Location (un)");
  713.      
  714.     } else {
  715.         @loc_unambig = ();
  716.         @more_loc_unambig = ();
  717.         @loc_ambig = ();
  718.     }
  719.  
  720.  
  721.     if ($hospital_list =~ /y/) {
  722.         &setup_hash($hospital_file,"Hospital");
  723.     }
  724.     if ($ethnicity_list =~ /y/) {      
  725.         &setup_hash($ethnicities_unambig_file, "Ethnicity");
  726.     }
  727.     if ($company_list =~ /y/) {
  728.         &setup_hash($companies_file, "Company");
  729.         &setup_hash($companies_ambig_file, "Company (ambig)");
  730.    
  731.     }
  732.     if ($country_list =~ /y/) {
  733.         &setup_hash($countries_file, "Country");
  734.     }
  735.  
  736.     if ($local_list =~ /y/){
  737.         &setup_hash($local_places_unambig_file, "Location (un)");
  738.         &setup_hash($local_places_ambig_file, "Location (ambig)");
  739.    
  740.     }
  741.    }
  742.  
  743.    # Preloads PHI in some lists into corresponding arrays    
  744.    @female_popular = &preload($female_popular_file);
  745.    @male_popular = &preload($male_popular_file);
  746.    #@last_name_prefixes = &preload_uc($last_name_prefixes_file);
  747.    @prefixes_unambig = &preload_uc($prefixes_unambig_file);
  748.    
  749.  
  750.    if ($hospital_list =~ /y/) {
  751.     @hospital = &preload($hospital_file);
  752.    } else {@hospital = ();}
  753.  
  754.    if ($state_list =~ /y/){
  755.     @us_states = &preload($us_states_file);
  756.     @us_states_abbre =  &preload($us_states_abbre_file);
  757.     @more_us_states_abbre =  &preload($more_us_states_abbre_file);
  758.    }
  759.  
  760.  
  761.    # Generates associations between PHI in some lists and PHI categories
  762.    &preload_assoc($common_words_file,"common_words");
  763.    &preload_assoc($medical_words_file,"common_words");
  764.  
  765.    &preload_assoc($very_common_words_file,"very_common_words");
  766.    &preload_assoc($unambig_common_words_file, "unambig_common_words");
  767.    &preload_assoc($male_unambig_file, "male_unambig");
  768.    &preload_assoc($female_unambig_file, "female_unambig");
  769.    &preload_assoc($female_ambig_file, "female_ambig");
  770.    &preload_assoc($male_ambig_file, "male_ambig");
  771.    &preload_assoc($last_ambig_file, "last_ambig");
  772.    &preload_assoc($male_popular_file, "male_popular");
  773.    &preload_assoc($female_popular_file, "female_popular");
  774.    &preload_assoc($us_area_code_file,"us_area_code");
  775.  
  776.     # Opens debug file for debugging
  777.    open D, ">".$debug_file;
  778.    close D;
  779. }
  780. # End of setup()
  781. #***********************************************************************************************************
  782. #***********************************************************************************************************
  783. #***********************************************************************************************************
  784. # Function: deid()
  785. # Arguments: None
  786. # Returns: None
  787. # Called by: Topmost level of code
  788. # Description: One of the 2 major branches of the code
  789. # This function takes over de-identification if no performance statistic is required
  790. # Function first loads the patient name list file.
  791. # Then reads the medical text line by line.
  792. #  If the line indicates START_OF_RECORD, read the Patient ID (PID), Note ID (NID), (and Note Date if any) info
  793. # Scan for PHI a paragraph at a time
  794. # output PHI locations into .phi file
  795. my $currentID;
  796. my @known_phi;
  797. my @known_first_name;
  798. my @known_last_name;
  799. my %pidPtNames; # key = pid, value = [0] first name [1] last name
  800.  
  801. #***********************************************************************************************************
  802. #***********************************************************************************************************
  803. #***********************************************************************************************************
  804.  
  805. sub deid {
  806.    
  807.    $allText = "";
  808.    $allallText = "";
  809.    open DF, $data_file or die "Cannot open $data_file";
  810.  
  811.    my $paraCount = 0;
  812.  
  813.    my $stpos = 0;
  814.  
  815.  
  816.    my %deids; #key = start index, value = array of (end index, ID)
  817.    my %phiTerms; #key = lc(word), value = # of occurrences
  818.    my %phiT;
  819.    my %phiTT;
  820.    my $noteDate;
  821.    my $line;
  822.  
  823.    $currentID = 0; #initialize current ID to a non-existent PID 0
  824.  
  825.    # Code runs through text by paragraph so things that extend over lines aren't missed
  826.     $para = "";
  827.  
  828.     #load the patient name file
  829.     if ($pid_patientname_list =~ /y/) {
  830.     open PF, $patient_file or die "Cannot open $patient_file";
  831.    
  832.     while ($pfline = <PF>) {
  833.         chomp $pfline;
  834.  
  835.         if ($pfline =~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig) {       
  836.         my $pid = $1;
  837.  
  838.         $known_first_names = $3;
  839.         $known_last_names = $5;        
  840.         $pidPtNames{$pid}[0] = $known_first_names;
  841.         $pidPtNames{$pid}[1] = $known_last_names;      
  842.        
  843.         }# end if pfline = ~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig)
  844.     } # end while $pfline = <PF>
  845.     } #end if pid_patient_name_list ~= y
  846.     ###End loading the patient names
  847.  
  848.     while ($line = <DF>) {
  849.    
  850.     #If this is a new record, set PID, Note ID
  851.     #we assume all notes for the same patient go together
  852.     if ( $line =~ /\ASTART_OF_RECORD=(([0-9]+)(\|)+([0-9]+)(\|)+(\d\d\/\d\d\/\d{4})?)/) {
  853.        
  854.        # $currentNote = $2;
  855.        # $thePT = $4;
  856.        # $noteDate =$6; #noteDate is empty in gold std
  857.  
  858.         #print "pt $thePT, currentNote $currentNote\n";
  859.  
  860.         #if it's gold standard, the header contains PID then NID
  861.        
  862.         $currentNote = $4;
  863.         $thePT = $2;
  864.         $noteDate =$6; #noteDate is empty in gold std
  865.  
  866.         my $label = "Patient $thePT\tNote $currentNote";
  867.         open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  868.         print DEST "$label\n";
  869.         close DEST;
  870.        
  871.             #Gold Standard corpus does not specify note date, so assign a default
  872.             #If you would like deid to date shift the notes on a per note basis, make sure you
  873.             #specify the record date in the header!
  874.         if (length($noteDate) ==0){
  875.             #$noteDate="01/01/2020";
  876.         $noteDate = $DEFAULT_DATE;
  877.         }
  878.  
  879.         #if this is a new patient, set the PID, and lookup patient name
  880.         if ($thePT != $currentID) {  #This is a new patient!   
  881.         $currentID = $thePT;
  882.         %phiTerms = ();  #clear the phiTerms on new pt
  883.  
  884.  
  885.         # Find the patient name for the current PID
  886.         #lookup first and last name of this patient
  887.         if ($pid_patientname_list =~ /y/ && exists $pidPtNames{$currentID} ) {
  888.            
  889.             $known_first_names = $pidPtNames{$currentID}[0];
  890.             $known_last_names = $pidPtNames{$currentID}[1];
  891.            
  892.             #print "Found patient names, first = $known_first_names   last = $known_last_names\n";
  893.  
  894.             if ($known_first_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
  895.                 @known_first_name = ($1, $2);}
  896.             else {
  897.             if ($known_first_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
  898.                 @known_first_name = ($3);
  899.             }
  900.                
  901.             elsif ($known_first_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
  902.                 @known_first_name = ($1);
  903.             }
  904.             else {
  905.                 @known_first_name = ($known_first_names);
  906.             }
  907.             }
  908.            
  909.             if ($known_last_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
  910.             @known_last_name = ($1, $2);}
  911.             else {
  912.             if ($known_last_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
  913.                 @known_last_name = ($3);
  914.             }
  915.             elsif ($known_last_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
  916.                 @known_last_name = ($1);
  917.             }
  918.            
  919.             else {
  920.                 @known_last_name = ($known_last_names);
  921.             }
  922.             }
  923.  
  924.  
  925.         } else {  #if no pid/patient name file, just set the first name and last name to null
  926.             @known_first_name = ();
  927.             @known_last_name = ();
  928.         }
  929.         } # end if this is a new patient
  930.  
  931.         #output the header to output file      
  932.             $allText = "";       #reset,
  933.         $allallText = "";    #reset
  934.         $stpos = 0;
  935.         $para = "";
  936.  
  937.         #if output mode, output the header line (with patient and note ID) to .res file
  938.             if ($comparison == 0) {
  939.           open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file";   #now open in append mode
  940.           print TF "\n$line";
  941.           close TF;
  942.             }  
  943.         next; #skip to next line
  944.       }  #end if this is start of a record
  945.     else {  #else this is not the start of a record, just append the line to the end of the current text
  946.         chomp $line;  
  947.         $allText .= $line."\n";
  948.         $allallText .= $line."\n";   
  949.            
  950.             #$myline = $line;
  951.             #chomp $myline;
  952.         #$allText .= $myline."\n";
  953.         #$allallText .= $myline."\n";    
  954.     }
  955.  
  956.     #Look for paragraph separator: if this is a line is entirely non-alphanumeric or
  957.         #if it starts with spaces, or if it is an empty line, or if this line marks the end of record,
  958.         #then call findPHI() for the current paragraph we have so far.
  959.         #If end of record is encoutnered, output the de-id text; else if
  960.         #it's not end of record yet, append the line to the paragraph.
  961. #   if ( (!($line =~ /[a-zA-Z\d]+/)) || $line =~ /^ +/ || $line eq "" || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {  
  962.     if (   (!($line =~ /[a-zA-Z\d]+/)) || ($line eq "") || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {  
  963.         if ($para =~ /\w/ ){  #if para contains alphanumeric
  964.                
  965.                 # Calls findPHI() with current paragraph; resulting PHI locations are stored in %phiT
  966.         #%phiT = findPHI("Para $paraCount", $date, $stpos, $para);
  967.         %phiT = findPHI("Para $paraCount", $noteDate, $stpos, $para);
  968.         $paraCount++;
  969.         # %phiT is copied over to %phiTT
  970.         foreach $x (sort numerically keys %phiT) {
  971.             @{$phiTT{$x}} = @{$phiT{$x}};
  972.         }
  973.        
  974.         #Sorts keys in %phiT; outputs text accordingly
  975.         foreach $k (keys %phiT) {        
  976.             my ($start, $end) = split '-', $k;
  977.             # $deids_end = ${@{$deids{$start}}}[0]; #does not work with perl v5.10
  978.             my @deidsval =  ${@{$deids{$start}}};
  979.             $deids_end = $deidsval[0];
  980.                
  981.             $found = $phiT{$k};
  982.             foreach $t (@{$phiT{$k}}) {
  983.             }
  984.             my $word = lc(substr $allallText, $start, ($end - $start));
  985.  
  986.             #print "Key in PhiT = $k, word = $word\n"; #DEBUG
  987.  
  988.            # if ($end > ${@{$deids{$start}}}[0]) {  
  989.             if ($end > $deidsval[0]) {
  990.             $deids{$start}[0] = $end;
  991.             $deids{$start}[1] = $currentID;
  992.             $deids{$start}[2] = $noteDate;                     
  993.             }
  994.  
  995.  
  996.             #############################################################################
  997.             #Now remember the PHI terms that are important names for checking for repeated occurrences of PHIs
  998.             #PHI Name Tags
  999.                     #(NI)       Name indicators
  1000.                     #(LF)       Lastname Firstnames
  1001.                     #(PTitle)   plural titles
  1002.                     #(MD)       followed by  "MD" or "M.D"
  1003.                     #(PRE)      checks up to 3 words following "PCP Name" ("PCP", "physician", "provider", "created by", "name");
  1004.                     #(NameIs)   followed by pattern "name is"
  1005.                     #(Prefixes) for @prefixes_unambig)
  1006.                     #(STitle)   @specific_titles = ("MR", "MISTER", "MS");
  1007.                     #(Titles)       @titles
  1008.                     #(NamePattern)  all other name patterns in sub name3
  1009.             #remember all the PHI of type name (strict_titles) and name (indicators)
  1010.             #print "checking for repeated occurences of PHIs: word is $word, phitype is (@{$phiT{$k}})\n";
  1011.  
  1012.             if ( ($word !~ /\d/) && ((length $word) > 3) && !(isCommon($word)) &&
  1013.              ( isPHIType( "(NI)", @{$phiT{$k}}) ||
  1014.                isPHIType( "(PTitle)", (@{$phiT{$k}})) || isPHIType( "(LF)", (@{$phiT{$k}})) ||  
  1015.                isPHIType( "(NamePattern)", (@{$phiT{$k}})) ||
  1016.                isPHIType( "(MD)", (@{$phiT{$k}})) ||                         
  1017.                isPHIType( "(NameIs)", (@{$phiT{$k}})) || isPHIType("(STitle)",  (@{$phiT{$k}})) ||
  1018.                isPHIType("(Titles)", (@{$phiT{$k}}) ))
  1019.             ) {    
  1020.                 #$phiTerms{$word}++;
  1021.                 if (!(exists $phiTerms{$word})) {
  1022.                
  1023.                 $phiTerms{$word} = 1;}
  1024.                 else {
  1025.                
  1026.                 $phiTerms{$word} = $phiTerms{$word}+ 1;
  1027.                 }
  1028.  
  1029.  
  1030.             } #end if
  1031.         }  # end foreach $k (keys %phiT)
  1032.             } #end if ($para =~ /\W/)
  1033.  
  1034.         if ($line =~ /\|\|\|\|END_OF_RECORD/ ) {  
  1035.        
  1036.         open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  1037.         ####################################################
  1038.         #check for repeated occurences of PHIs for this note
  1039.         while ($allallText =~ /\b([A-Za-z]+)\b/g) {
  1040.             my $token = $1;
  1041.             my $start = (length ($`));  #$` is the string preceding what was matched by the last successful match
  1042.             my $end = $start + length($token);
  1043.  
  1044.            if (!(exists    $deids{$start})   ) {
  1045.             #if (!(exists ${@{$deids{$start}}}[0])) {
  1046.               L:
  1047.             foreach $word (keys %phiTerms) {
  1048.        
  1049.                 if ( (uc($token) eq uc($word))  ) {
  1050.                
  1051.                 $deids{$start}[0] = $end;
  1052.                 $deids{$start}[1] = $currentID;
  1053.                 $deids{$start}[2] = $noteDate;
  1054.                
  1055.                 $term = substr $text, $start, ($end - $start +1);
  1056.                 $outstr = "$start \t $end \t $term \t Name (Repeated Occurrence) \n";
  1057.                 print DEST $outstr;
  1058.                
  1059.                 next L;
  1060.                 } #end if
  1061.             } # end foreach
  1062.             } # end if
  1063.         } # end while
  1064.         #end checking for repeated occurences of PHIs
  1065.         close DEST;
  1066.         #####################################################
  1067.  
  1068.         ##output PHI locations to the .phi file
  1069.         open OUTF, ">>$output_file" or die "Cannot open $output_file";
  1070.         print OUTF "\nPatient $currentID\tNote $currentNote";
  1071.         foreach $k (sort numerically keys %deids) {
  1072.             my @deidvals = @{$deids{$k}};
  1073.             $thisend = $deidvals[0];
  1074.             if ($thisend ){
  1075.             print OUTF "\n$k\t$k\t$thisend";
  1076.             }
  1077.         }
  1078.         close OUTF;
  1079.  
  1080.         ###output de-ided text to .res file
  1081.         if ($comparison==0) {
  1082.             outputText(\%deids, \%phiTT);  
  1083.         }
  1084.  
  1085.         #now that we have output text for this record, we reset the
  1086.                 #variables to get ready for the next record
  1087.         $para = "";
  1088.         $paraCount = 0;
  1089.         $stpos = 0;
  1090.        
  1091.         %deids=(); #clear the deid hash
  1092.         %phiTT=(); #clear the phiTT hash
  1093.         %phiT=();
  1094.  
  1095.  
  1096.         $allText = "";       #reset,
  1097.         $allallText = "";    #reset
  1098.  
  1099.         } else {  # this is not end of record yet ...
  1100.         my $tmp = length($para);
  1101.         $stpos += length ($para);
  1102.         $para = $line.' ';
  1103.         }
  1104.  
  1105.                
  1106.     }  #end if line starts with empty spaces || empty line || end of record
  1107.     else { # else this line is still a part of the current paragraph
  1108.         #$para .= ' '.$line;  #just append to end of current paragraph
  1109.         #$para .= $line.' '; #just append to end of current paragraph
  1110.          if ($line eq "") {
  1111.         $para .= "\n";
  1112.         } else {
  1113.         $para .= $line.' '; #just append to end of current paragraph
  1114.         }
  1115.     }
  1116.     } #end while ($line=<DF>)
  1117.    
  1118.     close DF;
  1119.  
  1120. }
  1121. # End of deid()
  1122.  
  1123.  
  1124.  
  1125.  
  1126.  
  1127. #***********************************************************************************************************
  1128. #***********************************************************************************************************
  1129. #***********************************************************************************************************
  1130. # Function: findPHI()
  1131. # Arguments: string $label ("Para $paraCount"), string $date (e.g. "yyyy-mm-dd"), int $startIndex (normally 0), string $text (paragraph of text)
  1132. # Returns: hash %approved (key=start-end of each PHI, value=PHI types)
  1133. # Called by: deid()
  1134. # Description: Dispatched from deid() to perform de-identification
  1135. # Reads in a paragraph of text and runs the de-identification algorithm on it
  1136.  
  1137.  
  1138. sub findPHI {
  1139.     my ($label, $curr_date, $startIndex, $text) = @_;
  1140.  
  1141.  
  1142.     # Initializes the hash %phi which stores PHI locations and their PHI types
  1143.     %phi = (); #key = start-end, value = (type1, type2, ...)
  1144.     local %end = (); # key = start, value = end (dynamic scope so addType can do it)
  1145.     if ($text !~ /\w/) {   
  1146.     return %phi;
  1147.     } #[wjl] skip blank lines
  1148.    
  1149.     # Splits the text into separate items at spaces
  1150.     #my @txtlst = split (/([^a-zA-Z0-9_\'\-]+)/,$text); # used by lookhash and appx match
  1151.     my @txtlst = split (/([^a-zA-Z0-9_\']+)/,$text);
  1152.    
  1153.     # Performs exact matching with the hashes of PHI lists
  1154.     &lookhash(@txtlst);
  1155.  
  1156.     # Initializes hash %phik which stores only start and end indices of each PHI in %phi
  1157.     %phik = ();
  1158.  
  1159.     foreach $k (keys %phi) {
  1160.     ($st,$end) = split ('-',$k);
  1161.     $phik{$st} = $end;
  1162.     }  
  1163.  
  1164.  
  1165.     # Calls each filter module
  1166.  
  1167.     if ($datefilter =~ /y/) {
  1168.     &commonHoliday($text, 0);
  1169.     &date ($text, $curr_date);
  1170.     &dateWithContextCheck($text, $curr_date);
  1171.     &yearWithContextCheck($text, $curr_date);
  1172.     &seasonYear ($text, 0);
  1173.     }
  1174.  
  1175.     if ($telfilter =~ /y/) {
  1176.     &telephone($text, 0);
  1177.     &pager ($text, 0);
  1178.     }
  1179.  
  1180.     if ($locfilter =~ /y/) {
  1181.     &wardname($text, 0);
  1182.     &location1 ($text, 0);
  1183.     &location2 ($text, 0);
  1184.     }
  1185.  
  1186.     if ($emailfilter =~ /y/) {
  1187.     &email ($text, 0);
  1188.     }
  1189.  
  1190.     if ($urlfilter =~ /y/) {
  1191.     &url ($text, 0);
  1192.     }
  1193.  
  1194.     if ($ssnfilter =~ /y/) {   
  1195.     &ssn ($text, 0);
  1196.     }
  1197.  
  1198.     if ($agefilter =~ /y/) {
  1199.     &age ($text, 0);
  1200.     }
  1201.  
  1202.     if ($namefilter =~ /y/) {
  1203.  
  1204.     &name1 ($text, 0);
  1205.     &name2 ($text, 0);
  1206.     &name3 ($text, 0);
  1207.     &knownPatientName($text, 0);
  1208.     &problem ($text, 0);
  1209.     &signatureField ($text, 0);
  1210.     }
  1211.  
  1212.     if ($unitfilter =~ /y/) {
  1213.     &mrn ($text, 0);
  1214.     &unit ($text, 0);
  1215.     &providerNumber ($text, 0);
  1216.     }
  1217.  
  1218.  
  1219.    
  1220.     if ($ds_specific_filter =~ /y/){
  1221.     #discharge summary specific filters
  1222.     #filter not enabled for this version
  1223.     #&dischargeSummarySpecific ($text, 0);
  1224.     }
  1225.  
  1226.  
  1227.     # Call new function here >>>>>>>>>>
  1228.     # Follow format shown here if necessary
  1229.     # &functionName ($text, 0);
  1230.  
  1231.     open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  1232.     #print DEST "$label\n";
  1233.  
  1234.     # Sub-function: finalPHICheck()
  1235.     # After findPHI() has performed most PHI checks, goes through the identified PHI before adding them to the final PHI files
  1236.    
  1237.     my %approved;
  1238.     my ($startp, $endp) = (0, 0);
  1239.     my $notAmbig = 0;
  1240.     my $stg1 = "";
  1241.     my $stg2 = "";
  1242.     my $prevAmbig = 0;
  1243.     my $oldk = "";
  1244.     my $prevKey = "";
  1245.     my ($oldstartp, $oldendp,) = (0, 0);
  1246.     my $oldtext = "";
  1247.    
  1248.     # Prunes keys and checks whether each PHI is ambiguous or is an indicator (e.g. hospital indicator)
  1249.     foreach $k (&pruneKeys("phi",$text)) {
  1250.     ($startp, $endp) = split "-", $k;
  1251.  
  1252.     my $the_word = (substr $text, $startp, ($endp - $startp));
  1253.  
  1254.  
  1255.     $notAmbig = 0;  #so by default, the term is ambiguous
  1256.     foreach $tt (@{$phi{$k}}){
  1257.         #if(($tt !~ /ambig/) && ($tt !~ /Indicator/)    ) {
  1258.         if (($tt !~ /ambig/) && ($tt !~ /Indicator/)  && ($tt !~ /MedicalPhrase/)) {
  1259.         #so IF the term matches ANY type that's non-ambiguous, THEN set it as non-ambiguous
  1260.         $notAmbig = 1; last;}
  1261.     }  #end for each
  1262.  
  1263.        
  1264.     my $notIndicator = 1; #default to be not an indicator
  1265.    
  1266.     foreach $tt (@{$phi{$k}}){
  1267.         if ($tt =~ /Indicator/) {
  1268.         $notIndicator = 0; last;
  1269.         }
  1270.     }
  1271.    
  1272.     $prevText = (substr $text, $oldstartp, ($oldendp - $oldstartp));
  1273.     $newText = (substr $text, $startp, ($endp - $startp));
  1274.        
  1275.  
  1276.     $a = (isType($prevKey, "Male First Name", 1) && ($prevAmbig==1) && (!isCommon($prevText)));
  1277.     $b = (isType($k, "Last Name", 1) && ($notAmbig==0) && (!isCommon($newText)));
  1278.    
  1279.  
  1280.     #if (this is ambig) and (previous is ambig) and ...
  1281.     if ((($notAmbig==0) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1282.     #if (this is not-ambig) and (previous is ambig) and ...
  1283.         (($notAmbig==1) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1284.     #if (this is not-ambig) and (previous is ambig) and ...
  1285.         (($notAmbig==1) && ($prevAmbig==1) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3)) ||
  1286.         #commented out on 1/31/07
  1287.         (($notAmbig==0) && ($prevAmbig==0) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1288.  
  1289.     #if (this is ambig) and (previous is not ambig) and ...
  1290.         (($notAmbig==0) && ($prevAmbig==0) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3))) {
  1291.  
  1292.         print DEST ($startIndex + $oldstartp)."\t".($startIndex+$oldendp)."\t".(substr $text, $oldstartp, ($oldendp - $oldstartp +1));
  1293.  
  1294.  
  1295.         ###################
  1296.  
  1297.         my $oldtext = $text;
  1298.         my $newKey = ($startIndex + $oldstartp)."-".($startIndex + $oldendp);
  1299.         ###my $text = (substr $text, $oldstartp, ($oldendp - $oldstartp));
  1300.         foreach $tt (@{$phi{$prevKey}}) {
  1301.         print DEST "\t$tt";
  1302.         push @{$approved{$newKey}}, $tt;}
  1303.         print DEST "\n";
  1304.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $oldtext, $startp, ($endp - $startp +1));
  1305.        
  1306.         my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
  1307.         ###my $text = (substr $oldtext, $startp, ($endp - $startp));
  1308.         foreach $tt (@{$phi{$k}}) {
  1309.         print DEST "\t$tt";
  1310.         push @{$approved{$newKey}}, $tt;
  1311.         }
  1312.         print DEST "\n";
  1313.  
  1314.  
  1315.     }
  1316.    
  1317.     # If the PHI is not ambiguous and not an indicator, recognizes it as PHI; add it to PHI file
  1318.     elsif ($notAmbig && $notIndicator) {
  1319.  
  1320.         ###################
  1321.  
  1322.        
  1323.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $text, $startp, ($endp - $startp +1));
  1324.         my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
  1325.         ###my $text = (substr $text, $startp, ($endp - $startp));
  1326.         foreach $tt (@{$phi{$k}}){     
  1327.         print DEST "\t$tt";
  1328.         if (($tt !~ /ambig/) && ($tt !~ /Indicator/)) {
  1329.             push @{$approved{$newKey}}, $tt;
  1330.         }
  1331.         }
  1332.         print DEST "\n";
  1333.  
  1334.     } # Else ck keys discarded
  1335.    
  1336.     else {
  1337.         ###################
  1338.        
  1339.         # Otherwise keeps the remaining PHI as non-PHI
  1340.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t# ".(substr $text, $startp, ($endp - $startp +1));
  1341.         foreach $tt (@{$phi{$k}}) {
  1342.         print DEST "\t$tt";
  1343.         }
  1344.         print DEST "\n";}
  1345.  
  1346.     # Sets ambiguous variables for current PHI to be recognized as previous PHI for the next round
  1347.     if ($notAmbig==0) {
  1348.         $prevAmbig = 1;
  1349.         $prevKey = $k;
  1350.         ($oldstartp, $oldendp) = split "-", $prevKey;
  1351.         $oldtext = $text;
  1352.  
  1353.     }
  1354.     else {
  1355.         $prevAmbig = 0;
  1356.     }
  1357.     }    
  1358.     close DEST;
  1359.  
  1360.     # End of sub-function finalPHICheck()
  1361.     #***********************************************************************************************************
  1362.  
  1363.  
  1364.     return %approved;
  1365. }
  1366. # End of findPHI()
  1367.  
  1368.  
  1369.  
  1370. #***********************************************************************************************************
  1371. #***********************************************************************************************************
  1372. #***********************************************************************************************************
  1373. # Function: date ()
  1374. # Searches for date ranges following formats that appear most frequently in text
  1375. sub date {
  1376.     #$text = $_[0];    
  1377.     my ($text, $date) = @_;
  1378.     my $year = substr $date, 0, 4;  
  1379.     # Checks if dates should be filtered
  1380.     if ($datefilter =~ /y/) {
  1381.    
  1382.     # Searches for the pattern mm/dd-mm/dd where the items are valid dates
  1383.     while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?))\b/ig) {
  1384.         if (isValidDate($2,$3,-1) && isValidDate($4,$5,-1)) {
  1385.         $date_range = $1;
  1386.         $start = length($`);
  1387.         $end = $start + length($date_range);
  1388.         $key = "$start-$end";
  1389.         addType($key, "Date range (1)");
  1390.         }
  1391.     }      
  1392.  
  1393.     # Searches for mm/dd/yy-mm/dd/yy or mm/dd/yyyy-mm/dd/yyyy where the items are valid dates
  1394.     while ($text =~ /\b((\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
  1395.         if (isValidDate($2,$3,$4) && isValidDate($5,$6,$7)) {
  1396.         $date_range = $1;
  1397.         $start = length($`);
  1398.         $end = $start + length($date_range);
  1399.         $key = "$start-$end";
  1400.         addType($key, "Date range (2)");
  1401.         }
  1402.     }      
  1403.    
  1404.     # Searches for mm/dd-mm/dd/yy or mm/dd-mm/dd/yyyy where the items are valid dates
  1405.     while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
  1406.         if (isValidDate($6,$2,$3) && isValidDate($6,$4,$5)) {
  1407.         $date_range = $1;
  1408.         $start = length($`);
  1409.         $end = $start + length($date_range);
  1410.         $key = "$start-$end";
  1411.         addType($key, "Date range (3)");
  1412.         }
  1413.     }  
  1414.     } #end if date filter is on
  1415.     # End of sub-function date1()
  1416.  
  1417.     if ($datefilter =~ /y/) {  
  1418.     # Checks for month/day/year
  1419.     while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g) {
  1420.         my $startI = (length($`));
  1421.         my $endI = $startI + length($&);
  1422.         my $key = $startI."-".$endI;
  1423.         my $first_num = $1;
  1424.         my $second_num = $2;
  1425.         my $third_num = $3;
  1426.         my $beginr = substr $text, ($startI - 2), 2;
  1427.         my $endr = substr $text, $endI, 2;
  1428.  
  1429.  
  1430.         if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
  1431.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1432.  
  1433.             #print "checking M/D/Y, $first_num, $second_num, $third_num\n";
  1434.  
  1435.             if (isValidDate ($first_num, $second_num, $third_num)) {
  1436.             addType ($key, "Month/Day/Year");
  1437.             }
  1438.         }
  1439.         }
  1440.     } #end while
  1441.    
  1442.     # Checks for month/day/year
  1443.     while ($text =~ /\b(\d\d?)\.(\d\d?)\.(\d\d|\d{4})\b/g) {
  1444.         my $startI = (length($`));
  1445.         my $endI = $startI + length($&);
  1446.         my $key = $startI."-".$endI;
  1447.         my $first_num = $1;
  1448.         my $second_num = $2;
  1449.         my $third_num = $3;
  1450.         my $beginr = substr $text, ($startI - 2), 2;
  1451.         my $endr = substr $text, $endI, 2;
  1452.        
  1453.         #print "2. checking M/D/Y, $first_num, $second_num, $third_num\n";
  1454.  
  1455.         if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
  1456.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1457.             if (isValidDate ($first_num, $second_num, $third_num)) {
  1458.             addType ($key, "Month/Day/Year");
  1459.             }
  1460.         }
  1461.         }
  1462.     }            
  1463.  
  1464.     # Checks for day/month/year
  1465.     while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g){
  1466.         my $startI = (length($`));
  1467.         my $endI = $startI + length($&);
  1468.         my $key = $startI."-".$endI;
  1469.         my $first_num = $1;
  1470.         my $second_num = $2;
  1471.         my $third_num = $3;
  1472.         my $beginr = substr $text, ($startI - 2), 2;
  1473.         my $endr = substr $text, $endI, 2;
  1474.        
  1475.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1476.         if (isValidDate ($second_num, $first_num, $third_num)) {
  1477.         }
  1478.         }
  1479.     }
  1480.    
  1481.     # Checks for year/month/day
  1482.     while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
  1483.         my $startI = (length($`));
  1484.         my $endI = $startI + length($&);
  1485.         my $key = $startI."-".$endI;
  1486.         my $yr = $1;
  1487.         $nn_year = $yr;
  1488.         my $mo = $2;
  1489.         my $da = $3;
  1490.         my $beginr = substr $text, ($startI - 2), 2;
  1491.         my $endr = substr $text, $endI, 2;
  1492.        
  1493.        
  1494.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1495.         if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1496.             $prevChars = (substr $text, ($startI-4), 4);
  1497.             $nextChars = (substr $text, $endI, 11);
  1498.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1499.             addType ($key, "Header Date");
  1500.             $longyear = $yr;
  1501.             }
  1502.             else {
  1503.             addType ($key, "Year/Month/Day");
  1504.             }
  1505.         }
  1506.         }
  1507.     } #end while
  1508.    
  1509.    
  1510.    
  1511.     # Checks for year/month/day
  1512.     while ($text =~ /\b(\d\d|\d{4})\.(\d\d?)\.(\d\d?)\b/g){
  1513.         my $startI = (length($`));
  1514.         my $endI = $startI + length($&);
  1515.         my $key = $startI."-".$endI;
  1516.         my $yr = $1;
  1517.         $nn_year = $yr;
  1518.         my $mo = $2;
  1519.         my $da = $3;
  1520.         my $beginr = substr $text, ($startI - 2), 2;
  1521.         my $endr = substr $text, $endI, 2;
  1522.        
  1523.        
  1524.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1525.         #if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1526.         if (isValidDate ($mo, $da, $yr)) {
  1527.             $prevChars = (substr $text, ($startI-4), 4);
  1528.             $nextChars = (substr $text, $endI, 11);
  1529.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1530.             addType ($key, "Header Date");
  1531.             $longyear = $yr;
  1532.             }
  1533.             else {
  1534.             addType ($key, "Year/Month/Day");
  1535.             }
  1536.         }
  1537.         }
  1538.     }
  1539.    
  1540.    
  1541.    
  1542.     # Checks for year/day/month
  1543.     while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
  1544.         my $startI = (length($`));
  1545.         my $endI = $startI + length($&);
  1546.         my $key = $startI."-".$endI;
  1547.         my $yr = $1;
  1548.         $nn_year = $yr;
  1549.         my $mo = $3;
  1550.         my $da = $2;
  1551.         my $beginr = substr $text, ($startI - 2), 2;
  1552.         my $endr = substr $text, $endI, 2;
  1553.        
  1554.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1555.         #if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1556.         if (isValidDate ($mo, $da, $yr)) {
  1557.             $prevChars = (substr $text, ($startI-4), 4);
  1558.             $nextChars = (substr $text, $endI, 11);
  1559.            
  1560.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1561.             addType ($key, "Header Date");
  1562.             $longyear = $yr;
  1563.             }
  1564.             else {
  1565.             }
  1566.         }
  1567.         }
  1568.     } #end while
  1569.  
  1570.     # Checks for month/4-digit year
  1571.     while ($text =~ /\b((\d\d?)[\-\/](\d{4}))/g) {
  1572.         my $startI = (length($`));
  1573.         my $endI = $startI + length($&);
  1574.         my $beginr = substr $text, ($startI - 2), 2;
  1575.         my $endr = substr $text, $endI, 2;
  1576.         my $first_num = $2;
  1577.         my $second_num = $3;
  1578.         my $st = length($`);
  1579.         my $endb = $st + length ($2) + length($3) + 1;
  1580.         my $key = "$st-$endb";
  1581.         if (($beginr !~ /\|\|/) && ($endr !~ /\|\|/)) {
  1582.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /[\/\.\%]/)) {
  1583.             #if (($first_num <= 12) && ($first_num > 0) && ($second_num>=1900)) {
  1584.             if (($first_num <= 12) && ($first_num > 0) &&
  1585.             ( $second_num >= $VALID_YEAR_LOW &&  $second_num <= $VALID_YEAR_HIGH  )  ) {
  1586.             addType ($key, "Month/Year 1"); }}}
  1587.     } #end while
  1588.    
  1589.         # Checks for 4-digit year/month
  1590.     while ($text =~ /\b((\d{4})[\-\/](\d\d?))\b/g) {
  1591.         my $first_num = $2;
  1592.         my $second_num = $3;
  1593.         my $st = length($`);
  1594.         my $endb = $st + length ($2) + length($3) + 1;
  1595.         my $key = "$st-$endb";
  1596.         if (($begin !~ /\d[\/\.\-]/) && ($end !~ /[\/\.\%]/)) {
  1597.         #if (($second_num <= 12) && ($second_num > 0) && ($first_num>=1900) && ($first_num<2010)) {
  1598.        
  1599.         if (($second_num <= 12) && ($second_num > 0) && ($first_num>=$VALID_YEAR_LOW) && ($first_num <= $VALID_YEAR_HIGH)) {
  1600.             addType ($key, "Year/Month"); }}
  1601.     } #end while
  1602.  
  1603.    
  1604.     # Checks for spelled-out months
  1605.     # Accounts for ambiguity around the dates, e.g. acronyms for measurements, spelled out months and such
  1606.     foreach $m (@months) {
  1607.        
  1608.         while ($text =~ /\b((\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1609.         my $day = $2;
  1610.         my $completeDate = $1;
  1611.         my $st = length($`);
  1612.         my $key = "$st-".($st + length($1));
  1613.         if (($day < 32) && ($day > 0)) {
  1614.             addType ($key, "Day Month Year");
  1615.         }
  1616.         }
  1617.  
  1618.         while ($text =~ /\b((\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1619.         my $day1 = $2;
  1620.         my $day2 = $4;
  1621.         my $completeDate = $1;
  1622.         my $st = length($`);
  1623.         my $key = "$st-".($st + length($1));
  1624.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 >0)) {
  1625.             addType ($key, "Date range (4)");
  1626.         }
  1627.         }
  1628.        
  1629.         while ($text =~ /\b((\d{1,2}) ?\-\> ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1630.         my $day1 = $2;
  1631.         my $day2 = $3;
  1632.         my $completeDate = $1;
  1633.         my $st = length($`);
  1634.         my $key = "$st-".($st + length($1));
  1635.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1636.             addType ($key, "Date range (5)");
  1637.         }
  1638.         }
  1639.        
  1640.        
  1641.         while ($text =~ /\b($m\b\.? (\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1642.    
  1643.         my $day = $2;
  1644.         my $completeDate = $1;
  1645.         my $st = length($`);
  1646.         my $key = "$st-".($st + length($1));
  1647.         if (($day < 32) && ($day > 0)) {
  1648.             addType ($key, "Month Day Year");
  1649.         }
  1650.         }
  1651.  
  1652.         while ($text =~ /\b($m\b\.? (\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1653.         my $day1 = $2;
  1654.         my $day2 = $4;
  1655.         my $completeDate = $1;
  1656.         my $st = length($`);
  1657.         my $key = "$st-".($st + length($1));
  1658.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
  1659.             addType ($key, "Date range (6)");
  1660.         }
  1661.         }
  1662.  
  1663.         while ($text =~ /\b($m\b\.? (\d{1,2}) ?\-\> ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1664.    
  1665.         my $day1 = $2;
  1666.         my $day2 = $3;
  1667.         my $completeDate = $1;
  1668.         my $st = length($`);
  1669.         my $key = "$st-".($st + length($1));
  1670.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
  1671.             addType ($key, "Date range (7)");
  1672.         }
  1673.         }
  1674.                
  1675.         #while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12 2000
  1676.        while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|) ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12th 2000
  1677.         my $day = $2;
  1678.         my $completeDate = $1;
  1679.         my $st = length($`);
  1680.         my $key = "$st-".($st + length($1));
  1681.         if (($day < 32) && ($day > 0)) {
  1682.             addType ($key, "Month Day Year (2)");
  1683.             #addType ($key, "Month Day Year");
  1684.         }
  1685.         }
  1686.        
  1687.        # while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1688.         while ($text =~ /\b($m\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1689.         my $day = $2;
  1690.         my $completeDate = $1;
  1691.         my $st = length($`);
  1692.         my $key = "$st-".($st + length($1));
  1693.  
  1694.         if (($day < 32) && ($day > 0)) {
  1695.             addType ($key, "Month Day");
  1696.         }
  1697.         }
  1698.        
  1699.         while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1700.         my $day1 = $2;
  1701.         my $day2 = $4;
  1702.         my $completeDate = $1;
  1703.         my $st = length($`);
  1704.         my $key = "$st-".($st + length($1));
  1705.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1706.             addType ($key, "Date range (8)");
  1707.         }
  1708.         }
  1709.        
  1710.         while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12th
  1711.         my $day1 = $2;
  1712.         my $day2 = $4;
  1713.         my $completeDate = $1;
  1714.         my $st = length($`);
  1715.         my $key = "$st-".($st + length($1));
  1716.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1717.             addType ($key, "Date range (9)");
  1718.         }
  1719.         }
  1720.        
  1721.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr, or Second of April
  1722.         my $day = $2;
  1723.         my $completeDate = $1;
  1724.         my $st = length($`);
  1725.         my $key = "$st-".($st + length($1));
  1726.         if (($day < 32) && ($day > 0)) {
  1727.             addType ($key, "Day Month");
  1728.         }
  1729.         }
  1730.  
  1731.         ###
  1732.       #  while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?\s+(of)?\s+[\-]\b$m\.?,?)\s*(\d{2,4})\b/ig) { # 12-Apr, or Second of April
  1733. while ($text =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b($m)\.?,?)\s+(\d{2,4}))\b/ig) { # 12-Apr, or Second of April
  1734.         my $day = $3;
  1735.         my $month = $6;
  1736.         my $year = $7;
  1737.  
  1738.         my $completeDate = $1;
  1739.         my $st = length($`);
  1740.         my $key = "$st-".($st + length($1));
  1741.         if (($day < 32) && ($day > 0)) {
  1742.             addType ($key, "Day Month Year 2");
  1743.         }
  1744.         }
  1745.            ###
  1746.        
  1747.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
  1748.         my $day1 = $2;
  1749.         my $day2 = $5;
  1750.         my $completeDate = $1;
  1751.         my $st = length($`);
  1752.         my $key = "$st-".($st + length($1));
  1753.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1754.             addType ($key, "Date range (10)");
  1755.         }
  1756.         }
  1757.        
  1758.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
  1759.         my $day1 = $2;
  1760.         my $day2 = $5;
  1761.         my $completeDate = $1;
  1762.         my $st = length($`);
  1763.         my $key = "$st-".($st + length($1));
  1764.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1765.             addType ($key, "Date range (11)");
  1766.         }
  1767.         }
  1768.        
  1769.        
  1770.         while ($text =~ /\b($m\.?,? ?(of )?\d{2}\d{2}?)\b/ig) { # Apr. 2002
  1771.         my $year = $2;
  1772.         my $completeDate = $1;
  1773.         my $st = length($`);
  1774.         my $key = "$st-".($st + length($1));
  1775.         addType ($key, "Month Year");
  1776.         }
  1777.     }
  1778.     }  
  1779. }
  1780. # End of function date()
  1781.  
  1782. #***********************************************************************************************************
  1783. #***********************************************************************************************************
  1784. #***********************************************************************************************************
  1785. # Function: age()
  1786. # Checks for ages that are >=90 (assuming that no one is over 125 years old, just as a sanity check)
  1787. # When ages are spelled-out, assumes that the number will not be over hundred, i.e. highest spelled-out age="hundred"
  1788.  
  1789. sub age {
  1790.     $text = $_[0];  
  1791.     if ($agefilter =~ /y/) {
  1792.    
  1793.     foreach $i (@age_indicators_suff) {
  1794.         if (($text =~ /\b(ninety)[\s\-]+ *$i\b/ig) || ($text =~ /\b(hundred)[\s\-]+ *$i\b/ig)) {
  1795.         my $age = $1;
  1796.         my $st = length($`);
  1797.         my $key = "$st-".((length $age) + $st);
  1798.         addType ($key, "Age over 90");
  1799.         }
  1800.         while ($text =~ /\b(([A-Za-z]+)([\s \-])([A-Za-z]+)) ? *$i\b/ig) {           
  1801.         foreach $j (@digits) {
  1802.             $first = $2;
  1803.             $second = $4;
  1804.             my $age1 = $1;
  1805.             my $st1 = length($`);
  1806.             my $end1 = (length $age1) + $st1;
  1807.             my $key1 = "$st1-$end1";
  1808.             $st2 = $st1+length($2)+length($3);
  1809.             $end2 = $st2+(length($second));
  1810.             my $key2 = "$st2-$end2";
  1811.             if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig))) {
  1812.             addType ($key1, "Age over 90");    
  1813.             }
  1814.             else {
  1815.             if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
  1816.                 if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
  1817.                 addType ($key2, "Age over 90");
  1818.                 }
  1819.             }
  1820.             }
  1821.         }
  1822.         }
  1823.                
  1824.         while ($text =~ /\b(\d+) *$i/ig) {    
  1825.         my $age = $1;
  1826.         my $st = length($`);
  1827.         my $key = "$st-".((length $age) + $st);
  1828.         if (($age >= 90) && ($age <= 125)) {
  1829.             addType ($key, "Age over 90");
  1830.         }
  1831.         }
  1832.     }
  1833.    
  1834.     foreach $i (@age_indicators_pre) {
  1835.         while ($text =~ /\b($i + *)(([A-Za-z]+)([\s \-])([A-Za-z]+))\b/ig) {         
  1836.         foreach $j (@digits) {
  1837.             $first = $3;
  1838.             $second = $5;
  1839.             my $age1 = $2;
  1840.             my $st1 = length($`)+length($1);
  1841.             my $end1 = (length $age1) + $st1;
  1842.             my $key1 = "$st1-$end1";
  1843.             $st2 = $st1;
  1844.             $end2 = $st2+length($first);
  1845.             my $key2 = "$st2-$end2";
  1846.             if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig) || (length($second)))) {
  1847.             addType ($key1, "Age over 90");    
  1848.             }
  1849.             else {
  1850.             if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
  1851.                 if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
  1852.                 addType ($key2, "Age over 90");
  1853.                 }
  1854.             }
  1855.             }
  1856.         }
  1857.         }
  1858.        
  1859.        
  1860.         while ($text =~ /\b($i + *)(\d+)\b/ig) {    
  1861.         my $age = $2;
  1862.         my $st = length($`)+length($1);
  1863.         my $key = "$st-".((length $age) + $st);
  1864.         if (($age >= 90) && ($age <= 125)) {
  1865.             addType ($key, "Age over 90");
  1866.         }
  1867.         }
  1868.     }
  1869.     }
  1870. }
  1871. # End of function age()
  1872. #***********************************************************************************************************
  1873. #***********************************************************************************************************
  1874. #***********************************************************************************************************
  1875. # Function: pager()
  1876. # Checks for pager numbers
  1877.  
  1878. sub pager {
  1879.     $text = $_[0];
  1880.     if ($telfilter =~ /y/) {
  1881.     foreach $i ("Pager", "Pg", "Pgr", "Page", "Beeper", "Beeper number", "Pager number") {  #removed "P"
  1882.  
  1883.         while ($text =~ /\b$i[\s\#\:\-\=]*([a-zA-Z]\s+)*[a-zA-Z]*\s*(\d\d\d+)\b/gi) {
  1884.         my $num = $2;
  1885.         my $end = length($`)+length($&);
  1886.         my $key = ($end - (length $num))."-$end";
  1887.         addType ($key, "Pager number");
  1888.         }
  1889.         while ($text =~ /\b$i[\s\#\:\-\=]*/gi){
  1890.         my $startp = length($`);
  1891.                 my $endp =  length($`)+length($&);
  1892.         #get the next 30 characters
  1893.         my $the_chunck = (substr $text, $endp, 30);
  1894.                 #now look for a 5-digit number
  1895.         while ($the_chunck =~ /(\D)(\d{5})(\D)/gi){
  1896.             my $pager_startp = $endp + length($`) + length($1);
  1897.             my $pager_endp = $pager_startp + length($2);   
  1898.             my $key = "$pager_startp-$pager_endp";
  1899.             addType ($key, "Pager number");
  1900.            
  1901.         } #end while
  1902.         } #end while
  1903.     }
  1904.     }
  1905. }
  1906. # End of function pager()
  1907. #***********************************************************************************************************
  1908. #***********************************************************************************************************
  1909. #***********************************************************************************************************
  1910. # Function: email()
  1911. # Looks for e-mail addresses
  1912.  
  1913. sub email {
  1914.     $text = $_[0];  
  1915.     if ($emailfilter =~ /y/) {
  1916.     while ($text =~ /\b([\w\.]+\w ?@ ?\w+[\.\w+]\.\w{2,3})\b/g) {
  1917.         my $st = length($`);
  1918.         my $key = "$st-".($st+length($&));
  1919.         addType ($key, "E-mail address");
  1920.     }
  1921.     }
  1922. }
  1923. # End of function email()
  1924. #***********************************************************************************************************
  1925. #***********************************************************************************************************
  1926. #***********************************************************************************************************
  1927. # Function: url()
  1928. # Checks for URLs of different types
  1929.  
  1930. sub url {
  1931.     $text = $_[0];  
  1932.     if ($urlfilter =~ /y/) {
  1933.    
  1934.     while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
  1935.         my $st = length($`);
  1936.         my $key = "$st-".($st+length($&));
  1937.         addType ($key, "URL");
  1938.     }
  1939.     while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
  1940.         my $st = length($`);
  1941.         my $key = "$st-".($st+length($&));
  1942.         addType ($key, "URL");
  1943.     }
  1944.     while ($text =~ /\bwww\.[\w\.]+\w{2,4}\/\S+\b/gi) {
  1945.         my $st = length($`);
  1946.         my $key = "$st-".($st+length($&));
  1947.         addType ($key, "URL");
  1948.     }
  1949.     while ($text =~ /\bwww\.[\w\.]+\w{2,4}\b/gi) {
  1950.         my $st = length($`);
  1951.         my $key = "$st-".($st+length($&));
  1952.         addType ($key, "URL");
  1953.     }
  1954.     while ($text =~ /\bweb\.[\w\.]+\w{2,4}\/\S+\b/gi) {
  1955.         my $st = length($`);
  1956.         my $key = "$st-".($st+length($&));
  1957.         addType ($key, "URL");
  1958.     }
  1959.     while ($text =~ /\bweb\.[\w\.]+\w{2,4}\b/gi) {
  1960.         my $st = length($`);
  1961.         my $key = "$st-".($st+length($&));
  1962.         addType ($key, "URL");
  1963.     }
  1964.  
  1965.     while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\b/g) {
  1966.         my $st = length($`);
  1967.         my $key = "$st-".($st+length($&));
  1968.         addType ($key, "URL");
  1969.     }
  1970.     while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\b/g) {
  1971.         my $st = length($`);
  1972.         my $key = "$st-".($st+length($&));
  1973.         addType ($key, "URL");
  1974.     }
  1975.     }
  1976. }
  1977. # End of function url()
  1978. #***********************************************************************************************************
  1979. #***********************************************************************************************************
  1980. #***********************************************************************************************************
  1981. # Function: ssn()
  1982. # Checks for social security numbers (SSNs)
  1983.  
  1984. sub ssn {
  1985.     $text = $_[0];  
  1986.      
  1987.     if ($ssnfilter =~ /y/) {
  1988.     while ($text =~ /\b\d\d\d([- \/]?)\d\d\1\d\d\d\d\b/g) {
  1989.         my $st = length($`);
  1990.         my $key = "$st-".($st+length($&));
  1991.         addType ($key, "Social Security Number");
  1992.     }
  1993.     }
  1994. }
  1995. # End of function ssn()
  1996. #***********************************************************************************************************
  1997. #***********************************************************************************************************
  1998. #***********************************************************************************************************
  1999. # Function: name1()
  2000. # Uses name prefixes to make last names
  2001.  
  2002. sub name1 {
  2003.     $text = $_[0];  
  2004.     if ($namefilter =~ /y/) {
  2005.  
  2006.     my @keylst = sort numerically keys %phi;
  2007.     my $key;
  2008.     for($i=0;$i<$#keylst;$i++){
  2009.         $key = $keylst[$i];
  2010.         if (isType($key, "Last Prefix", 0)){
  2011.         ($f1,$t1) = split ('-',$key);
  2012.         ($f2,$t2) = split ('-',$keylst[$i+1]);
  2013.         if ($f2 < $t1+3){
  2014.             if (isType($keylst[$i+1], "Last Name", 1)){
  2015.             print "Found Last Prefix match, Adding $keylst[$i+1] type for last name!!";
  2016.             addType("$f1-$t2","Last Name");
  2017.             }
  2018.         }
  2019.         }
  2020.     }
  2021.     #####################################################  
  2022.     # Uses common-sense heuristics to try to find more names
  2023.     foreach $i (@name_indicators){
  2024.      
  2025.         #while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)(\s+)(and )?( *)([A-Za-z]+)\b/ig) {
  2026.         while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)\b/ig) {
  2027.  
  2028.         $potential_name = $6;
  2029.    
  2030.         $start = length($`)+length($1) + length ($2) + length($3) + length($4) + length($5);
  2031.         $end = $start + length($potential_name);
  2032.         $key = "$start-$end";
  2033.         my $tmpstr = substr $text, $start, $end-$start;
  2034.        
  2035.         my $tmp = isType($key, "Name",1);
  2036.          
  2037.  
  2038.         if (isProbablyName($key, $potential_name)){    
  2039.             addType ($key, "Name (NI)");
  2040.         } # end if the first word after the name indicator is a name
  2041.  
  2042.          my $new_start = $end + length($6) + length($7);
  2043.          my $new_end = $new_start + length($8);
  2044.  
  2045.  
  2046.          #########now check the next word
  2047.          my $rest = substr $text, $end+1, 20;
  2048.         if (($rest =~ /\b(and )?( *)([A-Za-z]+)\b/ig)){
  2049.             my $new_start = $end + 1 + length($`) + length($1)+length($2);
  2050.             my $new_end = $new_start + length ($3);
  2051.  
  2052.             my $keyAfter = "$new_start-$new_end";
  2053.             my $wordAfter = substr $rest,   (length ($`)+ length($1) + length($2)) , length ($3);
  2054.          
  2055.  
  2056.             if ( !isNameIndicator($wordAfter) && ( (  !isCommon($wordAfter) ||
  2057.             ((isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)"))  ||
  2058.             (isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ||
  2059.             (!isCommonest($wordAfter) && isType($keyAfter, "Name", 1)) ||
  2060.             (isType($keyAfter, "popular",1)) ) )  ) ) {
  2061.    
  2062.             if ($rest =! /\b[\d]\b/ig) { #disregard if the rest contains numeric patterns
  2063.                 addType ($keyAfter, "Name2 (NI)");
  2064.             }
  2065.             }
  2066.  
  2067.             elsif ($1 =~ /and/ig){
  2068.             if (! (isCommon($wordAfter) || isNameIndicator($wordAfter))) {
  2069.                 addType ($keyAfter, "Name2 (NI)");
  2070.             }
  2071.             }
  2072.         } #end if rest
  2073.          
  2074.        
  2075.         } # end of while
  2076.  
  2077.     } #end for each name indicator
  2078.  
  2079.     #**********************************************************************************************************
  2080.     # Searches for the name pattern LASTNAME, FIRSTNAME
  2081.     # First checks if word2 is a firstname
  2082.     # If it is, then if word1 is not a common or commonest word, identifies word1 as a lastname
  2083.     while ($text =~ /\b([A-Za-z]+)( ?\, ?)([A-Za-z]+)\b/ig) {
  2084.         $input1 = $1;
  2085.         $input2 = $3;
  2086.         my $st1 = length($`);
  2087.         my $end1 = $st1 + length($input1);
  2088.         my $key = $key1;
  2089.         my $key1 = "$st1-$end1";
  2090.         my $st2 = $end1+length($2);
  2091.         my $end2 = $st2 + length($input2);
  2092.         my $key = $key2;
  2093.         my $key2 = "$st2-$end2";
  2094.        
  2095.         if ((isType($key2, "Name", 1)) && (isType($key1, "Name (ambig)", 1)) && (!isNameIndicator($input1)) ) {
  2096.         addType ($key1, "Last Name (LF)");
  2097.         addType ($key2, "First Name1 (LF)");
  2098.         }
  2099.        
  2100.         if ((isType($key1, "Name", 1)) && (isType($key2, "Name (ambig)", 1)) && (!isNameIndicator($input1))  ) {
  2101.         addType ($key2, "Last Name (LF)");
  2102.         addType ($key1, "First Name2 (LF)");
  2103.         }
  2104.        
  2105.         if (isType($key2, "First Name", 1)) {
  2106.         if (   (isType($key1, "Last Name", 1) && (!isCommonest($input1)) &&  (!isNameIndicator($input1))) ||
  2107.                ((!isCommon($input1)) && (!isCommonest($input1)))   ) {
  2108.             addType ($key1, "Last Name (LF)");
  2109.             addType ($key2, "First Name3 (LF)");
  2110.         }
  2111.         }  
  2112.     }
  2113.     }
  2114. }
  2115. # End of function name1()
  2116. #***********************************************************************************************************
  2117. #***********************************************************************************************************
  2118. #***********************************************************************************************************
  2119. # Function: seasonYear()
  2120. # Checks for both season and year from patterns like "<season> of? <year>"
  2121.  
  2122. sub seasonYear {
  2123.     $text = $_[0];  
  2124.    
  2125.     @seasons = ("winter", "spring", "summer", "autumn", "fall");
  2126.    
  2127.     foreach $i (@seasons) {
  2128.         while ($text =~ /\b(($i)(( +)of( +))? ?\,?( ?)\'?(\d{2}|\d{4}))\b/gi) {
  2129.         $f2=$2;
  2130.         $f3=$3;
  2131.         $f6=$6;
  2132.         $f7=$7;
  2133.         if (length($f7)==4) {
  2134.             if (($f7<=$longyear) && ($f7>1900)) {  
  2135.             my $st1 = length($`);
  2136.             my $end1 = (length $f2) + $st1;
  2137.             my $key1 = "$st1-$end1";
  2138.            
  2139.             my $st2 = $end1+(length $f3)+(length $f6);
  2140.             my $key2 = "$st2-".((length $f7) + $st2);
  2141.             addType ($key2, "Year (4 digits)");    
  2142.             }
  2143.         }
  2144.         else {                   
  2145.             my $st1 = length($`);
  2146.             my $end1 = (length $f2) + $st1;
  2147.             my $key1 = "$st1-$end1";
  2148.            
  2149.             my $st2 = $end1+(length $f3)+(length $f6);
  2150.             my $key2 = "$st2-".((length $f7) + $st2);
  2151.             addType ($key2, "Year (4 digits)");                    
  2152.         }
  2153.         }
  2154.     }
  2155.    
  2156. }
  2157. # End of function seasonYear()
  2158. #***********************************************************************************************************
  2159. #***********************************************************************************************************
  2160. #***********************************************************************************************************
  2161. # Function: name2()
  2162. # Checks for more than 1 name following plural drs, drs., doctors, professors,
  2163. # Checks up to 3 names
  2164.  
  2165. sub name2 {
  2166.     $text = $_[0];  
  2167.     if ($namefilter =~ /y/) {
  2168.  
  2169.     @plural_titles = ("doctors", "drs", "drs\.", "professors");
  2170.    
  2171.     foreach $p (@plural_titles) {
  2172.     while ($text =~ /\b((($p +)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+)?\b/gi) {  
  2173.         my %names = ();
  2174.         $st3 = length($`);
  2175.         $st4 = $st3+length($3);
  2176.         $end4 = $st4+length($4);
  2177.         $key4 = "$st4-$end4";
  2178.         $st6 = $st3+length($2);
  2179.         $end6 = $st6+length($6);
  2180.         $key6 = "$st6-$end6";
  2181.         $st8 = $st3+length($1);
  2182.         $end8 = $st8+length($8);
  2183.         $key8= "$st8-$end8";
  2184.         $names{$4} = $key4;
  2185.         $names{$6} = $key6;
  2186.         $names{$8} = $key8;
  2187.         foreach $i (keys %names) {
  2188.         $val = $names{$i};
  2189.         if (length($i)>0) {
  2190.             if (!(isCommonest($i)) || (isType($val, "Name", 1))) {
  2191.             #print "addtype, val is $val\n";
  2192.             addType ($val, "Name5 (PTitle)");
  2193.             }
  2194.         }
  2195.         }
  2196.     }
  2197.     }
  2198.  
  2199.  
  2200.  
  2201.  
  2202. #****************************************************************************************
  2203. # Checks for names followed by "MD" or "M.D"
  2204. # Checks up to 3 previous words
  2205.     while ($text =~ /\b((([A-Za-z\']+) +)?(([A-Za-z])\. +)?([A-Za-z\-\']+)((\, *)|(\s+))(rrt|md|m\.d\.|crt|np|rn|nnp|msw|r\.n\.)(\.|\,)*\b)/ig) {
  2206.    
  2207.     $name = $1;
  2208.     $start = length($`);
  2209.     $end = $start + length($name);
  2210.     $key = "$start-$end";
  2211.    
  2212.     $name1 = $3; #if present, would be first name
  2213.     $name2 = $4;  #if present, would be initial
  2214.     $name3 = $6;  #if present would be last name
  2215.  
  2216.     $st1 = length($`);
  2217.     $end1 = $st1 + (length $name1);
  2218.     $key1 = "$st1-$end1";
  2219.  
  2220.     $st2 = $st1 + length($2);
  2221.     $end2 = $st2 + (length $name2);
  2222.     $key2 = "$st2-$end2";  
  2223.  
  2224.     $st3 = length($`) + (length $2) + (length $4);
  2225.     $end3 = $st3 + (length $name3);
  2226.     $key3 = "$st3-$end3";
  2227.    
  2228.     if ((!($text =~ /(m\.?d\.?\')/)) && (!($text =~ /(m\.?d\.?s)/))) {
  2229.    
  2230.         if (length($name1)>0) {
  2231.             if ((length($name1)==1) || ((length($name1)==2) && ($name1 =~ /\b([A-Za-z])\.+\b/ig))) {  
  2232.             addType($key1, "Name Initial (MD)");
  2233.         }  
  2234.         else {
  2235.             if (isProbablyName($key1, $name1)) {
  2236.             addType($key1,"Name6 (MD)" );
  2237.             }
  2238.         }
  2239.         }
  2240.  
  2241.         if (length($name2)>0 && length($name3)>0) {
  2242.         if ((length($name2)==1) || ((length($name2)==2) && ($name2 =~ /\b([A-Za-z])\.+\b/ig))) {    
  2243.              #addType($key2, "Name Initial (MD)");
  2244.         }  
  2245.         else {
  2246.             #if (!(isCommon($name2) && !(isType($key2, "Name", 1)))) {
  2247.             if (isProbablyName($key2, $name2)) {
  2248.             addType($key2,"Name7 (MD)" );
  2249.             }
  2250.         }
  2251.         }
  2252.  
  2253.         if (length($name3)>0) {
  2254.         if ((length($name3)==1) || ((length($name3)==2) && ($name3 =~ /\b([A-Za-z])\.\b/ig))) {  
  2255.             #addType($key3, "Name Initial (MD)");
  2256.         }  
  2257.         else {
  2258.             if (isProbablyName($key3, $name3)) {   
  2259.            
  2260.             addType($key3,"Name8 (MD)" );
  2261.  
  2262.             }
  2263.         } # end else
  2264.         } #endif
  2265.     } #end if text does not have M.D.' or M.D.s
  2266.     }
  2267.  
  2268.  
  2269. #****************************************************************************************
  2270. # Removes PCP name field, leaving "PCP name:" intact, from discharge summaries
  2271. # Does not check for name patterns, since these should be caught by the other methods
  2272. # Required mainly for unknown names, checks up to 3 words following "PCP Name"
  2273. # Follows the pattern seen in discharge summaries, may not work well in nursing notes
  2274.  
  2275.     #@name_pre = ("PCP", "physician", "provider", "created by", "name");
  2276.     @name_pre = ("PCP", "physician", "provider", "created by");
  2277.    
  2278.     foreach $l (@name_pre) {
  2279.     while ($text =~/\b(($l( +name)?( +is)?\s\s*)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)(((\s*\,*\s*)? *)([A-Za-z\-]+))?)\b/ig) {  
  2280.         my $key1 = $5;
  2281.         my $st1 = length($`)+(length $2);
  2282.         my $end1 = $st1+(length $5);
  2283.         my $keyloc1 = "$st1-$end1";
  2284.         my $key2 = $8;
  2285.         my $st2 = $end1+(length $6);
  2286.         my $end2 = $st2+(length $8);
  2287.         my $keyloc2 = "$st2-$end2";
  2288.         my $key3 = $12;
  2289.         my $st3 = $end2+(length $10);
  2290.         my $end3 = $st3+(length $12);
  2291.         my $keyloc3 = "$st3-$end3";    
  2292.         my %pcp = ();
  2293.         $pcp{$keyloc1} = $key1;
  2294.         $pcp{$keyloc2} = $key2;
  2295.         $pcp{$keyloc3} = $key3;
  2296.        
  2297.         foreach my $keyloc (keys %pcp ) {
  2298.         my $val = $pcp{$keyloc};
  2299.         if (length($val)>0) {
  2300.             if ((length($val)==1) || ($val =~ /\b([A-Za-z])\.\b/ig)) {    
  2301.             addType($keyloc, "Name Initial (PRE)");
  2302.             }  
  2303.             else {
  2304.             #if (!(isCommonest($val) && !(isType($keyloc, "Name", 1)))) {
  2305.             if (isProbablyName($keyloc, $val)){
  2306.            
  2307.                 addType($keyloc,"Name9 (PRE)" );
  2308.             }
  2309.             }
  2310.         }
  2311.         }
  2312.     }
  2313.        
  2314.         #followed by pattern "name is"
  2315.     while ($text =~ /\b(($l( +name)?( +is)? ?([\#\:\-\=\.\,])+ *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)?)\b/ig) {
  2316.         my $key1 = $6;
  2317.         my $st1 = length($`)+(length $2);
  2318.         my $end1 = $st1+(length $6);
  2319.         my $keyloc1 = "$st1-$end1";
  2320.         my $key2 = $9;
  2321.         my $st2 = $end1+(length $7);
  2322.         my $end2 = $st2+(length $9);
  2323.         my $keyloc2 = "$st2-$end2";
  2324.         my $key3 = $12;
  2325.         my $st3 = $end2+(length $10);
  2326.         my $end3 = $st3+(length $12);
  2327.         my $keyloc3 = "$st3-$end3";
  2328.         my %pcp = ();
  2329.         my $firstfound = 0;
  2330.         my $secondfound = 0;
  2331.         $pcp{$keyloc1} = $key1;
  2332.         $pcp{$keyloc2} = $key2;
  2333.         $pcp{$keyloc3} = $key3;
  2334.         $blah = isCommonest($key3);
  2335.         $blah2 = isType($keyloc3, "Name", 1);
  2336.        
  2337.         if (length($key1)>0) {
  2338.         if ((length($key1)==1) || ($key1 =~ /\b([A-Za-z])\.\b/ig)) {
  2339.             addType($keyloc1, "Name Initial (NameIs)");
  2340.             $firstfound = 1;
  2341.         }
  2342.         else {
  2343.             if (isProbablyName($keyloc1, key1)){
  2344.             addType($keyloc1,"Name10 (NameIs)" );
  2345.             $firstfound = 1;
  2346.             }
  2347.         }
  2348.         }
  2349.         if ($firstfound == 1) {
  2350.         if (length($key2)>0) {
  2351.             if ((length($key2)==1) || ($key2 =~ /\b([A-Za-z])\.\b/ig)) {
  2352.             addType($keyloc2, "Name Initial (NameIs)");
  2353.             $secondfound = 1;
  2354.             }
  2355.             else {
  2356.             if (isProbablyName($keyloc2, $key2)){
  2357.                 addType($keyloc2,"Name11 (NameIs)" );
  2358.                 $secondfound = 1;
  2359.             }
  2360.             }
  2361.         }
  2362.         }
  2363.         if ($secondfound == 1) {
  2364.         if (length($key3)>0) {
  2365.             if ((length($key3)==1) || ($key3 =~ /\b([A-Za-z])\.\b/ig)) {
  2366.             addType($keyloc3, "Name Initial (NameIs)");
  2367.             }
  2368.             else {
  2369.             if (isProbablyName ($keyloc3, $key3)){
  2370.                 addType($keyloc3,"Name12 (NameIs)" );
  2371.             }
  2372.             }
  2373.         }
  2374.         }
  2375.     }
  2376.     }
  2377. }
  2378. }
  2379. # End of function name2()
  2380. #***********************************************************************************************************
  2381. #***********************************************************************************************************
  2382. #***********************************************************************************************************
  2383. # Function: providerNumber()
  2384. # Removes "provider number", whole field, from discharge summaries
  2385. # Does not remove the field if there is no number following it
  2386.  
  2387. sub providerNumber {
  2388.     $text = $_[0];      
  2389.     if ($unitfilter =~ /y/) {
  2390.    
  2391.     while ($text =~ /\b(provider(( +)number)?( ?)[\#\:\-\=\s\.]?( ?)(\d+)([\/\-\:](\d+))?)\b/gi) {  
  2392.         my $unit = $1;
  2393.         my $st = length($`);
  2394.         my $key = "$st-".((length $unit) + $st);
  2395.         addType ($key, "Provider Number");
  2396.     }
  2397.     }
  2398. }
  2399. # End of function providerNumber()
  2400. #***********************************************************************************************************
  2401. #***********************************************************************************************************
  2402. #***********************************************************************************************************
  2403. # Function: signatureField()
  2404. # Removes signature fields from discharge summaries
  2405. # signature field taken as 3 or more underscores
  2406. # does not remove doctor names, since these are handled by other name handlers
  2407.    
  2408. sub signatureField {
  2409.     $text = $_[0];  
  2410.  
  2411.     while ($text =~ /\b(\_\_+)\b/gi) {
  2412.    
  2413.     my $sigfield = $1;
  2414.     my $st = length($`);
  2415.     my $key = "$st-".((length $sigfield) + $st);
  2416.     addType ($key, "Signature");
  2417.     }
  2418. }
  2419. # End of function signatureField()
  2420. #***********************************************************************************************************
  2421. #***********************************************************************************************************
  2422. #***********************************************************************************************************
  2423. # Function: location1()
  2424. # Removes complete street addresses only when including one of the terms in @street_add_suff
  2425.  
  2426. sub location1 {
  2427.     $text = $_[0];    
  2428.  
  2429.     if ($locfilter =~ /y/) {
  2430.     #check if ambiguous locations are preceded by any of the location indicators,
  2431.         #if so, add in PHI list
  2432.     foreach $i (@location_indicators){
  2433.      
  2434.         while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
  2435.         #print "Location Match 1 is $1, 2 is $2, 3 is $3\n";
  2436.         my $st = length ($`) +  length ($1) + length ($2);
  2437.                 my $end = $st + length ($3);
  2438.         my $key = "$st-$end";
  2439.         my $word = substr $text, $st, ($end-$st);
  2440.         #print "word is $word\n";
  2441.            
  2442.         if (isType($key, "Location",1)  || (length ($word) > 1 && !(isCommon($word))) ) {  
  2443.            
  2444.             addType ($key, "Location");
  2445.         }
  2446.         }
  2447.     } #end for each i in location indicators
  2448.  
  2449.  
  2450.  
  2451.     #if the company dictionary is loaded, check if any of the ambiguous company names
  2452.     #are preceded by the employment indicators
  2453.     if ($company_list =~/y/){
  2454.         #print "company list is on\n";
  2455.         foreach $i (@employment_indicators_pre){
  2456.         while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
  2457.            
  2458.             $st = length ($`) +  length ($1) + length ($2);
  2459.             $end = $st + length ($3);
  2460.             $key = "$st-$end";
  2461.             my $word = substr $text, $st, ($end-$st);
  2462.            
  2463.             my $tmp = isCommon($word);
  2464.            
  2465.  
  2466.             if (isType($key, "Company",1)|| (length ($word) > 1 && !(isCommon($word)))  ) {
  2467.             #print "adding  $3 key $key as company unambig";
  2468.             addType ($key, "Company");
  2469.             }  
  2470.         }
  2471.         } #end for each i in location indicators
  2472.     }
  2473.  
  2474.  
  2475.  
  2476.     #strict address suffix, case-sensitive match, PHI regardless of ambiguity
  2477.     foreach $i (@strict_street_add_suff) { 
  2478.         #make it a case-sensitive match for street address suffix
  2479.         while ($text =~ /\b(([0-9]+ +)?(([A-Za-z\.\']+) +)?([A-Za-z\.\']+) +\b$i\.?\b)\b/g) {
  2480.  
  2481.         $st = length($`);
  2482.         $end = $st + length($1);
  2483.  
  2484.         #check next segment for apartment, suite, floor #s
  2485.         my $nextSeg = substr $text, $end, 30;
  2486.         #print "check nextSeg for apt and sutie #, seg is  $nextSeg\n";
  2487.         foreach $k (@apt_indicators){
  2488.             if ($nextSeg =~ /\b($k\.?\#? +[\w]+)\b/gi) {
  2489.             $end += length ($`) + length($1);
  2490.             }
  2491.         }
  2492.         $key = "$st-$end";
  2493.         #addType ($key, "Street Address");
  2494.  
  2495.         if (length($3) == 0) {
  2496.             if (!isUnambigCommon($5)) {
  2497.             addType ($key, "Street Address");
  2498.             }
  2499.         }
  2500.         elsif (!((isUnambigCommon($4)) && (isUnambigCommon($5)))) {
  2501.             addType($key, "Street Address");
  2502.         }
  2503.         } # end while
  2504.     } #end foreach
  2505.     }#end if
  2506.  
  2507.     #Non-strict address suffix, case-insensitive match, PHI if no ambiguity
  2508.     if ($locfilter =~ /y/) {
  2509.    
  2510.     foreach $i (@street_add_suff) {
  2511.  
  2512.         while ($text =~ /\b(([0-9]+) +(([A-Za-z]+) +)?([A-Za-z]+) +$i)\b/gi) {
  2513.         $st = length($`);
  2514.         $end = $st + length($1);
  2515.         $key = "$st-$end";
  2516.  
  2517.         if (length($3) == 0) {
  2518.         if (!isUnambigCommon($word)){
  2519.             addType ($key, "Street Address");
  2520.         }
  2521.         }
  2522.         elsif ( ! (isUnambigCommon($4) || isUnambigCommon($5))){
  2523.         addType($key, "Street Address");
  2524.         }
  2525.     }
  2526.     }
  2527. }
  2528. #****************************************************************************************
  2529.     # Removes 2-word location PHI ending with @loc_indicators_suff or preceded by @loc_indicators_pre
  2530.  
  2531.     # Words potentially PHI
  2532.     if ($locfilter =~ /y/) {
  2533.    
  2534.     foreach $i (@loc_indicators_suff) {
  2535.         while ($text =~ /\b(([A-Za-z\-]+)? +)?(([A-Za-z\-]+) + *$i +)\b/ig) {
  2536.         if (!isCommon($4)) {       
  2537.             $st2 = length($`)+length($1);
  2538.             $end2 = $st2 + length($3);
  2539.             $key2 = "$st2-$end2";
  2540.             addType ($key2, "Location");
  2541.            
  2542.             if (length $2>0) {
  2543.             if (!isCommon($2)) {
  2544.                 $st1 = length($`);
  2545.                 $end1 = $st1 + length($2);
  2546.                 $key1 = "$st1-$end1";
  2547.                 addType ($key1, "Location");}
  2548.             }
  2549.            
  2550.         }
  2551.         }  
  2552.     }
  2553.     }
  2554.     if ($locfilter =~ /y/) {
  2555.    
  2556.     # Words most likely PHI
  2557.     foreach $i (@loc_ind_suff_c) {
  2558.        
  2559.         while ($text =~ /\b(([A-Za-z]+ +)?)(([A-Za-z]+)$i+)\b/ig) {
  2560.         if (!isCommon($3)) {       
  2561.             $st2 = length($`)+length($1);
  2562.             $end2 = $st2 + length($3);
  2563.             $key2 = "$st2-$end2";
  2564.             addType ($key2, "Location");
  2565.         }
  2566.         }
  2567.     }
  2568.     }
  2569.    
  2570.     if ($locfilter =~ /y/) {
  2571.    
  2572.     # Words potentially PHI
  2573.     foreach $i (@loc_indicators_pre) {
  2574.         while ($text =~ /\b((($i + *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
  2575.         if (!isCommon($4)) {       
  2576.             $st2 = length($`);
  2577.             $end2 = $st2 + length($3);
  2578.             $key2 = "$st2-$end2";
  2579.             addType ($key2, "Location");
  2580.            
  2581.             if (length $5>0) {
  2582.             if (!isCommon($5)) {
  2583.                 $st1 = length($`)+length($2);
  2584.                 $end1 = $st1 + length($5);
  2585.                 $key1 = "$st1-$end1";
  2586.                 addType ($key1, "Location");
  2587.             }
  2588.             }              
  2589.         }
  2590.         }
  2591.     }
  2592.     } # end if locfilter =~ /y/
  2593.  
  2594.  
  2595.     @universities_pre = ("University", "U", "Univ", "Univ.");
  2596.  
  2597.     #catches "University of", "U of", "Univ of", "Univ. of"
  2598.     if ($locfilter =~ /y/) {
  2599.    
  2600.     # Words potentially PHI
  2601.     foreach $i (@universities_pre) {
  2602.         while ($text =~ /\b((($i +of *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
  2603.         my $tmp = isUSStateAbbre($4);
  2604.  
  2605.         if (isUSStateAbbre($4) || isUSState($4) ||  !isCommon($4) ) {      
  2606.             $st2 = length($`);
  2607.             $end2 = $st2 + length($3);
  2608.             $key2 = "$st2-$end2";
  2609.             addType ($key2, "Location");
  2610.            
  2611.             if (length $5>0) {
  2612.             if (!isCommon($5)) {
  2613.                 $st1 = length($`)+length($2);
  2614.                 $end1 = $st1 + length($5);
  2615.                 $key1 = "$st1-$end1";
  2616.                 addType ($key1, "Location (Universities)");
  2617.             }
  2618.             }              
  2619.         }
  2620.         }
  2621.     }
  2622.     } # end if locfilter =~ /y/
  2623.  
  2624.    
  2625.  
  2626. }
  2627. # End of function location1()
  2628.  
  2629. #***********************************************************************************************************
  2630. #***********************************************************************************************************
  2631. #***********************************************************************************************************
  2632. # Function: location2()
  2633. # Searches for multiple hospital and location terms
  2634.  
  2635. sub location2 {
  2636.     $text = $_[0];
  2637.     if ($locfilter =~ /y/) {
  2638.    
  2639.     foreach $hos (@hospital) {
  2640.        
  2641.         my @hospital_terms = split " ", $hos;
  2642.         $len = 0;
  2643.         foreach $h (@hospital_terms) {
  2644.         if (length($h) != 0) {
  2645.             $len = $len+1;
  2646.             $hos[$len] = $h;
  2647.         }
  2648.         }
  2649.        
  2650.         if ($len == 1) {
  2651.         while ($text =~ /\b($hos[1])\b/ig) {
  2652.             $hospital = $1;
  2653.             $st = length($`);
  2654.             $end = $st + length($hospital);
  2655.             $key = "$st-$end";
  2656.             addType($key, "Hospital1");    
  2657.         }
  2658.         }
  2659.        
  2660.         if ($len == 2) {
  2661.         while ($text =~ /\b($hos[1])( )($hos[2])\b/ig) {
  2662.            
  2663.             $hos1 = $1;
  2664.             $hos2 = $3;
  2665.             $space = $2;
  2666.             $st1 = length($`);
  2667.             $end1 = $st1 + length($hos1);
  2668.             $key1 = "$st1-$end1";
  2669.             addType($key1, "Hospital2");
  2670.             $st2 = $end1 + length($space);
  2671.             $end2 = $st2 + length($hos2);
  2672.             $key2 = "$st2-$end2";
  2673.             addType($key2, "Hospital3");       
  2674.         }
  2675.         }
  2676.        
  2677.         if ($len == 3) {
  2678.         while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])\b/ig) {
  2679.             $hos1 = $1;
  2680.             $hos2 = $3;
  2681.             $hos3 = $5;
  2682.             $st1 = length($`);
  2683.             $end1 = $st1 + length($hos1);
  2684.             $key1 = "$st1-$end1";
  2685.             addType($key1, "Hospital4");
  2686.             $st2 = $end1 + length($2);
  2687.             $end2 = $st2 + length($hos2);
  2688.             $key2 = "$st2-$end2";
  2689.             addType($key2, "Hospital5");
  2690.             $st3 = $end2 + length($4);
  2691.             $end3 = $st3 + length($hos3);
  2692.             $key3 = "$st3-$end3";
  2693.             addType($key3, "Hospital6");
  2694.         }
  2695.         }
  2696.                        
  2697.         if ($len == 4) {
  2698.         while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])( )($hos[4])\b/ig) {
  2699.             $hos1 = $1;
  2700.             $hos2 = $3;
  2701.             $hos3 = $5;    
  2702.             $st1 = length($`);
  2703.             $end1 = $st1 + length($hos1);
  2704.             $key1 = "$st1-$end1";
  2705.             addType($key1, "Hospital");
  2706.             $st2 = $end1 + length($2);
  2707.             $end2 = $st2 + length($hos2);
  2708.             $key2 = "$st2-$end2";
  2709.             addType($key2, "Hospital");
  2710.             $st3 = $end2 + length($4);
  2711.             $end3 = $st3 + length($hos3);
  2712.             $key3 = "$st3-$end3";
  2713.             addType($key3, "Hospital");
  2714.             $st4 = $end3 + length($6);
  2715.             $end4 = $st4 + length($hos4);
  2716.             $key4 = "$st4-$end4";
  2717.             addType($key4, "Hospital");
  2718.         }
  2719.         }          
  2720.     }
  2721.        
  2722.     foreach $loc (@loc_unambig) {
  2723.        
  2724.         my @loc_terms = split " ", $loc;
  2725.         $len = 0;
  2726.         foreach $h (@loc_terms) {
  2727.         if (length($h) != 0) {
  2728.             $len = $len+1;
  2729.             $loc[$len] = $h;
  2730.         }
  2731.         }
  2732.        
  2733.         if ($len == 1) {
  2734.         while ($text =~ /\b($loc[1])\b/ig) {
  2735.             $location = $1;
  2736.             $st = length($`);
  2737.             $end = $st + length($location);
  2738.             $key = "$st-$end";
  2739.             addType($key, "Location");     
  2740.         }
  2741.         }
  2742.        
  2743.         if ($len == 2) {
  2744.         while ($text =~ /\b($loc[1])( )($loc[2])\b/ig) {
  2745.             $loc1 = $1;
  2746.             $loc2 = $3;
  2747.             $st1 = length($`);
  2748.             $end1 = $st1 + length($loc1);
  2749.             $key1 = "$st1-$end1";
  2750.             addType($key1, "Location");
  2751.             $st2 = $end1 + length($2);
  2752.             $end2 = $st2 + length($loc2);
  2753.             $key2 = "$st2-$end2";
  2754.             addType($key2, "Location");    
  2755.         }
  2756.         }
  2757.        
  2758.         if ($len == 3) {
  2759.         while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])\b/ig) {
  2760.             $loc1 = $1;
  2761.             $loc2 = $3;
  2762.             $loc3 = $5;
  2763.             $st1 = length($`);
  2764.             $end1 = $st1 + length($loc1);
  2765.             $key1 = "$st1-$end1";
  2766.             addType($key1, "Location");
  2767.             $st2 = $end1 + length($2);
  2768.             $end2 = $st2 + length($loc2);
  2769.             $key2 = "$st2-$end2";
  2770.             addType($key2, "Location");
  2771.             $st3 = $end2 + length($4);
  2772.             $end3 = $st3 + length($loc3);
  2773.             $key3 = "$st3-$end3";
  2774.             addType($key3, "Location");
  2775.         }
  2776.         }
  2777.                    
  2778.         if ($len == 4) {
  2779.         while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])( )($loc[4])\b/ig) {
  2780.             $loc1 = $1;
  2781.             $loc2 = $3;
  2782.             $loc3 = $5;    
  2783.             $st1 = length($`);
  2784.             $end1 = $st1 + length($loc1);
  2785.             $key1 = "$st1-$end1";
  2786.             addType($key1, "Location");
  2787.             $st2 = $end1 + length($2);
  2788.             $end2 = $st2 + length($loc2);
  2789.             $key2 = "$st2-$end2";
  2790.             addType($key2, "Location");
  2791.             $st3 = $end2 + length($4);
  2792.             $end3 = $st3 + length($loc3);
  2793.             $key3 = "$st3-$end3";
  2794.             addType($key3, "Location");
  2795.             $st4 = $end3 + length($6);
  2796.             $end4 = $st4 + length($loc4);
  2797.             $key4 = "$st4-$end4";
  2798.             addType($key4, "Location");
  2799.         }
  2800.         }
  2801.     }
  2802.  
  2803.         #######
  2804.         #PO Box number
  2805.    
  2806.         while ($text =~ /\b(P[\.]?O[\.]? *Box *[\#]? *[0-9]+)\b/gi) {
  2807.           $location = $1;
  2808.           $st = length($`);
  2809.           $end = $st + length($location);
  2810.           $key = "$st-$end";
  2811.           addType($key, "PO Box");   
  2812.     }
  2813.  
  2814.    
  2815.  
  2816.  
  2817.  
  2818.         ######
  2819.         #Zipcodes
  2820.     foreach $loc (@us_states_abbre) {
  2821.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2822.             $location = $1;
  2823.             $st = length($`);
  2824.             $end = $st + length($location);
  2825.             $key = "$st-$end";
  2826.             addType($key, "State/Zipcode");  
  2827.         }
  2828.  
  2829.     }
  2830.  
  2831.         #Zipcodes with more US states abbreviations
  2832.     foreach $loc (@more_us_states_abbre) {
  2833.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2834.             $location = $1;
  2835.             $st = length($`);
  2836.             $end = $st + length($location);
  2837.             $key = "$st-$end";
  2838.             addType($key, "State/Zipcode");  
  2839.         }
  2840.  
  2841.     }      
  2842.     #Zipcodes with full US state names
  2843.     foreach $loc (@us_states) {
  2844.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2845.             $location = $1;
  2846.             $st = length($`);
  2847.             $end = $st + length($location);
  2848.             $key = "$st-$end";
  2849.             addType($key, "State/Zipcode");  
  2850.         }
  2851.  
  2852.     }
  2853.         ##########
  2854.         #remove US states if filter flag for State is on
  2855.  
  2856.     if ($us_state_filter =~ /y/) {
  2857.  
  2858.       foreach $loc (@us_states) {
  2859.        
  2860.         my @loc_terms = split " ", $loc;
  2861.         $len = 0;
  2862.         foreach $h (@loc_terms) {
  2863.         if (length($h) != 0) {
  2864.             $len = $len+1;
  2865.             $loc[$len] = $h;
  2866.         }
  2867.         }
  2868.        
  2869.         if ($len == 1) {
  2870.         while ($text =~ /\b($loc[1])\b/ig) {
  2871.             $location = $1;
  2872.             $st = length($`);
  2873.             $end = $st + length($location);
  2874.             $key = "$st-$end";
  2875.             addType($key, "State");    
  2876.         }
  2877.         }
  2878.        
  2879.         if ($len == 2) {
  2880.         while ($text =~ /\b(($loc[1])( )($loc[2]))\b/ig) {
  2881.             $location = $1;
  2882.             $st = length($`);
  2883.             $end = $st + length($location);
  2884.             $key = "$st-$end";
  2885.             addType($key, "State");      
  2886.         }
  2887.         }
  2888.        
  2889.         if ($len == 3) {
  2890.         while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3]))\b/ig) {
  2891.             $location = $1;
  2892.             $st = length($`);
  2893.             $end = $st + length($location);
  2894.             $key = "$st-$end";
  2895.             addType($key, "State");
  2896.         }
  2897.         }
  2898.                    
  2899.         if ($len == 4) {
  2900.         while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3])( )($loc[4]))\b/ig) {
  2901.             $location = $1;
  2902.             $st = length($`);
  2903.             $end = $st + length($location);
  2904.             $key = "$st-$end";
  2905.             addType($key, "State");
  2906.            
  2907.         }
  2908.         }
  2909.     }
  2910.       } #end if us_state_filter is on
  2911.  
  2912.  
  2913.  
  2914.  
  2915.  
  2916.  
  2917.  
  2918.  
  2919.         #######
  2920.     # Sub-function: hospitalIndicators()
  2921.     # Searches for hospital indicators and checks if previous and following words are hospitals
  2922.    
  2923.     foreach $h (@hospital_indicators) {
  2924.  
  2925.         while ($text =~ /((([A-Za-z\-\']+)( + *))?(([A-Za-z\-\']+)( + *))?($h\b)(( + *)([A-Za-z\-\']+))?(( + *)([A-Za-z\-\']+))?\b)/ig) {  
  2926.  
  2927.        
  2928.         my $typeadded = 0;
  2929.         $st1 = length($`);
  2930.         $end1 = $st1 + length($3);
  2931.         $key1 = "$st1-$end1";
  2932.         $st2 = $st1 + length($2);
  2933.         $end2 = $st2 + length($6);
  2934.         $key2 = "$st2-$end2";
  2935.         $st3 = $st1 + length($2) + length ($5) + length($8) + length($10);
  2936.         $end3 = $st3 + length($11);
  2937.         $key3 = "$st3-$end3";
  2938.         $st4 = $end3 + length($13);
  2939.         $end4 = $st4 + length($14);
  2940.         $key4 = "$st4-$end4";
  2941.         $st5 = $end2 + length($7);
  2942.         $end5 = $st5 + length($8);
  2943.         $key5 = "$st5-$end5";
  2944.  
  2945.        
  2946.         if (length($5)==0) {
  2947.        
  2948.             if ((length($3) > 1) &&  (!isUnambigCommon($3)) && (!(isCommon ($3)) || (isType ($key1, "Hospital", 1)))) {
  2949.             addType ($key1, "Hospital");
  2950.             #addType ($key5, "Hospital-Ind");
  2951.             $typeadded = 1;
  2952.             }
  2953.         }
  2954.  
  2955.          elsif ((length($6) > 1) && (!isUnambigCommon($6)) && (!(isCommon ($6)) || isUSState($6) || isUSStateAbbre($6) || (isType ($key2, "Hospital", 1)))) {
  2956.    
  2957.             addType ($key2, "Hospital");
  2958.                     #addType ($key5, "Hospital-Ind");
  2959.             $typeadded = 1;
  2960.            
  2961.             if ((length($3) > 1) &&  (!isUnambigCommon($3)) &&  (!(isCommon ($3)) || isUSState($3) || isUSStateAbbre($3) || (isType ($key1, "Hospital", 1)))) {
  2962.             addType ($key1, "Hospital");
  2963.             #addType ($key5, "Hospital-Ind");
  2964.             $typeadded = 1;
  2965.             }
  2966.         }
  2967.  
  2968.         #   #Generating too many false positives.
  2969.         #   #Need a better common word dictionary to enable this.
  2970.         #if ($typeadded == 0) {
  2971.         #    if ((length($11) > 1) && (!(isCommonest ($11)) || (isType ($key3, "Hospital", 1)))) {
  2972.         #   #addType ($key3, "Hospital");
  2973.                 #       #addType ($key5, "Hospital-Ind");
  2974.         #   #if ((length($14) > 1) && (!(isCommonest ($14)) || (isType ($key4, "Hospital", 1)))) {
  2975.         #   #      # addType ($key4, "Hospital");
  2976.         #   #       #addType ($key5, "Hospital-Ind");          
  2977.         #   #   }
  2978.         #    }
  2979.         #} #end if (typeadded == 0)
  2980.         }
  2981.     }
  2982.     }
  2983. }
  2984. # End of function location2()
  2985.  
  2986.  
  2987. #***********************************************************************************************************
  2988. #***********************************************************************************************************
  2989. #***********************************************************************************************************
  2990. # Function: problem()
  2991. # Checks for names preceded by "problem", pattern found in discharge summaries
  2992.  
  2993. sub problem {
  2994.     $text = $_[0];  
  2995.  
  2996.     $k = "problem";
  2997.     $l = ":";
  2998.     while ($text =~ /\b(([A-Za-z\-]+) + *($k))\b/ig) {
  2999.     if ((!isCommon($2)) || (isNameAmbig($2))) {
  3000.         $st = length($`);
  3001.         $end = $st + length($2);
  3002.         $key = "$st-$end";    
  3003.         addType ($key, "Last Name");
  3004.     }
  3005.     }
  3006. }
  3007. # End of function problem()
  3008. #***********************************************************************************************************
  3009. #***********************************************************************************************************
  3010. #***********************************************************************************************************
  3011. # Function: mrn()
  3012. # Checks for medical record numbers, i.e. numbers preceded by "mrn" or "medical record number"
  3013.  
  3014. sub mrn {
  3015.     $text = $_[0];      
  3016.     if ($unitfilter =~ /y/) {
  3017.    
  3018.     while ($text =~ /\b(mrn( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3019.         my $unit = $1;
  3020.         my $st = length($`);
  3021.         my $key = "$st-".((length $unit) + $st);
  3022.         addType ($key, "Medical Record Number");
  3023.     }
  3024.    
  3025.     @numbers = ("number", "no", "num", "");
  3026.    
  3027.     foreach $i (@numbers) {
  3028.         while ($text =~ /\b(medical record( *)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3029.         my $unit = $1;
  3030.         my $st = length($`);
  3031.         my $key = "$st-".((length $unit) + $st);
  3032.         addType ($key, "Medical Record Number");
  3033.         }
  3034.     }
  3035.     }
  3036. }
  3037. # End of function mrn()
  3038. #***********************************************************************************************************
  3039. #***********************************************************************************************************
  3040. #***********************************************************************************************************
  3041. # Function: unit()
  3042. # Checks for unit numbers in discharge summaries
  3043. # Removes the entire field of "unit <number>" or unit <number>/<number>
  3044. # If "unit" is not followed by a number, does not remove it
  3045.  
  3046. sub unit {
  3047.     $text = $_[0];      
  3048.     if ($unitfilter =~ /y/) {
  3049.    
  3050.     @numbers = ("number", "no", "num", "");
  3051.    
  3052.     foreach $i (@numbers) {
  3053.         while ($text =~ /\b(unit( ?)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3054.         my $unit = $1;
  3055.         my $st = length($`);
  3056.         my $key = "$st-".((length $unit) + $st);
  3057.         addType ($key, "Unit Number");
  3058.         }
  3059.     }
  3060.     }
  3061. }
  3062. # End of function unit()
  3063. #***********************************************************************************************************
  3064. #***********************************************************************************************************
  3065. #***********************************************************************************************************
  3066. # Function: name3()
  3067. # Checks every lastnameprefix
  3068. # If the following word is either a name or not commonest, identifies it as lastname
  3069.  
  3070. sub name3 {
  3071.     $text = $_[0];  
  3072.     if ($namefilter =~ /y/) {
  3073.  
  3074.     foreach $line (@prefixes_unambig) {
  3075.     while ($text =~ /\b(($line)([\s\'\-])+ *)([A-Za-z]+)\b/ig) {
  3076.         my $pre = $2;
  3077.         my $prestart = length($`);
  3078.         my $preend = $prestart+(length $pre);
  3079.         my $prekey = "$prestart-$preend";
  3080.         my $lname = $4;
  3081.         my $lstart = $prestart+length($1);
  3082.         my $lend = $lstart+length($4);
  3083.         my $lnamekey = "$lstart-$lend";
  3084.         if ((!(isCommonest ($lname))) || (isType ($lnamekey, "Name", 1))) {
  3085.         addType ($prekey, "Name Prefix (Prefixes)");
  3086.         addType ($lnamekey, "Last Name (Prefixes)");
  3087.         }
  3088.     }
  3089.     }
  3090.        #****************************************************************************************
  3091.    
  3092.     @specific_titles = ("MR", "MISTER", "MS");
  3093.     foreach $i (@specific_titles) {
  3094.        
  3095.         while ($text =~ /\b($i\.( *))([A-Za-z\'\-]+)\b/ig) {
  3096.         $potential_name = $3;
  3097.         $start = length($`)+length($1);
  3098.         $end = $start + length($potential_name);
  3099.         $key = "$start-$end";
  3100.         if (isType($key, "Name", 1)) {
  3101.             addType ($key, "Name13 (STitle)");
  3102.         }
  3103.        
  3104.         elsif (!(isCommon($potential_name))) {
  3105.             addType ($key, "Name14 (STitle)");
  3106.         }
  3107.         }
  3108.     }
  3109.         #****************************************************************************************
  3110.     # Goes through word by word looking for unspotted names
  3111.     # All words have already been marked as potential names (where appropriate) by the previous routines
  3112.     # Looks for last names following titles (Dr. Smith)
  3113.     # Also should pick up "Dr. S"
  3114.     # Looks for the last name prefixes
  3115.    
  3116.     foreach $i (@strict_titles) {      
  3117.       L:
  3118.  
  3119.         while ($text =~ /\b($i\b\.? *)([A-Za-z\'\-]+)\b/ig) { # added ' -
  3120.  
  3121.         my $tt = $1;
  3122.             my $word = $2;
  3123.            
  3124.         my $st = length($`) + length($1);
  3125.         my $fi = $st + length($2);
  3126.         my $key = "$st-$fi";
  3127.         if (exists $prefixes{uc($word)}) {
  3128.    
  3129.             addType ($key, "Last Name (STitle)");
  3130.             my $start = $fi;
  3131.             my $nextWord = substr $text, $start;
  3132.  
  3133.             if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
  3134.            
  3135.             my $token = $4;
  3136.             my $lstart = $start+length($1)+length($2)+length($3);
  3137.             my $lend = $lstart+length($4);
  3138.             my $fi += length($1)+length($4);
  3139.             if (exists $prefixes{uc($token)}){
  3140.                 addType ("$start-$fi", "Last Name (STitle)");
  3141.                 my $start = $fi;
  3142.                 my $nextWord = $nextWord;
  3143.                 my $token = $token;
  3144.                 if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
  3145.                 $word = $3;
  3146.                 $key = "$fi-".($fi + length($2) + length($3));
  3147.                 } else {
  3148.                 next L;
  3149.                 }
  3150.             }
  3151.             else {
  3152.                 # Has already identified one prefix, should now check to see if next word is name or is not commonest  
  3153.                 $word = $token;
  3154.                 $key = "$lstart-$lend";
  3155.            
  3156.                 if (isProbablyName($key, $word)){              
  3157.                 addType ($key, "Last Name (STitle)");
  3158.                 }
  3159.             }
  3160.             } else {
  3161.             next L;
  3162.             }
  3163.         } else { #else $word is not a prefix
  3164.  
  3165.             if ($word =~ /\'([A-Za-z]+)/) {
  3166.             $word = $1;
  3167.             $st--;
  3168.             $key = $st."-".$fi;
  3169.             }
  3170.             if ($word =~ /([A-Za-z]+)\'/) {
  3171.             $word = $1;
  3172.             $key =  $st."-".($fi-1);   
  3173.             }
  3174.         }
  3175.        
  3176.  
  3177.         ###########################################################
  3178.         if (exists $phi{$key}) {
  3179.        
  3180.             addType($key, "Last Name (STitle)");
  3181.             if ((isType($key, "First Name", 1))) {
  3182.        
  3183.                addType($key, "First Name (STitle)");
  3184.             }
  3185.         } else {
  3186.             if (isProbablyName($key, $word)){
  3187.             addType ($key, "Last Name (STitle)");
  3188.        
  3189.             }
  3190.             else {
  3191.             addType ($key, "Last Name (STitle)");
  3192.  
  3193.             }
  3194.         } #end else (!exists $phi{$key})
  3195.  
  3196.  
  3197.  
  3198.         ################################
  3199.                 #added to catch Dr. <firstname> <lastname>
  3200.         #check the word after $word
  3201.         my ($tmpStart,$tmpEnd) = split '-', $key;
  3202.         my $following = substr $text, $tmpEnd;
  3203.  
  3204.         if($following =~/\A(\s+)([A-Za-z\-\']{2,})\b/g){
  3205.             my $fword = $2;
  3206.             my $newst = $tmpEnd + (length ($1));
  3207.             my $nextKey = "$newst-".($newst + length($2));
  3208.    
  3209.             if (isProbablyName($nextKey, $fword)){
  3210.             addType($nextKey, "Name (STitle)");
  3211.             }
  3212.         }
  3213.         ##########################################
  3214.  
  3215.         } #end while text matches the pattern
  3216.     }  #end for each $i strict_titles
  3217.  
  3218.  
  3219.  
  3220.        #****************************************************************************************
  3221.     # Goes through word by word looking for unspotted names
  3222.     # All words have already been marked as potential names (where appropriate) by the previous routines
  3223.     # Looks for last names following titles (Dr. Smith)
  3224.     # Also should pick up "Dr. S"
  3225.     # Looks for the last name prefixes
  3226.    
  3227.        #mark as ambiguous if common words
  3228.     foreach $i (@titles) {     
  3229.       L:
  3230.         while ($text =~ /\b($i\b\.? ?)([A-Za-z]+) *([A-Za-z]+)?\b/ig) {
  3231.    
  3232.         my $tt = $1;
  3233.         my $word = $2;     
  3234.         my $st = length($`) + length($1);
  3235.         my $fi = $st + length($2);
  3236.         my $key = "$st-$fi";
  3237.  
  3238.         my $wordAfter = $3; ##added to catch last names
  3239.         my $stAfter = $fi + 1;
  3240.         my $fiAfter = $stAfter + length ($3);
  3241.             my $keyAfter = "$stAfter-$fiAfter";
  3242.        
  3243.  
  3244.         if (exists $prefixes{uc($word)}) { 
  3245.            
  3246.             addType ($key, "Last Name (Titles)");
  3247.             my $start = $fi;
  3248.             my $nextWord = substr $text, $start;
  3249.             if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
  3250.             my $token = $4;
  3251.             my $lstart = $start+length($1)+length($2)+length($3);
  3252.             my $lend = $lstart+length($4);
  3253.             my $fi += length($1)+length($4);
  3254.             if (exists $prefixes{uc($token)}){
  3255.                 addType ("$start-$fi", "Last Name (Titles)");
  3256.                 my $start = $fi;
  3257.                 my $nextWord = $nextWord;
  3258.                 my $token = $token;
  3259.                 if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
  3260.                 $word = $3;
  3261.                 $key = "$fi-".($fi + length($2) + length($3));
  3262.                 } else {
  3263.                 next L;
  3264.                 }
  3265.             }
  3266.             else {
  3267.                 # Has already identified one prefix, should now check to see if next word is name or is not commonest  
  3268.                 $word = $token;
  3269.                 $key = "$lstart-$lend";
  3270.  
  3271.                 if (isProbablyName($key, $word) && length($word) > 1 ){
  3272.  
  3273.                 addType ($key, "Last Name (Titles)");
  3274.                 }
  3275.             }
  3276.             } else {
  3277.             next L;
  3278.             }
  3279.         } else {
  3280.             if ($word =~ /\'([A-Za-z]+)/) {
  3281.             $word = $1;
  3282.             $st--;
  3283.             $key = $st."-".$fi;
  3284.             }
  3285.             if ($word =~ /([A-Za-z]+)\'/) {
  3286.             $word = $1;
  3287.             $key =  $st."-".($fi-1);
  3288.             }
  3289.         }
  3290.        
  3291.             if (length ($wordAfter) > 1) {
  3292.  
  3293.             my $tmp = isCommon($wordAfter);
  3294.             if (!isCommonest($wordAfter)  ||  (isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)"))  ||
  3295.                 (isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ) {
  3296.            
  3297.             addType($keyAfter, "Last Name (Titles)");
  3298.             addType($key, "First Name (Titles)");
  3299.             }  
  3300.         }
  3301.  
  3302.         elsif (exists $phi{$key}) {
  3303.             if ((isType($key, "Name", 1))) {
  3304.             addType($key, "Last Name (Titles)");
  3305.             }
  3306.         } else {
  3307.             if ( length($word)  > 1 && !(isCommon($word)) ) {
  3308.             addType ($key, "Last Name (Titles)");
  3309.             }
  3310.             else {
  3311.             if (($word =~ /\b[A-Z][a-z]+\b/) || ($tt =~ /$i\. /)) {
  3312.              
  3313.                 addType ($key, "Last Name (Titles  ambig)");
  3314.                
  3315.             } else {
  3316.                 addType ($key, "Last Name (Titles ambig)");
  3317.             }
  3318.             }
  3319.         }
  3320.         }
  3321.     }
  3322.  
  3323.    
  3324.  
  3325.    
  3326.        #****************************************************************************************
  3327.     # Implements simple rules for finding names that aren't in the list or are ambiguous...
  3328.     # first name + last name -> first name + last name (ambig), first name +
  3329.         #not-on-any-safe-word-list, else save the second word and see whether it appears
  3330.         #in the patient text not associated with the first name or any other name indicator
  3331.     # Also first + initial + last name
  3332.     # Finds each prefix, labels the next not uncommonest word    
  3333.     # Finds all first names (unambig), look at following word -> make last name unambigs
  3334.    
  3335.     foreach $k (keys %phi) {
  3336.        if (((isType($k, "Male First Name", 1)) || (isType($k, "Female First Name", 1))) && ((isType($k, "(un)", 1)) || (isType($k, "pop", 1)))) {      
  3337.    
  3338.         my ($start, $end) = split '-', $k;
  3339.         my $following = substr $text, $end;
  3340.        
  3341.         # No middle initial
  3342.            
  3343.         if ($following =~ /\A( +)([A-Za-z\']{2,})\b/g) { #added to catch firstname s.a. O'Connell
  3344.             my $fword = $2;
  3345.             my $st = $end + (length $1);
  3346.             my $nextKey = "$st-".($st + length($2));
  3347.  
  3348.             if (exists $phi{$nextKey}) {
  3349.  
  3350.             if ((isType($nextKey, "Name", 1) == 1) && isProbablyName($nextKey, $fword)) {    
  3351.                 addType($nextKey, "Last Name (NamePattern1)");
  3352.                 addType($k,"First Name4 (NamePattern1)"); # make it unambig
  3353.             }
  3354.             }
  3355.             else {
  3356.             if (isProbablyName($nextKey, $fword)){
  3357.              
  3358.                 addType ($nextKey, "Last Name (NamePattern1)");
  3359.                 addType($k,"First Name5 (NamePattern1)"); }}}# make it unambig
  3360.                
  3361.                 # Looks for that middle initial
  3362.                 if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z\-][A-Za-z\-]+)\b/g) {
  3363.                     my $initial = $2;
  3364.                     my $lastN = $4;
  3365.                     my $st = $end + (length $1);
  3366.                     my $iniKey = "$st-".($st+1);
  3367.                     my $stn = $st + (length $2) + (length $3);
  3368.                     my $nextKey = "$stn-".($stn + (length $4));
  3369.                     if (exists $phi{$nextKey}) {
  3370.                     if ((isType($nextKey, "Last Name", 0) == 0)) {
  3371.                         addType($nextKey, "Last Name (NamePattern1)");
  3372.                         addType($iniKey, "Initial (NamePattern1)");
  3373.                         addType($k,"First Name11 (Name Pattern1)");
  3374.                     }
  3375.                     }
  3376.                     else {         
  3377.                     if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z][A-Za-z]+)\b\s*\Z/g){
  3378.                         addType ($nextKey, "Last Name (NamePattern1)");
  3379.                         addType($iniKey, "Initial (NamePattern1)");
  3380.                         addType($k,"First Name6 (NamePattern1)");
  3381.                     }
  3382.                     elsif (!(isCommonest($lastN))) {
  3383.                         addType ($nextKey, "Last Name (NamePattern1)");
  3384.                         addType($iniKey, "Initial (NamePattern1)");
  3385.                         addType($k,"First Name7 (NamePattern1)");
  3386.                     }
  3387.                     }
  3388.                 }
  3389.         }
  3390.     }
  3391.    
  3392.     # Finds all last names (unambig), looks at proceeding word -> make first names unambigs
  3393.     foreach $k (keys %phi) {
  3394.         if (isType($k, "Last Name", 1) && (isType($k, "(un)", 1))) {
  3395.        
  3396.         my ($start, $end) = split '-', $k;
  3397.         my $preceding = substr $text, 0, $start;
  3398.        
  3399.         if ($preceding =~ /\b([A-Za-z]+)( *)\Z/g) {  
  3400.             my $pword = $1;
  3401.             my $st = length($`);
  3402.             my $prevKey = "$st-".($st + (length $1));
  3403.             if (exists $phi{$prevKey}) {
  3404.             #my $result = isNameIndicator($pword);
  3405.             #print "pword is $pword, isNameIndicator returns $result";
  3406.             #if ((isType($prevKey, "First Name", 1)) && (!isType($prevKey, "Name Indicator", 0))) {
  3407.             if ((isType($prevKey, "First Name", 1)) && (!isNameIndicator($pword)) ) {
  3408.                 addType($prevKey, "First Name8 (NamePattern2)");
  3409.             } # Else it's been positively identified as something that is not a name so leave it
  3410.             }
  3411.             else {
  3412.             # Sees whether it appears in the common words...
  3413.             if (!(isCommon($pword))) {
  3414.                
  3415.                 addType ($prevKey, "First Name9 (NamePattern2)");
  3416.             }
  3417.             }
  3418.         }
  3419.         }
  3420.     }  
  3421.         #****************************************************************************************
  3422.     # Looks for compound last names -> last name + last name (ambig), last name + not-on-any-safe-word-list, last name "-" another word
  3423.     # Last name with an ambiguous name preceding it has already labeled the preceding thing a first name; no huge loss if it's just a weird first part of a compound last name
  3424.    
  3425.     foreach $k (keys %phi) {
  3426.         if (isType($k, "Last Name", 0)) {
  3427.        
  3428.         my ($start, $end) = split '-', $k;
  3429.         my $following = substr $text, $end;
  3430.        
  3431.         if ($following =~ /\A-([A-Za-z]+)\b/g) { #hypen-ated last name
  3432.             my $newend = $end+length($1)+1;
  3433.             my $nextKey = "$end-$newend";
  3434.             addType ($nextKey, "Last Name (NamePattern3)");
  3435.         }
  3436.         if ($following =~ /\A( *)([A-Za-z]+)\b/g) {
  3437.             my $fword = $2;
  3438.         my $st = $end + (length $1);
  3439.             my $nextKey = "$st-".($st + length($2));
  3440.             if (exists $phi{$nextKey}) {
  3441.             if (!(isType($nextKey, "ambig", 1))) {
  3442.             if (isType($nextKey, "Last Name", 0) == 0) {
  3443.                 addType($nextKey, "Last Name (NamePattern3)");
  3444.             }
  3445.             } # Else it's been positively identified as something that is not a name so leaves it
  3446.         }
  3447.             else {
  3448.             # Sees whether it appears in the common words
  3449.             if (!(isCommon($fword))) {
  3450.                 addType ($nextKey, "Last Name (NamePattern3)");
  3451.             }
  3452.             }
  3453.         }
  3454.         }
  3455.     }
  3456.        #****************************************************************************************
  3457.     # Looks for initials
  3458.     # Many last names get classified as first names and other PHI -> looks for initial before all unambig names and locations
  3459.    
  3460.       INI:
  3461.     foreach $k (keys %phi) {
  3462.         if (  ((!(isType($k, "ambig", 1))) ||    isType($k, "(un)",1))   && (isType($k, "Name", 1))) {
  3463.         #if (isType($k, "Name", 1)) {  
  3464.         my ($start, $end) = split '-', $k;
  3465.         my $preceding = substr $text, 0, $start;
  3466.        
  3467.         # Checks for two initials
  3468.    
  3469.         if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {         
  3470.             my $key = (length ($`))."-".(length($`) + (length $1));
  3471.             addType ($key, "Initials (NamePattern4)");
  3472.             if (!(isType($k, "Last Name", 0))) {
  3473.             }
  3474.         }
  3475.        
  3476.         # Checks if preceding word is an initial
  3477.         elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
  3478.        
  3479.             my $tmp = substr $text, $start, $end - $start +1;
  3480.        
  3481.             my $init = $1;
  3482.             my $key = (length ($`))."-".(length($`) + (length $1));
  3483.             if (lc($init) eq "s") {
  3484.             if ((substr $preceding, (length($`) - 1), 1) eq "'") {  #for 's
  3485. #           next INI;
  3486.             }
  3487.             }
  3488.             if ((lc($init) eq "a") || (lc($init) eq "i")) {
  3489.             if (isCommon(substr $text, $start, ($end - $start))) {
  3490. #           next INI;
  3491.             }
  3492.             }
  3493.        
  3494.             if (length($init)==2 || length($init)==1) {
  3495.             addType ($key, "Initials (NamePattern4)");
  3496.             }
  3497.             if (!(isType($k, "Last Name", 0))) {
  3498.             addType ($k, "Last Name (NamePattern4)");
  3499.             }
  3500.         }      
  3501.         }
  3502.     }
  3503.        #****************************************************************************************
  3504.        # Looks for initials; similar to previous code block
  3505.    
  3506.     foreach $k (keys %phi) {
  3507.         if (isType($k, "Last Name", 1) && (!isType ($k, "ambig", 1))) {
  3508.        
  3509.         my ($start, $end) = split '-', $k;
  3510.         my $preceding = substr $text, 0, $start;
  3511.         #two initials (why would they write that?  Why not?) {
  3512.         if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {
  3513.             my $key = (length ($`))."-".(length($`) + (length $1));
  3514.             addType ($key, "Initials (NamePattern5)");
  3515.             if (!(isType($k, "Last Name", 0))) {
  3516.             addType ($k, "Last Name (NamePattern5)");
  3517.             }
  3518.         }
  3519.        
  3520.         elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
  3521.             my $init = $1;
  3522.             my $key = (length ($`))."-".(length($`) + (length $1));
  3523.             if (lc($init) eq "s") {
  3524.             if ((substr $preceding, (length($`) - 1), 1) eq "'") {  #for 's
  3525.                 #next INI;
  3526.             }
  3527.             }
  3528.             if ((lc($init) eq "a") || (lc($init) eq "i")) {
  3529.             if (isCommon(substr $text, $start, ($end - $start))) {
  3530. #           next INI;
  3531.             }
  3532.             }
  3533.    
  3534.         }      
  3535.         }
  3536.     }
  3537.         #****************************************************************************************
  3538.  
  3539.     # Searches for patterns "name and/or," comma list names
  3540.     foreach $k (keys %phi) {
  3541.        
  3542.         if ((isType($k, "Last Name", 0)) || (isType($k, "Male First Name", 0)) || (isType($k, "Female First Name", 0))) {
  3543.         my ($start, $end) = split '-', $k;
  3544.         my $following = substr $text, $end;
  3545.        
  3546.         if ((length $following) == 0) { next; }
  3547.        
  3548.         # First just looks for "and"/"or"
  3549.         if ($following =~ /\A and ([A-Za-z]+)\p{IsPunct}/ig) {
  3550.             my $word = $text1;
  3551.             my $key = ($end + 5)."-".($end + 5 + length($1));
  3552.             if ((isType($key, "Name", 1))  || (!(isCommon($word)))) {
  3553.             addType ($key, "Last Name (NamePattern6)");
  3554.             }
  3555.         }
  3556.         elsif ($following =~ /\A and ([A-Za-z]+)\b/ig) {
  3557.             my $word = $1;
  3558.             my $key = ($end + 5)."-".($end + 5 + length($1));
  3559.             if (!(isCommon($word))) {
  3560.             addType ($key, "Last Name (NamePattern6)");
  3561.             }
  3562.         }
  3563.         elsif ($following =~ /\A or ([A-Za-z]+)\b/ig) {
  3564.             my $word = $1;
  3565.             my $key = ($end + 4)."-".($end + 4 + length($1));
  3566.             if (!(isCommon($word))) {
  3567.             addType ($key, "Last Name (NamePattern6)");
  3568.             }
  3569.         }
  3570.         elsif ($following =~ /\A( ?[\&\+] ?)([A-Za-z]+)\b/ig) {
  3571.             my $word = $2;
  3572.             my $st = $end + (length $1);
  3573.             my $key = "$st-".($st + length($2));
  3574.             if (!(isCommon($word))) {
  3575.             addType ($key, "Last Name (NamePattern6)");
  3576.             }
  3577.         }
  3578.         elsif ($following =~ /\A, ([A-Za-z]+)(,? and )([A-Za-z]+)\b/ig) {
  3579.             # Searches up to 3 names in a list
  3580.             my $name1 = $1;
  3581.             my $name2 = $3;
  3582.             my $st2 = $end + 2 + (length $name1) + length($2);
  3583.             my $key1 = ($end + 2)."-".($end + 2 + (length $name1));
  3584.             my $key2 = "$st2-".($st2 + length($name2));
  3585.             if (!(isCommon($name1))) {
  3586.             addType ($key1, "Last Name (NamePattern6)");
  3587.             }
  3588.             if (!(isCommon($name2))) {
  3589.             addType ($key2, "Last Name (NamePattern6)");
  3590.             }
  3591.         }
  3592.         }
  3593.     }
  3594. }
  3595. }
  3596. # End of function name3()
  3597.  
  3598.  
  3599.  
  3600. #***********************************************************************************************************
  3601. #***********************************************************************************************************
  3602. #***********************************************************************************************************
  3603. # Function: commonHoliday()
  3604. # Searches for some common holiday names that can identify the date
  3605. # Extension: Add new holiday names to this regex
  3606.  
  3607. sub commonHoliday() {
  3608.     $text = $_[0];
  3609.     if ($datefilter =~ /y/) {
  3610.    
  3611.     while ($text =~ /\b(christmas|thanksgiving|easter|hannukah|rosh hashanah|ramadan)\b/ig) {
  3612.         $holidayname = $1;
  3613.         $start = length($`);
  3614.         $end = $start + length($holidayname);
  3615.         $key = $start."-".$end;
  3616.         addType ($key, "Holiday");
  3617.     }
  3618.     }
  3619. }
  3620. # End of function commonHoliday()
  3621. #***********************************************************************************************************
  3622. #***********************************************************************************************************
  3623. #***********************************************************************************************************    
  3624. # Function: knownPatientName()
  3625. # Searches for PID-specific patient names known a priori, i.e. the patient first and last names for this particular PID
  3626. # Indiscriminately removes these PHI from anywhere in the text
  3627. # Extension: To include new PID-patient name mappings, extend the file $patient_file
  3628.  
  3629. sub knownPatientName {
  3630.     $text = $_[0];
  3631.     if ($namefilter =~ /y/) {
  3632.  
  3633.     foreach $i (@known_first_name) {
  3634.         while ($text =~ /\b($i)\b/ig) {
  3635.         my $start = length($`);
  3636.         my $end = $start + length($1);
  3637.         my $key = "$start-$end";
  3638.         addType ($key, "Known patient firstname");
  3639.         }
  3640.     }
  3641.    
  3642.     foreach $j (@known_last_name) {
  3643.         while ($text =~ /\b($j)\b/ig) {
  3644.         my $start = length($`);
  3645.         my $end = $start + length($1);
  3646.         my $key = "$start-$end";
  3647.         addType ($key, "Known patient lastname");
  3648.         }
  3649.     }      
  3650.     }
  3651. }
  3652. # End of function knownPatientName()
  3653.  
  3654.  
  3655.  
  3656.  
  3657. #***********************************************************************************************************
  3658. #***********************************************************************************************************
  3659. #***********************************************************************************************************
  3660. #Returns true if the number passed in matches a defined us area code
  3661. sub isCommonAreaCode  {
  3662.     $areacode = $_[0];
  3663.     return ($us_area_code{$areacode});
  3664.  
  3665. }
  3666.  
  3667.  
  3668.  
  3669. #***********************************************************************************************************
  3670. #***********************************************************************************************************
  3671. #***********************************************************************************************************
  3672. # Function: telephone()
  3673. # Searches for telephone numbers, with or without area codes, with or without extensions
  3674. # Extension: To add new formats, add a new rule
  3675.  
  3676. sub telephone() {
  3677.     $text = $_[0];
  3678.     if ($telfilter =~ /y/) {
  3679.         while ($text =~ /\b\(?\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\)?\b/g) { #added back \b to avoid high fp
  3680.     #while ($text =~ /\b\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\b/g) { #added back \b to avoid high fp
  3681.         my $start = length($`);
  3682.         my $end = $start + length($&);
  3683.  
  3684.         my $nextSeg = substr $text, $end, 20;
  3685.  
  3686.         #catch extensions
  3687.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {       
  3688.         $end += length($1);
  3689.         } elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3690.         $end += length($1);
  3691.         } elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
  3692.         $end += length($1);
  3693.         }
  3694.  
  3695.        
  3696.         my $key = "$start-$end";
  3697.  
  3698.         #now checks the context
  3699.         $context_len = 20;
  3700.         my $start_pos = $start - $context_len;
  3701.         if ($start_pos < 0) {
  3702.         $start_pos = 0;
  3703.         }
  3704.         my $len = $context_len;
  3705.         if  (length ($text) < ($end + $context_len)){
  3706.         $len = length($text) - $end;
  3707.         }
  3708.         my $textBefore = substr $text, $start_pos, ($start - $start_pos);
  3709.         my $textAfter = substr $text, $end, $len;
  3710.         if (isProbablyPhone($textBefore)){
  3711.         addType ($key, "Telephone/Fax (1)");
  3712.         }
  3713.     }
  3714.    
  3715.    
  3716.        #pattern such as ###-###-####
  3717.        #let's not worry about patterns such as ###-HART and ###-LUNG for now
  3718.     #while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{3}\s?[ \-\.\/\=]*\s?\d{4}\b/g) {
  3719.         while ($text =~ /\d{3}\s*[\)\.\/\-\, ]*\s*\d{3}\s*[ \-\.\/]*\s*\d{4}/g) {
  3720.  
  3721.        
  3722.     #if (isCommonAreaCode($1)){
  3723.         my $st = length($`);
  3724.         my $end = $st + length($&);
  3725.  
  3726.         my $nextSeg = substr $text, $end, 20;
  3727.        
  3728.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3729.        
  3730.         $end += length($1);
  3731.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3732.        
  3733.         $end += length($1);
  3734.         }  elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3735.        
  3736.         $end += length($1);
  3737.         } elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
  3738.        
  3739.         $end += length($1);
  3740.         }
  3741.  
  3742.         my $key = "$st-$end";      
  3743.         addType ($key, "Telephone/Fax (2)");
  3744.  
  3745.     }
  3746.  
  3747.     #allow arbitrary line break (almost) anywhere in the phone numbers (except first 3 digit to reduce fp)
  3748.     #only scrubbs the pattern, if it's a known area code
  3749.     while ($text =~ /(\d\d\d)\s*[\)\.\/\-\, ]*\s*\d\s*\d\s*\d\s*[ \-\.\/]*\s*\d\s*\d\s*\d\s*\d/g) {
  3750.        
  3751.         if (isCommonAreaCode($1)){
  3752.         my $st = length($`);
  3753.         my $end = $st + length($&);
  3754.  
  3755.        
  3756.         my $nextSeg = substr $text, $end, 20;
  3757.        
  3758.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3759.        
  3760.             $end += length($1);
  3761.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3762.        
  3763.             $end += length($1);
  3764.         }  elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3765.        
  3766.             $end += length($1);
  3767.         } elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
  3768.        
  3769.             $end += length($1);
  3770.         }
  3771.  
  3772.         my $key = "$st-$end";      
  3773.         addType ($key, "Telephone/Fax (2)");
  3774.         }
  3775.     }
  3776.  
  3777.     #check phone pattern that has 1 extra or 1 less digit at end
  3778.     #in case pattern such as xxx-xxx-xxx?, check if the first 3 digits match with
  3779.         #common area code
  3780.  
  3781.     while (($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{3}\b/g)){
  3782.        
  3783.         #match it with common local area code
  3784.         if (isCommonAreaCode($1)){
  3785.         my $st = length($`);
  3786.         my $end = $st + length($&);    
  3787.        
  3788.         my $nextSeg = substr $text, $end, 20;
  3789.        
  3790.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3791.        
  3792.             $end += length($1);
  3793.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3794.        
  3795.             $end += length($1);
  3796.         }  elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3797.        
  3798.             $end += length($1);
  3799.         } elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
  3800.        
  3801.             $end += length($1);
  3802.         }
  3803.        
  3804.         my $key = "$st-$end";      
  3805.         addType ($key, "Telephone/Fax (3)");
  3806.         } #end if the first 3 digits are area codes
  3807.     }  #end while
  3808.  
  3809.     #check phone pattern that has 1 extra  digit at end
  3810.     #in case pattern such as xxx-xxx-xxxxx, check if the first 3 digits match with
  3811.         #common area code
  3812.     while (
  3813.            ($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{5}\b/g)) {
  3814.  
  3815.        
  3816.         #match it with common local area code
  3817.         if (isCommonAreaCode($1)){
  3818.         my $st = length($`);
  3819.         my $end = $st + length($&);    
  3820.        
  3821.         my $nextSeg = substr $text, $end, 20;
  3822.        
  3823.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3824.        
  3825.             $end += length($1);
  3826.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3827.        
  3828.             $end += length($1);
  3829.         }  elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3830.            
  3831.             $end += length($1);
  3832.         } elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
  3833.            
  3834.             $end += length($1);
  3835.         }
  3836.    
  3837.         my $key = "$st-$end";      
  3838.         addType ($key, "Telephone/Fax (4)");
  3839.         } #end if the first 3 digits are area codes
  3840.     }  #end while
  3841.  
  3842.  
  3843.      #in case typed in pattern such as ###-####-###
  3844.        while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{4}\s?[ \-\.\/\=]*\s?\d{3}\b/g) {
  3845.         my $st = length($`);
  3846.         my $end = $st + length($&);
  3847.         my $nextSeg = substr $text, $end, 20;
  3848.  
  3849.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3850.        
  3851.         $end += length($1);
  3852.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3853.        
  3854.         $end += length($1);
  3855.         }
  3856.         elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3857.        
  3858.         $end += length($1);
  3859.         } elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
  3860.        
  3861.         $end += length($1);
  3862.         }
  3863.  
  3864.  
  3865.         my $key = "$st-$end";      
  3866.        
  3867.         addType ($key, "Telephone/Fax (5)");
  3868.     }
  3869.     }    
  3870. }
  3871. # End of function telephone()
  3872.  
  3873. #***********************************************************************************************************
  3874. #***********************************************************************************************************
  3875. #***********************************************************************************************************
  3876. # Add new function here >>>>>>>>>>
  3877. # Follow format shown here if necessary
  3878. # sub functionName {
  3879. #    $text = $_[0];
  3880. #    while ($text =~ /(<search pattern>)/) {
  3881. #       $startIndex = length($');
  3882. #       $endIndex = $startIndex + length($1);
  3883. #       $phiKey = $startIndex."-".$endIndex;
  3884. #       addType ($phiKey, "Name of PHI Category");
  3885. #   }
  3886. # }
  3887.  
  3888. #***********************************************************************************************************
  3889. #***********************************************************************************************************
  3890. #***********************************************************************************************************
  3891. # Function: pruneKeys()
  3892. # Arguments: hash %keys, string $text
  3893. # Returns: array @keylst
  3894. # Called by: findPHI()
  3895. # Description: Extracts PHI locations from hash %keys, compares each loc with previous loc to prevent overlaps
  3896. # Returns an array of pruned PHI keys
  3897.  
  3898. sub pruneKeys {
  3899.     my ($keyh,$text) = @_;
  3900.     my $lk = "";
  3901.     my ($ls,$le)=(0,0);
  3902.     my ($cs,$ce)=(0,0);
  3903.     my @keylst = ();
  3904.     foreach $k (sort numerically keys %{$keyh}) {
  3905.     #print "prunkey, key = $k  values = \n";
  3906.     $ls = $cs;
  3907.     $le = $ce;
  3908.     ($cs,$ce)= split ('-',$k);
  3909.     if ($cs > $le){push (@keylst,$lk);} # proper relation
  3910.     elsif($cs > $ls){
  3911.         if($ce > $le) {
  3912.         my $stgl = substr($text,$ls,$le-$ls);          
  3913.         my $stgc = substr($text,$cs,$ce-$cs);  
  3914.         $cs = $ls; $k = "$ls-$ce"; $$keyh{$k} = $$keyh{$lk}} # include both transfer types
  3915.         else{$cs = $ls; $ce = $le; $k = $lk;}} # use previous (current in previous)
  3916.     elsif($le > $ce){$cs = $ls; $ce = $le; $k = $lk;} # use previous (current in previous)
  3917.     $lk = $k;}
  3918.     #print "pushing $lk to keylst\n";
  3919.     push (@keylst,$lk); # last one
  3920.     return (@keylst)
  3921.     }    
  3922. # End of pruneKeys()
  3923. #***********************************************************************************************************
  3924. #***********************************************************************************************************
  3925. #***********************************************************************************************************
  3926. # Function: addType()
  3927. # Arguments: string $key ("start-end" of PHI loc), string $type (PHI type)
  3928. # Returns: None
  3929. # Called by: findPHI()
  3930. # Description: Pushes PHI key and type into the hash %phi
  3931. # Keeps track of all possible PHI types for each PHI key
  3932.  
  3933. sub addType {
  3934.  
  3935.     my ($key,$type) = @_;
  3936.     ($st,$end) = split '-',$key;
  3937.     if ($end > $end{$st}) {
  3938.     $end{$st} = $end;
  3939.     }
  3940.     #print "in addType, key is $key\n";
  3941.     push @{$phi{$key}}, $type;
  3942.     $t = (@{$phiT{$key}});
  3943.     $start = $st - 1 - 64;
  3944.     $ending = $end - 1 - 64;
  3945.     return;
  3946. }
  3947. # End of addType()
  3948. #***********************************************************************************************************
  3949. #***********************************************************************************************************
  3950. #***********************************************************************************************************
  3951. # Function: isType()
  3952. # Arguments: string $key ("start-end" of PHI loc), string $type (PHI type), int $pattern (1 if PHI type can be matched with '=~'; 0 otherwise)
  3953. # Returns: 1 if PHI $key is of PHI type $type; 0 otherwise
  3954. # Called by: findPHI()
  3955. # Description: Given a PHI loc, checks its PHI type in the existing PHI hash. If the type in the hash is equal to the given type, then returns 1.
  3956.  
  3957. sub isType {
  3958.     my ($key, $type, $pattern) = @_;
  3959.     foreach $tt (@{$phi{$key}}){
  3960. #   print "isType, tt is $tt key is $key\n";
  3961.     if ($pattern) {
  3962.         if ($tt =~ /$type/) {
  3963.         return 1;
  3964.         }
  3965.     }
  3966.     elsif ($tt eq $type) {
  3967.         return 1;
  3968.     }
  3969.     }
  3970.     return 0;
  3971. }
  3972.  
  3973. # End of isType()
  3974.  
  3975. #***********************************************************************************************************
  3976. #***********************************************************************************************************
  3977. #***********************************************************************************************************
  3978. # Function: isPHIType()
  3979. # Arguments: string $mytype (PHI type),  array of string @phiTypes
  3980. # Returns: 1 if PHI $mytype appears in @phiTypes, 0 otherwise.
  3981. # Called by: deid()
  3982. # Description: Given a PHI type, checks if it appears in @phiTypes, if so, returns 1. Returns 0 otherwise.
  3983.  
  3984. sub isPHIType {
  3985.   my (  $mytype,  @phiTypes) = @_;
  3986.    
  3987.     #foreach $tt (@{$phi{$key}}){
  3988.     foreach $tt (@phiTypes){
  3989.    
  3990.     if ($tt =~ /$mytype/) {
  3991.         return 1;
  3992.     } #end if
  3993.     } #end foreach
  3994.     return 0;
  3995. }
  3996.  
  3997. # End of isPHIType()
  3998.  
  3999.  
  4000. #***********************************************************************************************************
  4001. #***********************************************************************************************************
  4002. #***********************************************************************************************************
  4003. # Function: isCommon()
  4004. # Arguments: string $word
  4005. # Returns: 1 if given word is a common word; 0 otherwise
  4006. # Called by: findPHI()
  4007. # Description: Compares the given word to the common_words association, compiled from dictionary files for common English words and from SNOMED.
  4008. # Returns 1 if given word is in one of those lists, i.e. is a common word.
  4009.  
  4010. sub isCommon {
  4011.     my $word = $_[0];
  4012.     chomp $word;
  4013.     $word = uc($word);
  4014.     return  ($common_words{$word} || $unambig_common_words{$word});
  4015.    
  4016. }
  4017. # End of isCommon()
  4018.  
  4019. #***********************************************************************************************************
  4020. #***********************************************************************************************************
  4021. #***********************************************************************************************************
  4022. # Function: isUnambigCommon()
  4023. # Arguments: string $word
  4024. # Returns: 1 if given word is a really common word or unambig med terms; 0 otherwise
  4025. sub isUnambigCommon {
  4026.     my $word = $_[0];
  4027.     $word = uc($word);
  4028.     return $unambig_common_words{$word};
  4029. }
  4030. # End of isUnambigCommon()
  4031.  
  4032.  
  4033. #***********************************************************************************************************
  4034. #***********************************************************************************************************
  4035. #***********************************************************************************************************
  4036. # Function: isNameAmbig()
  4037. # Arguments: string $word
  4038. # Returns: 1 if given word is an ambiguous name
  4039. # Called by: findPHI()
  4040. # Description: Searches for given word in lists of ambiguous male, female and last names
  4041. # Returns 1 if word is in any of those lists
  4042. sub isNameAmbig {
  4043.     my $word = $_[0];
  4044.     $word = uc($word);
  4045.     return (($male_ambig{$word}) || ($female_ambig{$word}) || ($last_ambig{$word}));
  4046. }
  4047.  
  4048. #***********************************************************************************************************
  4049. #***********************************************************************************************************
  4050. #***********************************************************************************************************
  4051. #Function: isProbablyName()
  4052. #Arguments: string $key, $potential_name
  4053. #Returns true if
  4054. #name is Not a common word  OR
  4055. #name is Unambiguous name OR
  4056. #name maybe ambiguous BUT starts with Capital_letter followed by small_case letter OR
  4057. #name is popular
  4058. sub isProbablyName{
  4059.     my ($key, $potential_name) = @_;
  4060.  
  4061.     if ( (!isCommon($potential_name)) ||
  4062.      ((isType($key, "Name", 1) && isType($key, "(un)"))  ||
  4063.       (isType($key, "Name", 1) && ($potential_name =~ /\b(([A-Z])([a-z]+))\b/g)) ||
  4064.       (isType($key, "popular",1)) )) {
  4065.      
  4066.     return 1;
  4067.     } else {
  4068.     return 0;
  4069.     }
  4070.  
  4071. }
  4072. #***********************************************************************************************************
  4073. #***********************************************************************************************************
  4074. #***********************************************************************************************************
  4075. #Function: isUSStateAbbre
  4076. #Data structure used for us states
  4077. #Returns true if the word is a US state abbreviation
  4078. sub isUSStateAbbre {
  4079.     my $word = $_[0];
  4080.     $word = uc($word);
  4081.  
  4082.     foreach $loc (@us_states_abbre){
  4083.     if ($word =~/\b$loc\b/gi){
  4084.         return 1;
  4085.     }
  4086.     }
  4087.     foreach $loc (@more_us_states_abbre){
  4088.     if ($word =~/\b$loc\b/gi){
  4089.         return 1;
  4090.     }
  4091.     }
  4092.     return 0;
  4093.  
  4094. }
  4095.  
  4096. #***********************************************************************************************************
  4097. #***********************************************************************************************************
  4098. #***********************************************************************************************************
  4099. #Function: isNameIndicator
  4100. #Returns true if the word is a name indicator
  4101. sub isNameIndicator {
  4102.     my $word = $_[0];
  4103.     $word = uc($word);
  4104.  
  4105.     foreach $nam (@name_indicators){
  4106.     #print "nam in name indicators is $nam";
  4107.     if ($word =~/\b$nam\b/gi){
  4108.        
  4109.         return 1;
  4110.     }
  4111.     }
  4112.  
  4113.     return 0;
  4114. }
  4115.  
  4116. #***********************************************************************************************************
  4117. #***********************************************************************************************************
  4118. #***********************************************************************************************************
  4119. #Function: isUSState
  4120. #Returns true if the word is a US State
  4121. sub isUSState {
  4122.     my $word = $_[0];
  4123.     $word = uc($word);
  4124.     #return (($us_states{$word}));
  4125.     foreach $loc (@us_states){
  4126.     if ($word =~/\b$loc\b/gi){
  4127.         return 1;
  4128.     }
  4129.     }
  4130.     return 0;
  4131.  
  4132.  
  4133. }
  4134.  
  4135. #***********************************************************************************************************
  4136. #***********************************************************************************************************
  4137. #***********************************************************************************************************
  4138. # Function: isFirstName()
  4139. # Arguments: string $word
  4140. # Returns: 1 if given word is an ambiguous or unambiguous firstname
  4141. # Called by: findPHI()
  4142.  
  4143. sub isFirstName {
  4144.     my $word = $_[0];
  4145.     #$word = ($word);
  4146.     $word = uc($word);
  4147.     return (($male_ambig{$word}) || ($female_ambig{$word}) || ($male_unambig{$word}) || ($female_unambig{$word}) || ($male_popular{$word}) || ($female_popular{$word}));
  4148. }
  4149. # End of isFirstName()
  4150.  
  4151. #***********************************************************************************************************
  4152. #***********************************************************************************************************
  4153. #***********************************************************************************************************
  4154. # Function: isCommonest()
  4155. # Arguments: string $word
  4156. # Returns: 1 if given word is a commonest word; 0 otherwise
  4157. # Called by: findPHI()
  4158. # Description: Compares the given word to the commonest_words association, compiled from dictionary file for commonest English words.
  4159. # Returns 1 if given word is in that list, i.e. is a commonest word.
  4160.  
  4161. sub isCommonest {
  4162.     my $word = $_[0];
  4163.     $word = uc($word);
  4164.     return ($very_common_words{$word} || isUnambigCommon($word));
  4165. }
  4166.  
  4167. # End of isCommonest()
  4168. #***********************************************************************************************************
  4169. #***********************************************************************************************************
  4170. #***********************************************************************************************************
  4171. # Function: getNote()
  4172. # Arguments: int $patient (patient number), int $note (note number)
  4173. # Returns: string $noteText
  4174. # Called by: deidStats() (obsolete)
  4175. # Description: Given patient and note numbers, looks up header in $header_file
  4176. # If it finds the header, extracts the body of the note, and returns text until the end pattern
  4177. sub getNote{
  4178.  
  4179.     my ($patient, $note) = @_;    
  4180.     open DF, $data_file or die "Cannot open $data_file";
  4181.     my $noteFound = 0;
  4182.     my $noteText = "";
  4183.    
  4184.   D:    
  4185.     # Parses the data file line-by-line to match the header found in the header file
  4186.     while ($line = <DF>) {
  4187.     chomp $line;
  4188.    
  4189.  
  4190.     # If header is found in the text, then matches the end pattern, and sets the body of the note (excluding the header) as the note text
  4191.  
  4192.     if ($line =~ /\b$patient\|\|\|\|$note\|\|\|\|/) {
  4193.         $noteFound = 1;
  4194.         $noteText = "";
  4195.     }
  4196.     else {     
  4197.         if ($noteFound) {
  4198.         if ($line eq "||||END_OF_RECORD"){
  4199.             #$noteText = $noteText."\n".$1;
  4200.             #$noteText = $noteText.$1."\n";
  4201.             $end = $2;
  4202.             last D;
  4203.         }
  4204.         else {
  4205.             $noteText = $noteText.$line."\n";
  4206.         }
  4207.         }
  4208.     }
  4209.     }
  4210.     close DF;
  4211.    
  4212.     # If the note text has zero length, prints an error message
  4213.     if ((length $noteText) == 0) {
  4214.     print("Warning. No text found for Patient $patient, Note $note ");
  4215.     }
  4216.  
  4217.     # Returns the body of the note (everything excluding the header) as the note text
  4218.     return $noteText;
  4219. }
  4220. # End of getNote()
  4221.  
  4222.  
  4223.  
  4224.  
  4225. #***********************************************************************************************************
  4226. #***********************************************************************************************************
  4227. #***********************************************************************************************************
  4228. # Function: isValidDate()
  4229. # Arguments: int $month, int $day, int $year (If the date being validated doesn't specify year, the "year" argument should be "-1")
  4230. # Returns: 1 if given date is valid based on the calendar
  4231. # Called by: findPHI(), isValidDay()
  4232. # Description: Verifies if the given date is valid or not.
  4233.  
  4234. sub isValidDate{
  4235.     my ($month, $day, $year) = @_;
  4236.     if (($year!=-1) && (length $year) == 2) {
  4237.     #if ($year < 30) {
  4238.     if ($year <= $TWO_DIGIT_YEAR_THRESHOLD) {
  4239.         $year = "20".$year;
  4240.     }
  4241.     else {
  4242.         $year = "19".$year;
  4243.     }
  4244.     }
  4245.    
  4246.     #if (($year != -1) && ($year < 1900 || $year > 2030)){
  4247.     #if (($year != -1) && ($year <= $VALID_YEAR_LOW || $year >= $VALID_YEAR_HIGH)){
  4248.     if (($year != -1) && ($year < $VALID_YEAR_LOW || $year > $VALID_YEAR_HIGH)){
  4249.    
  4250.     return 0;
  4251.     }
  4252.    
  4253.     # Invalid months and days
  4254.     if (($month< 1) || ($month > 12) || ($day < 1) || ($day > 31)) {
  4255.     return 0;
  4256.     }
  4257.  
  4258.     # Checks validity of February days
  4259.     if ($month == 2) {
  4260.     if (($year != -1) && (($year % 4) == 0) && ($year != 2000)) {
  4261.         return ($day <= 29);
  4262.     }
  4263.     return ($day <= 28);
  4264.    
  4265.     # Checks validity of months consisting of 30 days
  4266.     }
  4267.     elsif (($month == 4) || ($month == 6) || ($month== 9) || ($month == 11)) {
  4268.     return ($day <= 30);
  4269.     }
  4270.    
  4271.     # Checks validity of months consisting of 31 days
  4272.     return ($day <= 31);
  4273. }
  4274. # End of isValidDate()
  4275. #***********************************************************************************************************
  4276. #***********************************************************************************************************
  4277. #***********************************************************************************************************
  4278. # Function: isValidDay()
  4279. # Arguments: int $day, string $month, int $year
  4280. # Returns: 1 if given date is valid; 0 otherwise
  4281. # Called by: findPHI()
  4282. # Description: Verifies validity of date when month is printed, by calling isValidDate()
  4283.  
  4284. sub isValidDay {
  4285.     my ($day, $month, $year) = @_;
  4286.     my $mnum = 0;
  4287.  
  4288.     # Converts printed out months to numerical months
  4289.     if ($month =~ /Jan|January|Januar/i) { $mnum = 1; }
  4290.     elsif ($month =~ /Feb|February|Februar/i) { $mnum = 2; }
  4291.     elsif ($month =~ /Mar|March|Maerz/i) { $mnum = 3; }
  4292.     elsif ($month =~ /Apr|April/i) { $mnum = 4; }
  4293.     elsif ($month =~ /May|Mai/i) { $mnum = 5; }
  4294.     elsif ($month =~ /June|Jun|Juni/i) { $mnum = 6; }
  4295.     elsif ($month =~ /July|Jul|Juli/i) { $mnum = 7; }
  4296.     elsif ($month =~ /August|Aug/i) { $mnum = 8; }
  4297.     elsif ($month =~ /September|Sept|Sep/i) { $mnum = 9; }
  4298.     elsif ($month =~ /October|Oct/i) { $mnum = 10; }
  4299.     elsif ($month =~ /November|Nov/i)  { $mnum = 11; }
  4300.     elsif ($month =~ /December|Dec/i) { $mnum = 12; }
  4301.     if ($mnum == 0) { return 0; }
  4302.    
  4303.     return (isValidDate($mnum, $day, $year));
  4304. }
  4305. # End of isValidDay()
  4306.  
  4307.  
  4308. #***********************************************************************************************************
  4309. #***********************************************************************************************************
  4310. #***********************************************************************************************************
  4311. # Function: outputText()
  4312. # Arguments: hash %deids (key=PHI start index, value=PHI end index), hash %phiT (key=PHI start-end indices, value=PHI type)
  4313. # Returns: None
  4314. # Called by: deid()
  4315. # Description: Creates the de-identified version of the text
  4316. # Replaces dates with shifted dates, and other PHI with their PHI types
  4317. sub outputText {
  4318.     my %deids = %{$_[0]};
  4319.     my %phiT = %{$_[1]};
  4320.  
  4321.     # These are the date and PID of the medical record. The year needs to be initialized.
  4322.    # my $checkYear = "2005";
  4323.     my $checkYear;
  4324.     my $checkDate;
  4325.     my $checkID;
  4326.  
  4327.     # Forms associations between printed months and numerical ones
  4328.     # This is because the date-shifting function only accepts numerical months, and months in the text can be non-numerical as well
  4329.     my %months;
  4330.     $months{"jan"} = 1;
  4331.     $months{"feb"} = 2;
  4332.     $months{"mar"} = 3;
  4333.     $months{"apr"} = 4;
  4334.     $months{"may"} = 5;
  4335.     $months{"jun"} = 6;
  4336.     $months{"jul"} = 7;
  4337.     $months{"aug"} = 8;
  4338.     $months{"sep"} = 9;
  4339.     $months{"oct"} = 10;
  4340.     $months{"nov"} = 11;
  4341.     $months{"dec"} = 12;
  4342.  
  4343.     #TF is the .res file (de-identified corpus), and OF is the (.phi file) list of PHIs
  4344.     # open TF, ">$deid_text_file" or die "Cannot open $deid_text_file";
  4345.     open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file";   #now open in append mode
  4346.     my $lastEnd = 0;
  4347.     #open OF, ">>$output_file_old" or die "Cannot open $output_file_old";
  4348.  
  4349.     my $phiType = "";
  4350.  
  4351.     # Prints the PHI locations to the output file (.phi)
  4352.     foreach $k (sort numerically keys %deids) {        
  4353.     my @deidsval = @{$deids{$k}};
  4354.     local $added = 0;
  4355.     my $phiID;
  4356.  
  4357.     # Loops over each PHI recorded in %deids
  4358.     # %deids maps each PHI start index to an array of 3 items
  4359.     # These items are in order: the PHI end index, the PID of the PHI, the record date of the PHI    
  4360.     #if (exists ${@{$deids{$k}}}[0]) {
  4361.     if (exists $deidsval[0]){
  4362.         # Sets some key variables for the PHI
  4363.  
  4364.         #the following no longer works with perl v5.10
  4365.         #$deidsend = ${@{$deids{$k}}}[0]; # End index
  4366.         #$checkID = ${@{$deids{$k}}}[1]; # PID
  4367.         #$checkDate = ${@{$deids{$k}}}[2]; # Record date
  4368.  
  4369.         $deidsend = $deidsval[0]; # End index
  4370.         $checkID = $deidsval[1]; # PID
  4371.         $checkDate = $deidsval[2]; # Record date
  4372.         $checkYear = (substr $checkDate, 6, 4); # Record year
  4373.         if (length($checkYear) ==0) {
  4374.         $checkYear = extractYear($DEFAULT_DATE);
  4375.         print "Warning, in outputText(), cannot extract year from noteDate, setting year to default year $checkYear.";
  4376.         }
  4377.  
  4378.         # Immediately prints the start and end indices of the PHI to the .phi file
  4379.         #print OF "$k\t$deidsend\n";
  4380.  
  4381.         # Sets the $key to the current PHI
  4382.         my $key = $k."-".$deidsend;
  4383.         my $lastlast = $lastEnd;
  4384.         my $phiText;
  4385.        
  4386.         # If this PHI is a date element, shifts the date and replaces it in the text
  4387.         # Output format is YYYY/MM/DD always
  4388.         # For month/date formats, assumes that year = record year; for month/year formats, assumes that day = 1
  4389.  
  4390.         # Needs to go over %phiT for every PHI in %deids
  4391.         # %phiT maps each PHI key to its type, e.g. "Mary" -> "First Name"
  4392.         # This part is necessary because re-identification depends on the PHI's type
  4393.         foreach $ky (sort numerically keys %phiT) {
  4394.  
  4395.         my $ky = $ky;
  4396.         ($startp, $endp) = split "-", $ky;
  4397.         $notAmbig = 0; 
  4398.        
  4399.         # Checks to see if PHI type matches any of the date patterns
  4400.         # Each PHI may have more than one listed type, e.g. First Name AND Last Name
  4401.         # For each PHI type listed for the specific PHI
  4402.         foreach $t (@{$phiT{$ky}}) {
  4403.  
  4404.             $datephi1 = "Year/Month/Day"; # e.g. 1999/2/23
  4405.             $datephi2 = "Year/Day/Month"; # e.g. 1999/23/2, note: pattern currently not filtered in sub date()!
  4406.             $datephi3 = "Month/Day/Year"; # e.g. 2/23/1999
  4407.             $datephi5 = "Day/Month/Year"; # 23/2/1999, note: pattern currently not filtered in sub date()!
  4408.             $datephi4 = "Month/Day"; # e.g. 2/23, using record year as year
  4409.             $datephi6 = "Day/Month"; # e.g. 23/2, using record current year as year, note: pattern currently not filtered in sub date()!
  4410.             $datephi7 = "Month/Year";  # e.g. 2/1999, using 1 as day
  4411.             $datephi8 = "Year/Month";  # e.g. 1999/2, using 1 as day
  4412.             $datephi9 = "Day Month Year"; # e.g. 23 february 1999
  4413.             $datephi10 = "Month Day Year"; # e.g. feb 23 1999 or feb. 23rd 1999
  4414.             $datephi11 = "Month Day"; # e.g. feb 23, using record year as year
  4415.             $datephi12 = "Day Month"; # e.g. 23 february, using record year as year
  4416.             $datephi13 = "Month Year"; # e.g. feb 1999, or february of 1999 or feb. 1999, using 1 as day
  4417.             $datephi14 = "Header Date"; # not important
  4418.             $datephi15 = "4 digits"; # 4-digit year, e.g. 1999
  4419.             $datephi16 = "2 digits"; # 2-digit year, e.g. '99
  4420.             $datephi17 = "Day Month Year 2"; # e.g. 23rd february 1999
  4421.  
  4422.            
  4423.             #if ($ky =~/$key/) {$phiType = $t;}
  4424.             if ($ky eq $key) {$phiType = $t;}
  4425.  
  4426.             # Calls the date-shifting function alterdate() with a date argument appropriate for the date pattern
  4427.             # This is because alterdate() accepts an argument of a fixed date pattern
  4428.             # Prints the resulting shifted date in deid_text_file (.res)
  4429.            
  4430.             # If the current PHI has not been output to the .res file yet, checks if PHI type is date
  4431.             # Shifts the date and writes the shifted date to TF (.res file)
  4432.             if ($added == 0) {
  4433.            
  4434.             #if (($t =~ /$datephi1/)  && ($ky=~/$key/)){
  4435.             if (($t =~ /$datephi1/)  && ($ky eq $key)){        
  4436.                 $date = (substr $allText, $k, ($deidsend-$k));             
  4437.                 $date =~ s/\-/\//g;    
  4438.                 $date =~ s/\./\//g;
  4439.                 $altereddate = &alterdate($date, $checkID);
  4440.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4441.                 $longyear = $1;
  4442.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";                        
  4443.                 $added = 1;
  4444.                 $lastEnd = $deidsend;
  4445.                
  4446.             }
  4447.             elsif (($t =~ /$datephi17/)  && ($ky eq $key)){  
  4448.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4449.                
  4450.                 $date =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b([A-Za-z]+)\.?,?)\s+(\d{2,4}))\b/ig; # 12-Apr, or Second of April
  4451.  
  4452.                 $mon = $6;
  4453.                 $day = $3;
  4454.                 $year = $7;
  4455.                 foreach $m (sort keys %months) {
  4456.                 if ($mon =~ /$m/ig) {
  4457.                     $month = $months{$m};
  4458.                 }
  4459.                 }              
  4460.                 $date =  "$year/$month/$day";
  4461.  
  4462.                 $altereddate = &alterdate($date, $checkID );
  4463.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4464.                
  4465.                 $added = 1;
  4466.                 $lastEnd = $deidsend;
  4467.             }              
  4468.             #elsif (($t =~ /$datephi9/)  && ($ky=~/$key/)){
  4469.             elsif (($t =~ /$datephi9/)  && ($ky eq $key)){  
  4470.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4471.                 $date =~ /(\d+) ([A-Za-z]+)\,? (\d+)/;
  4472.                 $mon = $2;
  4473.                 $day = $1;
  4474.                 $year = $3;
  4475.                 foreach $m (sort keys %months) {
  4476.                 if ($mon =~ /$m/ig) {
  4477.                     $month = $months{$m};
  4478.                 }
  4479.                 }              
  4480.                 $date =  "$year/$month/$day";
  4481.                 $longyear = $5;
  4482.                 $altereddate = &alterdate($date, $checkID );
  4483.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4484.                
  4485.                 #local $added = 1;
  4486.                 $added = 1;
  4487.                 $lastEnd = $deidsend;
  4488.             }          
  4489.             #elsif (($t =~ /$datephi10/)  && ($ky=~/$key/)){  
  4490.             elsif (($t =~ /$datephi10/)  && ($ky eq $key)){
  4491.  
  4492.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4493.                
  4494.                 $date =~ /\b(([A-Za-z]+)\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?(\d{2,4}))\b/ig;
  4495.                 $mon = $2;
  4496.                 $day = $3;
  4497.                 $year = $5;
  4498.                 #$date =~ /([A-Za-z]+) (\d+)\,? (\d+)/;
  4499.                 #$mon = $1;
  4500.                 #$day = $2;
  4501.                 #$year = $3;
  4502.                 #print "DatePHI10: Before date shift: month= $mon, day = $day , year = $year \n";
  4503.  
  4504.                 foreach $m (sort keys %months) {
  4505.                 if ($mon =~ /$m/ig) {
  4506.                     $month = $months{$m};
  4507.                 }
  4508.                 }
  4509.                 $date =  "$year/$month/$day";
  4510.                 $altereddate = &alterdate($date, $checkID);
  4511.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4512.                
  4513.                 #local $added = 1;
  4514.                 $added = 1;
  4515.                 $lastEnd = $deidsend;
  4516.             }                  
  4517.             #elsif (($t =~ /$datephi2/)  && ($ky=~/$key/)){  
  4518.             elsif (($t =~ /$datephi2/)  && ($ky eq $key)){  
  4519.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4520.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4521.                 $date =  "$1/$5/$3";
  4522.                 $longyear = $1;
  4523.                 $altereddate = &alterdate($date, $checkID);
  4524.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4525.                 #local $added = 1;
  4526.                 $added = 1;
  4527.                 $lastEnd = $deidsend;
  4528.             }      
  4529.             #elsif (($t =~ /$datephi3/)  && ($ky=~/$key/)){  
  4530.             elsif (($t =~ /$datephi3/)  && ($ky eq $key)){  
  4531.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4532.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4533.                 $date =  "$5/$1/$3";
  4534.                 $altereddate = &alterdate($date,  $checkID);
  4535.                 $longyear = $5;
  4536.                
  4537.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4538.                 $added = 1;
  4539.                 $lastEnd = $deidsend;              
  4540.             }
  4541.             #elsif (($t =~ /$datephi5/)  && ($ky=~/$key/)){    
  4542.             elsif (($t =~ /$datephi5/)  && ($ky eq $key)){                 
  4543.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4544.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4545.                 $date =  "$5/$3/$1";
  4546.                 $altereddate = &alterdate($date, $checkID );
  4547.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4548.                 $longyear = $5;
  4549.                 $added = 1;
  4550.                 $lastEnd = $deidsend;              
  4551.             }  
  4552.             #elsif (($t =~ /$datephi4/)  && ($ky=~/$key/)){  
  4553.             elsif (($t =~ /$datephi4/)  && ($ky eq $key)){                 
  4554.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4555.                 $date =~ s/\-/\//g;    
  4556.                 $date = "$checkYear/$date";            
  4557.                 $altereddate = &alterdate($date, $checkID);
  4558.                 $altereddate = (substr $altereddate, 5, (length($altereddate)-5));
  4559.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4560.                 $added = 1;
  4561.                 $lastEnd = $deidsend;              
  4562.             }  
  4563.             #elsif (($t =~ /$datephi11/)  && ($ky=~/$key/)){  
  4564.             elsif (($t =~ /$datephi11/)  && ($ky eq $key)){                
  4565.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4566.                 #$date =~ /([A-Za-z]+) (\d+)/;
  4567.                 #$mon = $1;
  4568.                 #$day = $2;
  4569.                 $date =~ /\b(([A-Za-z]+)\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig;  # Apr. 12
  4570.                 $mon = $2;
  4571.                 $day = $3;
  4572.                
  4573.                 foreach $m (sort keys %months) {
  4574.                 if ($mon =~ /$m/ig) {
  4575.                     $month = $months{$m};
  4576.                 }
  4577.                 }
  4578.                 $date =  "$checkYear/$month/$day";
  4579.                 $altereddate = &alterdate($date,  $checkID);
  4580.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));              
  4581.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4582.                 $added = 1;
  4583.                 $lastEnd = $deidsend;
  4584.             }
  4585.                
  4586.             #elsif (($t =~ /$datephi6/)  && ($ky=~/$key/)){  
  4587.             elsif (($t =~ /$datephi6/)  && ($ky eq $key)){                 
  4588.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4589.                 $date =~ /(\d+)(.)(\d+)/;
  4590.                 #print "DatePHI6: date is $date, 3 is $3, 1 is $1\n";
  4591.                 $date =  "$3/$1";
  4592.                 $date = "$checkYear/$date";
  4593.                 $altereddate = &alterdate($date,  $checkID);
  4594.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));
  4595.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4596.                 $added = 1;
  4597.                 $lastEnd = $deidsend;              
  4598.             }
  4599.             #elsif (($t =~ /$datephi12/)  && ($ky=~/$key/)){    
  4600.             elsif (($t =~ /$datephi12/)  && ($ky eq $key)){                
  4601.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4602.                 #$date =~ /(\d+) ([A-Za-z]+)/;
  4603.                 #$mon = $2;
  4604.                 #$day = $1;
  4605.                 $date =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b([A-Za-z]+))\b/ig;
  4606.                
  4607.                 $mon = $5;
  4608.                 $day = $2;
  4609.                 #print "month is $month, day is $day\n";
  4610.                 foreach $m (sort keys %months) {
  4611.                 if ($mon =~ /$m/ig) {
  4612.                     $month = $months{$m};
  4613.                 }
  4614.                 }
  4615.                 $date =  "$checkYear/$month/$day";
  4616.                 $altereddate = &alterdate($date,  $checkID);
  4617.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));              
  4618.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4619.                 $added = 1;
  4620.                 $lastEnd = $deidsend;
  4621.             }
  4622.             #elsif (($t =~ /$datephi13/)  && ($ky=~/$key/)){  
  4623.             elsif (($t =~ /$datephi13/)  && ($ky eq $key)){  
  4624.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4625.                 $date =~ /([A-Za-z]+)(\.)?(\s+)(of\s+)?(\d+)/ig;
  4626.                 $mon = $1;
  4627.                 $year = $5;
  4628.                 foreach $m (sort keys %months) {
  4629.                 if ($mon =~ /$m/ig) {              
  4630.                     $month = $months{$m};
  4631.                 }
  4632.                 }
  4633.  
  4634.                 $date = "$year/$month/1";
  4635.                 $longyear = $3;
  4636.                 $altereddate = &alterdate($date, $checkID );
  4637.                
  4638.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4639.                 $added = 1;
  4640.                 $lastEnd = $deidsend;
  4641.             }
  4642.             #elsif (($t =~ /$datephi7/)  && ($ky=~/$key/)){    
  4643.             elsif (($t =~ /$datephi7/)  && ($ky eq $key)){                 
  4644.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4645.                 $date =~ /(\d+)(.)(\d+)/;      
  4646.                 $date =  $3.'/'.$1.'/1';
  4647.                 $altereddate = &alterdate($date,  $checkID);
  4648.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4649.                 $longyear = $3;
  4650.                 $added = 1;
  4651.                 $lastEnd = $deidsend;
  4652.             }
  4653.             #elsif (($t =~ /$datephi8/)  && ($ky=~/$key/)){  
  4654.             elsif (($t =~ /$datephi8/)  && ($ky eq $key)){                 
  4655.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4656.                 $date =~ /(\d+)(.)(\d+)/;      
  4657.                 $date =  $1.'/'.$3.'/1';
  4658.                 $altereddate = &alterdate($date,  $checkID);
  4659.                 $longyear = $1;
  4660.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4661.                 $added = 1;
  4662.                 $lastEnd = $deidsend;
  4663.             }
  4664.             #elsif (($t =~ /$datephi14/)  && ($ky=~/$key/)) {
  4665.             elsif (($t =~ /$datephi14/)  && ($ky eq $key)) {
  4666.                 $date = (substr $allText, $k, ($deidsend-$k));
  4667.                 $date =~ /(\d+)(\-)(\d+)(\-)(\d+)/;
  4668.                 $date = $1.'/'.$3.'/'.$5;
  4669.                 $altereddate = &alterdate($date,  $checkID);
  4670.                 $longyear = $1;
  4671.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."$date";
  4672.                 $added = 1;
  4673.                 $lastEnd = $deidsend;
  4674.             }
  4675.             #elsif (($t =~ /$datephi15/)  && ($ky=~/$key/)) {
  4676.             elsif (($t =~ /$datephi15/)  && ($ky eq $key)) {
  4677.                 $year  = (substr $allText, $k, ($deidsend-$k));
  4678.                 $date = $year.'/1/1';
  4679.                 $altereddate = &alterdate($date, $checkID);
  4680.                 $alteredyear = (substr $altereddate, 0, 4);
  4681.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
  4682.                 $added = 1;
  4683.                 $lastEnd = $deidsend;
  4684.             }  
  4685.             #elsif (($t =~ /$datephi16/)  && ($ky=~/$key/)) {
  4686.             elsif (($t =~ /$datephi16/)  && ($ky eq $key)) {
  4687.                 $year  = (substr $allText, $k, ($deidsend-$k));
  4688.  
  4689.                 #if ($year > 20) {
  4690.                 #   $year = '19'.$year;
  4691.                 #} else {
  4692.                 #   #$year = '20.'.$year;  
  4693.                 #   $year = '20'.$year;
  4694.                 #    }
  4695.              
  4696.                 $date = $year.'/1/1';
  4697.  
  4698.                 $altereddate = &alterdate($date, $checkID);
  4699.                 $alteredyear = (substr $altereddate, 2, 2);
  4700.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
  4701.                 $added = 1;
  4702.                 $lastEnd = $deidsend;
  4703.             }
  4704.             # If the current PHI is not a date, then indicates that it has not been output yet
  4705.             else { my $added = 0;}         
  4706.             }
  4707.         }
  4708.         }    
  4709.        
  4710.         # If the PHI is not a date, replaces it in deid_text_file (.res) by its PHI type tag
  4711.         if ($added==0) {
  4712.        
  4713.         if ($k > $lastEnd || ($k==0)) {
  4714.             $phiText = (substr $allText, $k, ($deidsend-$k));
  4715.        
  4716.  
  4717.             # Parentheses are eliminated so that they do not trip up the run
  4718.             $phiText =~ s/\(//g;
  4719.             $phiText =~ s/\)//g;
  4720.             $phiText =~ s/\+//g;       
  4721.            
  4722.             $found = 0;
  4723.  
  4724.             # Assigns a unique ID to each PHI, e.g. all instances of "Mary" may be assigned "1", but "John" may be assigned "2"
  4725.             # %ID maps each PHI (e.g. "Mary") to its ID (e.g. "1")
  4726.             foreach $phik (keys %ID) {
  4727.             if ($phik =~/$phiText/ig) {
  4728.                 $found = 1;
  4729.             }
  4730.             }
  4731.  
  4732.             # If the current PHI to be added to .res file is already recorded in %ID, then retrieves its unique ID
  4733.             if ($found==1) {
  4734.             $phiID = $ID{$phiText};
  4735.             }
  4736.  
  4737.             # If the current PHI is not recorded in %ID, records it in %ID and assigns the PHI a unique ID
  4738.             else {
  4739.             $ID{$phiText} = keys(%ID) + 1;
  4740.             $phiID = $ID{$phiText};
  4741.             }
  4742.                        
  4743.             # Prints the PHI type and PHI ID in place of the original PHI in the .res file
  4744.             print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$phiType $phiID**]";
  4745.             $lastEnd = $deidsend;
  4746.         }
  4747.  
  4748.         else {
  4749.             if ($lastEnd < $deidsend) {
  4750.             $lastEnd = $deidsend;
  4751.             }
  4752.         }
  4753.        
  4754.         if ($lastEnd == 0) {
  4755.             $lastEnd = $lastlast;
  4756.         }
  4757.         }      
  4758.     }
  4759.     }
  4760.     #close OF;
  4761.     #print "Finished outputing to the .phi file.";
  4762.    
  4763.     # Prints the remaining non-PHI text to the .res file    
  4764.     print TF (substr $allText, $lastEnd);  
  4765.     #print "finished outputing to the .res file";
  4766.  
  4767.     close TF;  
  4768. }
  4769. # End of outputText()
  4770.  
  4771.  
  4772. #***********************************************************************************************************
  4773. #***********************************************************************************************************
  4774. #***********************************************************************************************************
  4775. # Function: alterdate()
  4776. # Arguments: string $date (yyyy/mm/dd), int $pid (patient ID)
  4777. # Returns: string $year-$month-$day
  4778. # Called by: outputText()
  4779. # Description: Converts given date to shifted date, depending on date-shift-mode, by calling doalterdate().
  4780. # Returns shifted date
  4781.  
  4782. sub alterdate {
  4783.     # Separates date fields by splitting along "/" or "-"
  4784.     # Then calls doalterdate on the resulting array of date elements
  4785.     $d = $_[0];  
  4786.     if (substr($d,0,6) =~ /(\/)/) {
  4787.     @d = split '/', $d;
  4788.     }
  4789.     elsif (substr($d,0,6) =~ /(\-)/) {
  4790.     @d = split '-', $d;
  4791.     }
  4792.  
  4793.     ($entryyear, $entrymonth)=@d[0..1];
  4794.     return join "-", &doalterdate(@d, $_[1]);
  4795. }
  4796. # End of alterdate()
  4797. #***********************************************************************************************************
  4798. #***********************************************************************************************************
  4799. #***********************************************************************************************************
  4800. # Function: doalterdate()
  4801. # Arguments: int $year, int $month, int $day, int $pid (patient ID)
  4802. # Returns: array @($year, $month, $day)
  4803. # Called by: alterdate()
  4804. # Description: Converts given date, depending on date-shift-mode, to shifted date.
  4805. # Returns shifted date in an array format to calling function alterdate(). alterdate() converts the shifted date to a string.
  4806.  
  4807. sub doalterdate {
  4808.     my $year=$_[0];
  4809.     if (length $year == 2) {
  4810.  
  4811.             # Limits 2-digit years between 1900 and 2020
  4812.         # Converts them to 4-digit years
  4813.         #if ($year<=10) {$year = 2000+$year;}  
  4814.         #if ($year<=20) {$year = 2000+$year;}
  4815.         if ($year<=$TWO_DIGIT_YEAR_THRESHOLD) {$year = 2000+$year;}
  4816.         else {$year = 1900+$year;}
  4817.     }
  4818.     my $month=$_[1];
  4819.     my $day=$_[2];
  4820.     my $pid=$_[3];          
  4821.    
  4822.     if ($pid_dateshift_list =~ /y/) {
  4823.         open SF, $date_shift_file or die "Cannot open $date_shift_file";
  4824.         while ($line = <SF>) {
  4825.         chomp $line;
  4826.         if ($line =~ /\A($pid)\|\|\|\|([0-9\-]+)/) {
  4827.             $offset = $2;
  4828.         }      
  4829.         }
  4830.         close SF;
  4831.     }
  4832.  
  4833.     my $ml=&monthlength($month, $year);
  4834.    
  4835.     # $offset = days of offset (positive or negative shift)
  4836.     # Sets the shifted year
  4837.     $offset_local = $offset;
  4838.     if ($offset_local>0) {
  4839.         $year += 4*int($offset_local/1461);
  4840.         $offset_local -=1461*int($offset_local/1461);
  4841.     }
  4842.     if ($offset_local<0){
  4843.         $year -= 4*int(-$offset_local/1461);
  4844.         $offset_local +=1461*int(-$offset_local/1461);
  4845.     }
  4846.    
  4847.     # Shifts number of days
  4848.     $day +=$offset_local;
  4849.    
  4850.     $ml=&monthlength($month, $year);
  4851.  
  4852.     # Changes $day, $month, $year based on the remaining offset after shifting $year
  4853.     while ($day>$ml) {
  4854.         $ml=&monthlength($month, $year);
  4855.         $day=$day-$ml;
  4856.         $month++;
  4857.         if ($month>12) {
  4858.         $month -=12;
  4859.         $year++;
  4860.         }
  4861.         $ml = &monthlength($month, $year);
  4862.     }
  4863.     while ($day<1) {
  4864.         $ml=&monthlength($month-1, $year);
  4865.         $day=$day + $ml;
  4866.         $month--;
  4867.         if ($month<1) {
  4868.         $month +=12;
  4869.         $year--;
  4870.         }
  4871.     }
  4872.    
  4873.     # Formats the output of single-digit month and day: "2" becomes "02"
  4874.     if (length($month)<2) {
  4875.         $month="0".$month;
  4876.     }
  4877.     if (length($day)<2) {
  4878.         $day="0".$day;
  4879.     }
  4880.    
  4881.     # Returns the shifted date as an array
  4882.     return ($year, $month, $day);    
  4883.     }
  4884. # End of doalterdate()
  4885. #***********************************************************************************************************
  4886. #***********************************************************************************************************
  4887. #***********************************************************************************************************
  4888. # Function: altermonthdate()
  4889. # Arguments: int $month, int $day, int $pid (patient ID)
  4890. # Returns: call doalterdate(), returns array ($year, $month, $day)
  4891. # Called by: None
  4892. # Description: Calls doalterdate() which performs dateshift
  4893. # This function is not used
  4894.  
  4895. sub altermonthdate {
  4896.     my ($month, $day, $pid)=@_;
  4897.     my $year;
  4898.    
  4899.     if (($month-$entrymonth)%12<6) {
  4900.     if ($month<$entrymonth) {
  4901.         $year=$entryyear+1;
  4902.     }
  4903.     else {
  4904.         $year=$entryyear;
  4905.     }
  4906.     }
  4907.     else {
  4908.     if ($month>$entrymonth) {
  4909.         $year=$entryyear-1;
  4910.     }
  4911.     else {
  4912.         $year=$entryear;
  4913.     }
  4914.     }
  4915.     return (&doalterdate($year, $month, $day, $pid))[1,2];    
  4916. }
  4917. # End of altermonthdate()
  4918. #***********************************************************************************************************
  4919. #***********************************************************************************************************
  4920. #***********************************************************************************************************
  4921. # Function: getcentury()
  4922. # Arguments: int 2-digit $year
  4923. # Returns: int
  4924. # Called by: None
  4925. # Description: Converts a 2-digit year into a 4-digit year based on entryyear
  4926. # This function is not used
  4927.  
  4928. sub getcentury {
  4929.     my $year=$_[0];
  4930.  
  4931.     if (($year-$entryyear)%100<10) {
  4932.     $year=$entryyear+(($year-$entryyear)%100);
  4933.     }
  4934.     else {
  4935.     $year=$entryyear-(($entryyear-$year)%100);
  4936.     }
  4937. }
  4938. # End of getcentury()
  4939. #***********************************************************************************************************
  4940. #***********************************************************************************************************
  4941. #***********************************************************************************************************
  4942. # Function: monthlength()
  4943. # Arguments: int $m, int $y
  4944. # Returns: int
  4945. # Called by: doalterdate()
  4946. # Description: Returns the number of days in given month-year
  4947.  
  4948. sub monthlength {
  4949.     my($m, $y)=@_;
  4950.  
  4951.     while ($m<=0) {
  4952.     $m += 12;
  4953.     $y --;
  4954.     }
  4955.     while ($m>=13) {
  4956.     $m -= 12;
  4957.     $y ++;
  4958.     }    
  4959.     # Checks for February
  4960.     if ($m==2) {
  4961.     if ($y % 4 ==0) {
  4962.         if($y % 100 ==0) {
  4963.         if ($y % 400 ==0){
  4964.             return 29;
  4965.         }
  4966.         else {
  4967.             return 28;
  4968.         }
  4969.         }
  4970.         else {     
  4971.         return 29;
  4972.         }
  4973.     }
  4974.     else {     
  4975.         return 28;
  4976.     }
  4977.     }
  4978.     # Checks for months consisting of 30 days    
  4979.     elsif (($m==4) || ($m==6) || ($m==9) || ($m==11)) {
  4980.     return 30;
  4981.     }
  4982.     # Checks for months consisting of 31 days    
  4983.     else {     
  4984.     return 31;
  4985.     }
  4986. }
  4987. # End of monthlength()
  4988.  
  4989.  
  4990.  
  4991.  
  4992.  
  4993.  
  4994.  
  4995.  
  4996.  
  4997. #***********************************************************************************************************
  4998. #***********************************************************************************************************
  4999. #***********************************************************************************************************
  5000. # Application-specific code follows. Code may contain patterns specific to our medical notes.
  5001. # Customize by replacing with your application-specific filters.
  5002. #***********************************************************************************************************
  5003. #***********************************************************************************************************
  5004. #***********************************************************************************************************
  5005.  
  5006.  
  5007. #***********************************************************************************************************
  5008. #***********************************************************************************************************
  5009. #***********************************************************************************************************
  5010. #Function: isProbablyPhone
  5011. #Argument: context string
  5012. #Returns: Always returns 1 (true) unless the context is one of the words defined in
  5013. #phone_pre_disqualifier. For future extensions, can add qualifier words
  5014. #such as "phone", "pager", etc.
  5015.  
  5016. sub isProbablyPhone {
  5017.     @phone_pre_disqualifier = ("HR","Heart", "BP", "SVR", "STV", "VT", "Tidal Volumes", "Tidal Volume", "TV", "CKS");
  5018.     $context = $_[0];
  5019.     foreach $i (@phone_pre_disqualifier) {
  5020.     if ($context =~ /\b$i\b/i){
  5021.         return 0;
  5022.     }
  5023.     }      
  5024.     return 1;
  5025. }
  5026. #end of isProbablyPhone()
  5027.  
  5028. #***********************************************************************************************************
  5029. #***********************************************************************************************************
  5030. #***********************************************************************************************************
  5031. # Function: wardname()
  5032. # Argument: string
  5033. # Searches for ward names specific to our hospital
  5034. sub wardname() {
  5035.     $text = $_[0];
  5036.    
  5037.     #Added to catch gs specific wardnames
  5038.     if ($gs_specific_filter =~ /y/){
  5039.        
  5040.     @ward_indicators = ("Quartermain");
  5041.     foreach $ward_ind (@ward_indicators){
  5042.         while ($text =~ /\b(($ward_ind) ?(\d))\b/ig){
  5043.         $wardname = $1;
  5044.         $start = length($`);
  5045.         $end = $start + length($wardname);
  5046.         $key = $start."-".$end;
  5047.         addType ($key, "Wardname");
  5048.         }
  5049.     }
  5050.     }
  5051.  
  5052. }
  5053.  
  5054. # End of function wardname()
  5055.  
  5056.  
  5057.  
  5058. #***********************************************************************************************************
  5059. #***********************************************************************************************************
  5060. #***********************************************************************************************************
  5061. #Function: isProbablyMeasurement
  5062. #Argument: context string
  5063. #Returns: Returns true if the text passed in contains (more specifically, ends with) any of the measurement indicators.
  5064. sub isProbablyMeasurement {
  5065.  @measurement_indicators_pre = ("increased to","decreased from","rose to","fell from", "down to",
  5066.                "increased from", "dropped to", "dec to", "changed to","remains on", "change to");
  5067.     $context = $_[0];
  5068.  
  5069.     foreach $i (@measurement_indicators_pre) {
  5070.  
  5071.     if ($context =~ /\b$i\b/i){  #only match if it ends with the phrase
  5072.         return 1;
  5073.     }
  5074.     }      
  5075.  
  5076.     return 0;
  5077. }
  5078.  
  5079. #end isProbablyMeasurement()
  5080.  
  5081. #***********************************************************************************************************
  5082. #***********************************************************************************************************
  5083. #***********************************************************************************************************
  5084. #Function: isProbablyDate
  5085. #Argument: textBefore string and textAfter string
  5086. #Returns: returns 1 if the context check determines that textBefore or textAfter
  5087. #are most likely strings preceding or following a date PHI; returns 0 otherwise.
  5088. #Description: Subroutine performs a context check on the textBefore and textAfter. If it contains one of the
  5089. #keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
  5090. #Called by dateWithContextCheck on partial dates MM-DD
  5091. sub isProbablyDate{
  5092.     my ($textBefore, $textAfter) = @_;
  5093.    
  5094.     if ((!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|dose|doses|range|ranged|pad|rate|bipap|pap|unload|ventilation|scale|cultures|blood|at|up|with|in|of|RR|ICP|CVP|strength|PSV|SVP|PCWP|PCW|BILAT|SRR|VENT|PEEP\/PS|flowby|drinks|stage) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|beers|per|esophagus|tabs|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns|units|amp|qd|chest pain|intensity)\b/i)) {
  5095.      return 1;
  5096.  }
  5097.  
  5098.  return 0;
  5099.  
  5100. }
  5101. #end isProbablyDate()
  5102.  
  5103. #***********************************************************************************************************
  5104. #***********************************************************************************************************
  5105. #***********************************************************************************************************
  5106. #Function: isProbablyDate2
  5107. #Argument: textBefore string and textAfter string
  5108. #Returns: returns 1 if the context check determines that textBefore or textAfter
  5109. #are most likely strings preceding or following a date PHI; returns 0 otherwise.
  5110. #Subroutine performs a context check on the textBefore and textAfter.  If it contains one of the
  5111. #keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
  5112. #Called by dateWithContextCheck on partial dates MM/DD.  
  5113. sub isProbablyDate2{
  5114.  
  5115. if ( (!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|range|ranged|pad|rate|rating|bipap|pap|unload|ventilation|scale|blood|at|up|with|RR|ICP|CVP|strength|PSV|SVP|PCWP|BILAT|SRR|VENT|PEEP\/PS|flowby) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|drinks|beers|per|esophagus|tabs|tab|tablet|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|drops|breaths|wbcs|beat|beats|ns|units|amp)\b/i)) {
  5116.      return 1;
  5117.  }
  5118.  return 0;
  5119.  
  5120.  
  5121. }
  5122. #end isProbablyDate2()
  5123.  
  5124. #***********************************************************************************************************
  5125. #***********************************************************************************************************
  5126. #Function: extractYear
  5127. #Argument: a date string in the format MM/DD/YYYY
  5128. #Returns: the 4-digit year if the date is in the correct format
  5129. #returns 0000 otherwise.
  5130. sub extractYear{
  5131.  
  5132.   $date = $_[0];
  5133.  
  5134.   if ($date =~ /\b(\d\d)\/(\d\d)\/(\d\d\d\d)\b/){
  5135.       $year = $3;
  5136.   } else{
  5137.       $year = 0000;
  5138.   }
  5139.   return $year;
  5140. }
  5141.  
  5142. #***********************************************************************************************************
  5143. #***********************************************************************************************************
  5144. #***********************************************************************************************************
  5145. #Function: dateWithContextCheck
  5146. #Argument: text string
  5147. #Returns: none
  5148. #Description: Find date patterns.  Performs context check on text before and after the date patterns
  5149. #to determine if its an actual date.
  5150. sub dateWithContextCheck{
  5151.     #$text = $_[0];    
  5152.     my ($text, $date) = @_;
  5153.     my $year = extractYear($date);
  5154.  
  5155.     #my $year = substr $date, 0, 4;  
  5156.     #**********************************************************************************************
  5157.     # Searches for numerical date formats
  5158.     # Checks if dates should be filtered
  5159.     if ($datefilter =~ /y/) {
  5160.    
  5161.     # Searches for mm/dd or mm/yy
  5162.     while ($text =~ /\b([A-Za-z0-9%\/]+ +)?(\d\d?)([\/\-])(\d\d?)\/?\/?( +[A-Za-z]+)?\b/g) {               
  5163.    
  5164.         $pre = $1;
  5165.         $post = $5;
  5166.         my $first_num = $2;
  5167.         my $divider = $3;
  5168.         my $second_num = $4;
  5169.         my $postdate = $5;
  5170.            
  5171.         my $startI = length($`) + length($pre);
  5172.         my $endI = $startI + length($first_num)+length($divider)+length($second_num);
  5173.         my $key = $startI."-".$endI;
  5174.        
  5175.         my $beginr = substr $text, ($startI - 2), 2;
  5176.         my $endr = substr $text, $endI, 2;
  5177.        
  5178.         #Excludes nn/nn formats when preceded by any in array @prev
  5179.         #@prev = ("cvp", "noc", "peep/ps","%");
  5180.         @prev = ("cvp", "noc", "%", "RR", "PCW");
  5181.         my $dateorno = 1;
  5182.         foreach $j (@prev) {
  5183.         if ((!($pre =~ /\b$j/ig)) && (!($post =~ /\bpersantine\b/ig))) {
  5184.         }
  5185.         else {
  5186.             $dateorno = 0;
  5187.         }
  5188.         }
  5189.  
  5190.         my $context_len = 12; #number of characters we extract before the date
  5191.  
  5192.         if ($dateorno == 1) {
  5193.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%]/) && ($endr !~ /\S\d/)) {
  5194.            
  5195.             # Checks if date identified is valid as mm/dd; then adds the date as PHI
  5196.             if (isValidDate ($first_num, $second_num, -1)) {
  5197.            
  5198.             if ($second_num == 5) {
  5199.                
  5200.                 my $start_pos = $startI - $context_len;
  5201.                 if ($start_pos < 0) {
  5202.                 $start_pos = 0;
  5203.                 }
  5204.                 my $len = $context_len;
  5205.                 if  (length($text) < ($endI + $context_len)){
  5206.                 $len = length($text) - $endI;
  5207.                 }
  5208.                    
  5209.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5210.                 my $textAfter = substr $text, $endI, $len;
  5211.  
  5212.                
  5213.                 if (  (!isProbablyMeasurement($textBefore)) &&   ($textBefore !~ /\bPSV? \Z/i) && ($textBefore !~ /\b(CPAP|PS|range|bipap|pap|pad|rate|unload|ventilation|scale|strength|drop|up|cc|rr|cvp|at|up|in|with|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?(packs|psv|puffs|pts|patients|range|scale|mls|liters|litres|drinks|beers|per|esophagus|tabs|pts|tablets|systolic|sem|strength|times|bottles|drop|drops|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|L|hr|hrs|hour|hours|dose|doses|cultures|blood|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
  5214.  
  5215.                 addType ($key, "Month/Day (1)");
  5216.                 }
  5217.            
  5218.             } elsif ($second_num == 2) {
  5219.            
  5220.                 my $start_pos = $startI - $context_len;
  5221.                 if ($start_pos < 0) {
  5222.                 $start_pos = 0;
  5223.                 }
  5224.                 my $len = $context_len;
  5225.                 if  (length ($text) < ($endI + $context_len)){
  5226.                 $len = length($text) - $endI;
  5227.                 }
  5228.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5229.                 my $textAfter = substr $text, $endI, $len;
  5230.  
  5231.                
  5232.                 if (   (!isProbablyMeasurement($textBefore)) &&   ($textAfter !~ /\A ?hour\b/i) && ($textBefore !~ /\b(with|drop|bipap|pap|range|pad|rate|unload|ventilation|scale|strength|up|cc|rr|cvp|at|up|with|in|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?hr\b/i) && ($textAfter !~ /\A ?(packs|L|psv|puffs|pts|patients|range|scale|dose|doses|cultures|blood|mls|liters|litres|pts|drinks|beers|per|esophagus|tabs|tablets|systolic|sem|strength|bottles|times|drop|cc|up|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
  5233.                
  5234.                
  5235.                 addType ($key, "Month/Day (2)");
  5236.                 }
  5237.             #} elsif (($divider eq "-") && ($startI > 4)) {
  5238.             } elsif (($divider eq "-")) {
  5239.                 my $start_pos = $startI - $context_len;
  5240.                 if ($start_pos < 0) {
  5241.                 $start_pos = 0;
  5242.                 }
  5243.                 my $len = $context_len;
  5244.                 if  (length ($text) < ($endI + $context_len)){
  5245.                 $len = length($text) - $endI;
  5246.                 }
  5247.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5248.                 my $textAfter = substr $text, $endI, $len;
  5249.                                
  5250.                 if (isProbablyDate($textBefore, $textAfter)){
  5251.                
  5252.                 addType ($key, "Month/Day (3)");
  5253.                 }
  5254.             }
  5255.             else {
  5256.  
  5257.                 my $start_pos = $startI - $context_len;
  5258.                 if ($start_pos < 0) {
  5259.                 $start_pos = 0;
  5260.                 }
  5261.                 my $len = $context_len;
  5262.                 if  (length ($text) < ($endI + $context_len)){
  5263.                 $len = length($text) - $endI;
  5264.                 }
  5265.                 my $textAfter = substr $text, $endI, $len;  
  5266.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5267.                            
  5268.                
  5269.                 if (isProbablyDate($textBefore, $textAfter)){  
  5270.                 addType ($key, "Month/Day (4)");
  5271.                 }
  5272.             }
  5273.             }
  5274.            
  5275.             # Checks if date identified is valid as mm/yy; then adds the date as PHI
  5276.                     # Checks for years of length 2, restricted to 1950-2030
  5277.             if (($first_num <= 12) && ($first_num > 0) && ((length $second_num) == 2)
  5278.             && (($second_num>=50) || ($second_num<=30))) {     
  5279.  
  5280.                 #my $textAfter = substr $text, $endI, 9;
  5281.                 my $start_pos = $startI - $context_len;
  5282.                 if ($start_pos < 0) {
  5283.                 $start_pos = 0;
  5284.                 }
  5285.                 my $len = $context_len;
  5286.                 if  (length ($text) < ($endI + $context_len)){
  5287.                 $len = length($text) - $endI;
  5288.                 }
  5289.                 my $textAfter = substr $text, $endI, $len;  
  5290.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5291.                #  print "checking mm/yy text before is $textBefore, text after is $textAfter\n";
  5292.                 if (isProbablyDate($textBefore, $textAfter)){
  5293.                 addType ($key, "Month/Year (2)");  
  5294.                 }
  5295.                } #end if the first num and second num are month/year
  5296.  
  5297.         }
  5298.         }  #end if dateno ==1          
  5299.  
  5300.     } #end while the pattern match
  5301.     } # end if datefilter is on
  5302. }
  5303.  
  5304. #end dateWithContextCheck()
  5305.  
  5306. #***********************************************************************************************************
  5307. #***********************************************************************************************************
  5308. #***********************************************************************************************************
  5309. #Function: yearWithContextCheck
  5310. #Argument: text string
  5311. #Returns: none
  5312. #Description: Find date patterns.  Performs context check on text before and after the year patterns
  5313. #to determine if it is a year.
  5314. sub yearWithContextCheck {
  5315.     #$text = $_[0];    
  5316.     my ($text, $date) = @_;
  5317.     #my $year = substr $date, 0, 4;  
  5318.     my $year = extractYear($date);
  5319.  
  5320.  
  5321.     # Checks for 2-digit year written as '01, &c, only when preceded by the following medical terms
  5322.     while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTC|PTCA|stent|since|surgery|year) + *(\')?)(\d\d)\b/ig) {
  5323.         my $num = $1;
  5324.         #print "2-digit date, 1 is $1, 2 is $2, 3 is $3, 4 is $4, 5 is $5\n";
  5325.         #my $key = (1 + (length($`))+length($1))."-".(pos $text);
  5326.         my $startd =  (length($`))+length($1);
  5327.         my $endd = $startd + length($4);
  5328.         my $key =  $startd."-".$endd;
  5329.  
  5330.         #check if the match is part of number followed by decimal point and a number
  5331.         my $textAfter = substr $text, $endd, 2;
  5332.        
  5333.         if ($textAfter !~ /\.\d/){
  5334.         addType($key, "Year (2 digits)");
  5335.        
  5336.         }
  5337.      }
  5338.        
  5339.  
  5340.      while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|surgery|year)(\'+))(\d\d)\b/ig) {
  5341.         my $num = $1;
  5342.  
  5343.         my $startd =  (length($`))+length($1);
  5344.         my $endd = $startd + length($4);
  5345.         my $key =  $startd."-".$endd;
  5346.  
  5347.         #check if the match is part of number followed by decimal point and a number
  5348.         #or if it is a time (HH:MM)
  5349.         my $textAfter = substr $text, $endd, 2;
  5350.        
  5351.         if ($textAfter !~ /(\.|\:)\d/){
  5352.         addType($key, "Year (2 digits)");
  5353.         }
  5354.      }
  5355.  
  5356.      # Checks for 4-digit year written as 2001, &c, only when preceded by the following medical terms
  5357.      while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)(\d{4})((\,? )\d{4})?\b/ig) {
  5358.         my $num1 = $1;
  5359.         $s1 = length($`) + length($1);
  5360.         $e1 = $s1+length($3);
  5361.         $s2 = $e1+length($5);
  5362.         $e2 = $e1+length($4);
  5363.         $k1 = "$s1-$e1";
  5364.         $k2 = "$s2-$e2";
  5365.  
  5366.        
  5367.         #for 4-digit year, check if the matched number is in the range of [$VALID_YEAR_LOW,$VALID_YEAR_HIGH]
  5368.         #if ($3 <= 2030 && $3 >= 1950){
  5369.         if ($3 <= $VALID_YEAR_HIGH && $3 >= $VALID_YEAR_LOW){
  5370.             addType($k1, "Year (4 digits)");
  5371.             addType($k2, "Year (4 digits)");
  5372.             }
  5373.        }
  5374.        
  5375.       # Looks for year only (esp Patient Medical History): looks for year numbers within the 30 years before
  5376.       # and 2 years after the date passed in as an argument.
  5377.        
  5378.       # for $n (($year - 30) .. $year) {
  5379.       for $n (($year - 30) .. ($year+2)) {
  5380.         my $short = substr $n, 2, 2;
  5381.         if ($n =~ /\d\d\d\d/) {
  5382.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)$n\b/ig) {
  5383.             my $key = (length($`)+length($1))."-".(pos $text);
  5384.             addType ($key, "Year (4 digits)");
  5385.         }}
  5386.         if ($short =~ /\d\d/) {
  5387.         #while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year) + *(\'?))$short\b/ig) {
  5388.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year) + *(\'?))$short\b/ig) {
  5389.             my $key = (length($`)+length($1))."-".(pos $text);
  5390.             addType ($key, "Year (2 digits)");
  5391.         }}
  5392.         if ($short =~ /\d\d/) {
  5393.     #   while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year)(\'+))$short\b/ig) {
  5394.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year)(\'+))$short\b/ig) {
  5395.             my $key = (length($`)+length($1))."-".(pos $text);
  5396.             addType ($key, "Year (2 digits)");
  5397.         }}
  5398.      }
  5399.  
  5400. }
  5401.  
  5402. #end yearWithContextCheck()
  5403.  
  5404.  
  5405.  
  5406.  
  5407. #***********************************************************************************************************
  5408. #***********************************************************************************************************
  5409. #***********************************************************************************************************
  5410. # End of De-Identification Algorithm
  5411. #***********************************************************************************************************
  5412. #***********************************************************************************************************
  5413. #***********************************************************************************************************
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement