Advertisement
Guest User

Untitled

a guest
Jan 9th, 2020
165
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 187.37 KB | None | 0 0
  1.  #!/usr/bin/perl
  2.  
  3. #**************************************************************************************************************
  4. #file: deid.pl   Original author: M. Douglass 2004 
  5. #Last revised:    Apr 2009  DeID 1.1
  6. #                
  7. #
  8. #_______________________________________________________________________________
  9. #
  10. #deid.pl: De-identification algorithm -- scrubs PHI from free-text medical records
  11. #(e.g. Discharge Summaries and Nursing Notes)
  12. #
  13. #Copyright (C) 2004-2007 Margaret Douglas and  Ishna Neamatullah
  14. #
  15. #This code is free software; you can redistribute it and/or modify it under
  16. #the terms of the GNU Library General Public License as published by the Free
  17. #Software Foundation; either version 2 of the License, or (at your option) any
  18. #later version.
  19. #
  20. #This library is distributed in the hope that it will be useful, but WITHOUT ANY
  21. #WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  22. #PARTICULAR PURPOSE.  See the GNU Library General Public License for more
  23. #details.
  24. #
  25. #You should have received a copy of the GNU Library General Public License along
  26. #with this code; if not, write to the Free Software Foundation, Inc., 59
  27. #Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  28. #
  29. #You may contact the author by e-mail or postal mail
  30. #(MIT Room E25-505, Cambridge, MA 02139 USA).  For updates to this software,
  31. #please visit PhysioNet (http://www.physionet.org/).
  32. #_______________________________________________________________________________
  33. #
  34. # De-identification Algorithm: Scrubs PHI from free-text medical records
  35. #(e.g. Discharge Summaries and Nursing Notes)
  36. # Original version written by:
  37. #   Margaret Douglass (douglass AT alum DOT mit DOT edu)
  38. #   William Long (wjl AT mit DOT edu)
  39. # Modified by:
  40. #   Ishna Neamatullah (ishna AT alum DOT mit DOT edu) in Sept 5, 2006
  41. # Last modified by:
  42. #   Li-wei Lehman (lilehman AT alum DOT mit DOT edu) in April, 2009
  43. #
  44. # Command to run the software:
  45. # perl deid.pl <filename> <config_filename>
  46. #
  47. # Input arguments:
  48. # filename (without extension, where extension must be .text): file to be de-identified
  49. # config_filename: configuration file
  50. #
  51. # Required library file: stat.pm
  52. #
  53. # Output files:
  54. # filename.res: de-identified file
  55. # filename.phi: file containing all PHI locations, used in calculating performance statistics
  56. # filename.info: file containing information useful for debugging
  57. # code performance statistics printed on screen if Gold Standard available for nursing notes (details in README)
  58. #**************************************************************************************************************
  59.  
  60. #use Stat;
  61.  
  62. # Declaring some variables for algorithm run configuration
  63.  
  64. # Variables to switch on/off filter functions
  65. $allfilters = "";
  66. $ssnfilter = "";
  67. $urlfilter = "";
  68. $emailfilter = "";
  69. $telfilter = "";
  70. $unitfilter = "";
  71. $agefilter = "";
  72. $locfilter = "";
  73. $datefilter = "";
  74. $namefilter = "";
  75. $us_state_filter = "";
  76. $ds_specific_filter = ""; #filter for discharge summmary specific patterns
  77. $gs_specific_filter = ""; #filter for gold std specific patterns
  78.  
  79. my $offset; # positive date shift in number of days
  80. my $comparison = ""; # 1=comparison with gold standard, 0=no comparison with gold standard
  81.  
  82. # Variables to switch on/off dictionaries/lists
  83. $alllists = "";
  84. $pid_patientname_list = "";
  85. $pid_dateshift_list = "";
  86. $country_list = "";
  87. $company_list = "";
  88. $ethnicity_list = "";
  89. $hospital_list = "";
  90. $doctor_list = "";
  91. $location_list = "";
  92. $local_list = "";
  93. $state_list = "";
  94.  
  95. use Time::Local;
  96. my ($mday,$mon,$year) = (localtime(time))[3,4,5];
  97. my $shortyear = substr($year,1,4);
  98. my $longyear = '20'.$shortyear;
  99.  
  100. # Nursing note date as retrieved from the header
  101. my $nn_year;
  102.  
  103. # Sets whether a de-identified version should be output: 0 = no new version of text output, 1 = fully de-identified version of text output
  104. # Note: Generally keep output_deid_text = 1
  105. $output_deid_text = 1;
  106.  
  107. #Default date used to de-identify the Gold Std if no default date is specified in the
  108. #config file.  You can change the default date by setting the "Default date" variable to
  109. #some other dates (in MM/DD/YYYY format).
  110. $DEFAULT_DATE = "01/01/2000";
  111.  
  112. #The "Two Digit Year Threshold" is used to determine whether
  113. #to interpret the year as a year in the 1900's or 2000's.
  114. #Must be a 1- or 2-digit number.
  115. #Two digit years > Threshold are  interepreted as in the 1900's
  116. #Two digit years <=  Threshold are interpreted as in the 2000's
  117. $TWO_DIGIT_YEAR_THRESHOLD = 30;#change this default by setting "Two Digit Year Threshold" in config file.
  118.  
  119. # "Valid Year Low" and "Valid Year High" (must be 4-digit numbers) are
  120. # used in certain date pattern checking routines to determine if a
  121. # four digit number that appear in a potential
  122. # date pattern is a year or not -- it is a valid year if it is
  123. # in the range of [Valid Year Low, Valid Year High].  
  124. $VALID_YEAR_LOW = 1900;
  125. $VALID_YEAR_HIGH = 2030;
  126.  
  127.  
  128.  
  129. my @known_phi;
  130. my @known_first_name;
  131. my @known_last_name;
  132.  
  133. # Hash that stores PHI information from de-identification
  134. # KEY = (patient number), VALUE = array with each element = (%HASH with KEY = start-end, VALUE = array of types of PHI for the note number/index of the array)
  135. %all_phi;
  136. my %ID;
  137.  
  138. # Declares some global variables
  139. %lhash = ();
  140. @typename = ();
  141. $ntype = 1;
  142. @extend = ('phrases');
  143.  
  144.  
  145. ##########################################################################################
  146. # Sets paths to lists and dictionaries in working directory that will be used in this algorithm
  147. $date_shift_file = "shift.txt"; # contains mapping of PID to date offset
  148. $countries_file = "lists/countries_unambig.txt";
  149. $ethnicities_unambig_file = "lists/ethnicities_unambig.txt";
  150. $companies_file = "lists/company_names_unambig.txt";
  151. $companies_ambig_file = "lists/company_names_ambig.txt";
  152. $common_words_file = "dict/common_words.txt";
  153. $medical_words_file = "dict/sno_edited.txt";
  154. $very_common_words_file = "dict/commonest_words.txt";
  155. $female_unambig_file = "lists/female_names_unambig.txt";
  156. $female_ambig_file = "lists/female_names_ambig.txt";
  157. $female_popular_file = "lists/female_names_popular.txt";
  158. $male_unambig_file = "lists/male_names_unambig.txt";
  159. $male_ambig_file = "lists/male_names_ambig.txt";
  160. $male_popular_file = "lists/male_names_popular.txt";
  161. $last_unambig_file = "lists/last_names_unambig.txt";
  162. $last_ambig_file = "lists/last_names_ambig.txt";
  163. $last_popular_file = "lists/last_names_popular.txt";
  164. #$last_name_prefixes_file = "lists/last_name_prefixes.txt";
  165. $doctor_first_unambig_file = "lists/doctor_first_names.txt";
  166. $doctor_last_unambig_file = "lists/doctor_last_names.txt";
  167. $prefixes_unambig_file = "lists/prefixes_unambig.txt";
  168. $locations_unambig_file = "lists/locations_unambig.txt";
  169. $locations_ambig_file = "lists/locations_ambig.txt";
  170. $local_places_ambig_file = "lists/local_places_ambig.txt";
  171. $local_places_unambig_file = "lists/local_places_unambig.txt";
  172. $hospital_file = "lists/stripped_hospitals.txt";
  173. $last_name_prefix_file = "lists/last_name_prefixes.txt";
  174. $patient_file = "lists/pid_patientname.txt"; # contains mapping of PID to patient name
  175. $us_states_file = "lists/us_states.txt";
  176. $us_states_abbre_file = "lists/us_states_abbre.txt";
  177. $more_us_states_abbre_file = "lists/more_us_state_abbreviations.txt";
  178. $us_area_code_file = "lists/us_area_code.txt";
  179. $medical_phrases_file = "dict/medical_phrases.txt";
  180. $unambig_common_words_file = "dict/notes_common.txt";
  181.  
  182. ############################################################################################################
  183. # Declares some arrays of context words that can be used to identify PHI
  184. # Days of the month
  185. @days = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday");
  186.  
  187. # Titles that precede last names (ignore .'s)
  188. @titles = ("MISTER", "DOCTOR", "DOCTORS", "MISS", "PROF", "PROFESSOR", "REV", "RABBI", "NURSE", "MD", "PRINCESS", "PRINCE", "DEACON", "DEACONESS", "CAREGIVER", "PRACTITIONER", "MR", "MS");
  189.  
  190. @strict_titles = ("Dr", "DRS", "Mrs");  #treat words after these strict_titles as PHI
  191.  
  192. %titles = ();
  193. foreach $title (@titles){
  194.     $titles{$title} = 1;
  195. }
  196.  
  197. # Name indicators that precede or follow names
  198. @name_indicators = ("problem","problem:", "proxy", "daughter","daughters", "dtr", "son", "brother","sister", "mother", "mom", "father", "dad", "wife", "husband", "neice", "nephew", "spouse", "partner", "cousin", "aunt", "uncle", "granddaughter", "grandson", "grandmother", "grandmom", "grandfather", "granddad", "relative", "friend", "neighbor", "visitor", "family member", "lawyer", "priest", "rabbi", "coworker", "co-worker", "boyfriend", "girlfriend", "name is", "named", "rrt", "significant other", "jr", "caregiver", "proxys", "friends", "sons", "brothers", "sisters", "sister-in-law", "brother-in-law", "mother-in-law", "father-in-law", "son-in-law", "daughter-in-law", "dtr-in-law", "surname will be", "name will be", "name at discharge will be", "name at discharge is");
  199.  
  200.  
  201. # Phrases that precede locations
  202. @location_indicators = ("lives in", "resident of", "lived in", "lives at", "comes from", "called from", "visited from", "arrived from", "returned to");
  203.  
  204. @employment_indicators_pre = ("employee of", "employed by", "employed at", "CEO of", "manager at", "manager for", "manager of", "works at", "business");
  205.  
  206.  
  207. # Hospital indicators that follow hospital names
  208. @hospital_indicators = ("Hospital", "General Hospital", "Gen Hospital", "gen hosp", "hosp", "Medical Center", "Med Center", "Med Ctr", "Rehab", "Clinic", "Rehabilitation", "Campus", "health center", "cancer center", "development center", "community health center", "health and rehabilitation", "Medical", "transferred to", "transferred from", "transfered to", "transfered from");
  209.  
  210. # Location indicators that follow locations
  211. @loc_indicators_suff = ("city", "town", "beach", "valley","county", "harbor", "ville", "creek", "springs", "mountain", "island", "lake", "lakes", "shore", "garden", "haven", "village", "grove", "hills", "hill", "shire", "cove", "coast", "alley", "street", "terrace", "boulevard", "parkway", "highway", "university", "college", "tower");
  212.  
  213. # Location indicators that are most likely preceded by locations
  214. @loc_ind_suff_c = ("town", "ville", "harbor", "tower");
  215.  
  216. # Location indicators that precede locations
  217. #@loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "great","east","west","north","south");
  218. @loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "east","west","north","south");
  219.  
  220.  
  221. @apt_indicators = ("apt", "suite"); #only check these after the street address is found
  222. @street_add_suff = ("park", "drive", "street", "road", "lane", "boulevard", "blvd", "avenue", "highway", "circle","ave", "place", "rd", "st");
  223.  
  224. #Strict street address suffix: case-sensitive match on the following,
  225. #and will be marked as PHI regardless of ambiguity (common words)
  226. @strict_street_add_suff = ("Park", "Drive", "Street", "Road", "Lane", "Boulevard", "Blvd", "Avenue", "Highway","Ave",,"Rd", "PARK", "DRIVE", "STREET", "ROAD", "LANE", "BOULEVARD", "BLVD", "AVENUE", "HIGHWAY","AVE", "RD");
  227.  
  228. # Age indicators that follow ages
  229. @age_indicators_suff = ("year old", "y\. o\.", "y\.o\.", "yo", "y", "years old", "year-old", "-year-old", "years-old", "-years-old", "years of age", "yrs of age");
  230.  
  231. # Age indicators that precede ages
  232. @age_indicators_pre = ("age", "he is", "she is", "patient is");
  233.  
  234. # Digits, used in identifying ages
  235. @digits = ("one","two","three","four","five","six","seven","eight","nine", "");
  236.  
  237. # Different variations of the 12 months
  238. @months = ("January", "Jan", "February", "Feb", "March", "Mar", "April", "Apr", "May", "June", "Jun", "July", "Jul", "August", "Aug", "September", "Sept", "Sep", "October", "Oct", "November", "Nov", "December", "Dec");
  239.  
  240. ######################################################################################
  241. # If the correct number of input argument is provided, sets the input and output filenames.
  242. if ($#ARGV == 1) {
  243.     $data_file = "train_data/" + "$ARGV[0].text";      # data_file: input file
  244.     $output_file = "train_data/" +"$ARGV[0].phi";     # output_file: file containing PHI locations
  245.     $debug_file = "train_data/" +"$ARGV[0].info";     # debug_file: file used for debugging, contains PHI and non-PHI locations
  246.     $deid_text_file = "train_data/" +"$ARGV[0].res";  # deid_text_file: de-identified text file
  247.     $gs_file = "train_data/" +"$ARGV[0].deid";        # gs_file: Gold Standard of the input file
  248.  
  249.     if not -e $output_file
  250.  
  251.  
  252.     print "\n*******************************************************************************************************************\n";
  253.     print "De-Identification Algorithm: Identifies Protected Health Information (PHI) in Discharge Summaries and Nursing Notes";
  254.     print "\n*******************************************************************************************************************\n";
  255.  
  256.  
  257.    
  258.     $config_file = $ARGV[1];
  259.     open CF, $config_file or die "Cannot open $config_file";
  260.     while ($cfline = <CF>) {
  261.     chomp $cfline;
  262.     if ($cfline =~ /\A[\#]+/){
  263.        next;
  264.     }
  265.     if ($cfline =~ /\bGold\s+standard\s+comparison\s*\=\s*([0-9])/ig) {
  266.         $comparison = ($1);
  267.     }
  268.     #Date default expects MM/DD/YYYY
  269.     if ($cfline =~ /\bDate\s+default\s*\=\s*(\d\d)\/(\d\d)\/(\d\d\d\d)/ig) {
  270.        
  271.         my $mm = $1; $dd = $2; $yyyy = $3;
  272.         $DEFAULT_DATE = "$mm/$dd/$yyyy";
  273.         #print "Default date is $DEFAULT_DATE\n";
  274.     }
  275.  
  276.     #The "Two Digit Year Threshold" is used to determine whether
  277.         #to interpret the year as a year in the 1900's or 2000's
  278.     if ($cfline =~ /\bTwo\s+Digit\s+Year\s+Threshold\s*=\s*(\d{1,2})/ig) {
  279.         $TWO_DIGIT_YEAR_THRESHOLD = "$1";
  280.         #print "Two Digit Year Threshold is $TWO_DIGIT_YEAR_THRESHOLD\n";
  281.     }
  282.  
  283.     if ($cfline =~ /\bDate\s+offset\s*\=\s*([0-9]+)/ig) {
  284.         $offset = ($1);
  285.         #print "Date offset is $1\n";
  286.     }
  287.     if ($cfline =~ /\bSSN\s+filter\s*\=\s*([a-z])/ig) {
  288.         $ssnfilter = ($1);     
  289.     }
  290.     if ($cfline =~ /\bURL\s+filter\s*\=\s*([a-z])/ig) {
  291.         $urlfilter = ($1);
  292.     }
  293.     if ($cfline =~ /\bEmail\s+filter\s*\=\s*([a-z])/ig) {
  294.         $emailfilter = ($1);
  295.     }
  296.     if ($cfline =~ /\bTelephone\s+filter\s*\=\s*([a-z])/ig) {
  297.         $telfilter = ($1);
  298.     }
  299.     if ($cfline =~ /\bUnit\s+number\s+filter\s*\=\s*([a-z])/ig) {
  300.         $unitfilter = ($1);
  301.     }
  302.     if ($cfline =~ /\bAge\s+filter\s*\=\s*([a-z])/ig) {
  303.         $agefilter = ($1);
  304.     }
  305.     if ($cfline =~ /\bLocation\s+filter\s*\=\s*([a-z])/ig) {
  306.         $locfilter = ($1);
  307.     }
  308.     if ($cfline =~ /\bDate\s+filter\s*\=\s*([a-z])/ig) {
  309.         $datefilter = ($1);
  310.     }
  311.     if ($cfline =~ /\bName\s+filter\s*\=\s*([a-z])/ig) {
  312.         $namefilter = ($1);
  313.     }
  314.  
  315.     if ($cfline =~ /\bState\s+filter\s*\=\s*([a-z])/ig) {
  316.         $us_state_filter = ($1);
  317.  
  318.     }
  319.  
  320.     if ($cfline =~ /\bDS\s+filter\s*\=\s*([a-z])/ig) {
  321.         $ds_specific_filter = ($1);
  322.  
  323.     }
  324.  
  325.     if ($cfline =~ /\bGS\s+filter\s*\=\s*([a-z])/ig) {
  326.         $gs_specific_filter = ($1);
  327.  
  328.     }
  329.  
  330.     ######################################################
  331.     #get the config info for dictionaries loading
  332.     if ($cfline =~ /\bPID\s+to\s+patient\s+name\s+mapping\s*\=\s*([a-z])/ig) {
  333.         $pid_patientname_list = ($1);
  334.  
  335.     }  
  336.     if ($cfline =~ /\bPID\s+to\s+date\s+offset\s+mapping\s*\=\s*([a-z])/ig) {
  337.         $pid_dateshift_list = ($1);
  338.     }
  339.     if ($cfline =~ /\bCountry\s+names\s*\=\s*([a-z])/ig) {
  340.         $country_list = ($1);
  341.     }
  342.     if ($cfline =~ /\bCompany\s+names\s*\=\s*([a-z])/ig) {
  343.         $company_list = ($1);
  344.     }
  345.     if ($cfline =~ /\bEthnicities\s*\=\s*([a-z])/ig) {
  346.         $ethnicity_list = ($1);
  347.     }
  348.     if ($cfline =~ /\bHospital\s+names\s*\=\s*([a-z])/ig) {
  349.         $hospital_list = ($1);
  350.     }
  351.     if ($cfline =~ /\bLocation\s+names\s*\=\s*([a-z])/ig) {
  352.         $location_list = ($1);
  353.  
  354.     }  
  355.     if ($cfline =~ /\bDoctor\s+names\s*\=\s*([a-z])/ig) {
  356.         $doctor_list = ($1);
  357.  
  358.     }
  359.  
  360.     if ($cfline =~ /\bLocalPlaces\s+names\s*\=\s*([a-z])/ig) {
  361.         $local_list = ($1);
  362.     }
  363.  
  364.     if ($cfline =~ /\bState\s+names\s*\=\s*([a-z])/ig) {
  365.         $state_list = ($1);
  366.     }
  367.  
  368.     }
  369.  
  370. }
  371.  
  372. # Prints an error message on the screen if number of arguments is incorrect
  373. else {
  374.     print "\n===========================================================================================";
  375.     print "\nError: Wrong number of arguments entered";
  376.     print "\nThe algorithm takes 2 arguments:";
  377.     print "\n  1. filename (the filename of medical notes, without extension, where extension must be .text)";
  378.     print "\n  2. config_filename (the configuration filename)";
  379.     print "\nExample (for Gold Standard Comparison): perl deid.pl id deid.config";
  380.     print "\nExample (for output mode using Gold Standard): perl deid.pl id deid-output.config";
  381.     print "\nFor further documentation, please consult the README.txt file";
  382.     print "\n===========================================================================================\n";
  383.     exit;
  384.  
  385. }
  386.  
  387. # After setting file names and configuring the run, indicates that de-identification has commenced
  388. print "\n\nStarting de-identification (version 1.1) ...\n\n";
  389.  
  390.  
  391. #check if we can open the .phi file
  392. open F, ">$output_file" or die "Cannot open $output_file";
  393. close F;
  394.  
  395. # Calls setup to create some lookup lists in memory
  396. setup();
  397.  
  398.  
  399. if ($comparison==1) {
  400.     print "Running deid in performance comparison mode.\n";    
  401.     print "Using PHI locations in $gs_file as comparison. Output files will be:\n";
  402.     print "$output_file: the PHI locations found by the code.\n";
  403.     print "$debug_file: debug info about the PHI locations.\n";
  404.     #check if the gold std file exists
  405.     open GS, $gs_file or die "Cannot open $gs_file. Make sure that the gold standard file exists!\n";   # GS = Gold Standard file
  406.     close GS;
  407. }
  408. else {
  409.     #check if we can open the .res file
  410.     open F, ">$deid_text_file" or die "Cannot open $deid_text_file";
  411.     close F;
  412.     print "Running deid in output mode. Output files will be: \n";
  413.     print "$output_file: the PHI locations found by the code.\n";
  414.     print "$deid_text_file: the scrubbed text.\n";
  415.     print "$debug_file: debug info about the PHI locations.\n";
  416.      
  417. }
  418.  
  419. deid();
  420.  
  421. # Calls function stat() to calculate code performance statistics, if comparison mode = 1
  422. if ($comparison==1) {
  423.     require "stat.pm";
  424.     &stat($gs_file, $output_file);
  425.  
  426. }
  427.  
  428. # End of top level of code
  429. #***********************************************************************************************************
  430. #***********************************************************************************************************
  431. #***********************************************************************************************************
  432. sub numerically {
  433.               #print "a is $a  b is $b\n";
  434.               $a <=> $b;
  435.  }
  436. #***********************************************************************************************************
  437. #***********************************************************************************************************
  438. #***********************************************************************************************************
  439. # Reads in a file and pushes each line onto an array. Returns the array.
  440.  
  441. sub preload {
  442.     my ($file) = @_;
  443.     my @res = ();
  444.     open FILE, $file or die "Cannot open file $file";
  445.     while ($line = <FILE>) {
  446.     chomp $line;
  447.     push(@res, uc($line));
  448.     }
  449.     close FILE;
  450.     return @res;
  451. }
  452. # End of preload()
  453. #***********************************************************************************************************
  454. #***********************************************************************************************************
  455. #***********************************************************************************************************
  456. # Reads in a file and pushes each line onto an array. Returns the array.
  457.  
  458. sub preload_uc {
  459.     my ($file) = @_;
  460.     my @res = ();
  461.     open FILE, $file or die "Cannot open file $file";
  462.     while ($line = <FILE>) {
  463.     chomp $line;
  464.     push(@res, uc($line));
  465.     }
  466.     close FILE;
  467.     return @res;
  468. }
  469. # End of preload_uc()
  470. #***********************************************************************************************************
  471. #***********************************************************************************************************
  472. #***********************************************************************************************************
  473. # Reads in a file and creates a dictionary that records each line in that file under the given association, by mapping the line to a '1' in the dictionary of the association
  474.  
  475. sub preload_assoc {
  476.     my ($file,$assoc) = @_;
  477.     open FILE, $file or die "Cannot open file $file";
  478.  
  479.     while ($line = <FILE>) {
  480.     chomp $line;
  481.     $$assoc{uc($line)}=1;
  482.     }
  483.     close FILE;
  484. }
  485. # End of preload_assoc()
  486. #***********************************************************************************************************
  487. #***********************************************************************************************************
  488. #***********************************************************************************************************
  489. # Reads in a file and calls setup_hash on each line
  490.  
  491. sub setup_hash {
  492.     my ($file, $tname) = @_;
  493.     my @entry;
  494.     $typename[$ntype]= $tname;
  495.     open FILE, $file or die "Cannot open file $file";
  496.     while ($line = <FILE>) {
  497.     chomp $line;
  498.     &setup_item($ntype,$line);
  499.     }
  500.     $ntype++;
  501.     close FILE;
  502. }
  503. # End of setup_hash()
  504. #***********************************************************************************************************
  505. #***********************************************************************************************************
  506. #***********************************************************************************************************
  507. # Reads in a file and calls setup_hash on each line
  508. sub setup_lst_hash {
  509.     my ($tname, @hlst) = @_;
  510.     $typename[$ntype]= $tname;
  511.     foreach $line (@hlst) {
  512.     &setup_item($ntype,$line);
  513.     }
  514.     $ntype++;
  515. }
  516. # End of setup_lst_hash()
  517. #***********************************************************************************************************
  518. #***********************************************************************************************************
  519. #***********************************************************************************************************
  520. sub setup_item {
  521.     my ($type,$line) = @_;
  522.     my ($head, @lst) = split (/([^a-zA-Z0-9_\']+)/,uc($line));
  523.     my $ix = $type;
  524.  
  525.     if(@lst){
  526.  
  527.     push @extend, [@lst];
  528.     $ix = "$type,$#extend";
  529.     }
  530.     my $entry = $lhash{$head};
  531.     if ($entry){
  532.     $lhash{$head} .= "|" . $ix;
  533.     }
  534.     else{$lhash{$head}=$ix;
  535.      }
  536. }
  537.  
  538. # End of setup_item()
  539. #***********************************************************************************************************
  540. #***********************************************************************************************************
  541. #***********************************************************************************************************
  542. sub typename {
  543.     my ($num) = @_;
  544.     return($typename[$num]);
  545. }
  546. # End of typename()
  547. #***********************************************************************************************************
  548. #***********************************************************************************************************
  549. #***********************************************************************************************************
  550. # Takes in an array of words and compares them with hashes of known PHI. Recognizes and adds PHI using addType().
  551.  
  552. sub lookhash {
  553.     my @txtlst = @_;
  554.     my $pos = 0;
  555.     my $npos = 0;
  556.     my $tl,$ty;
  557.     my $txt = 1; # item is text not separator
  558.     my $item;
  559.  
  560.     while (@txtlst) {
  561.     $item = shift(@txtlst);
  562.  
  563.     $npos = $pos + length($item);
  564.     if ($item =~ /([a-zA-Z\']+)/ig) {
  565.         if($item){
  566.         $item = uc($item);
  567.         $tl = $lhash{$item};
  568.  
  569.         #if the term ends with 's, remove it and see if
  570.                 #there is a match in PHI hash
  571.         #if (!($tl) && $item =~/([a-zA-Z\'\-]+)\'s$/ig) {
  572.         if (!($tl) && $item =~/([a-zA-Z\']+)\'s$/ig) {
  573.             $tl = $lhash {$1};
  574.         }
  575.  
  576.         if($tl){
  577.  
  578.             # Compares with known PHI by calling bestmatch()
  579.             ($xpos, $done, @types)=&bestmatch($tl,@txtlst);
  580.             splice(@txtlst,0,$#txtlst-$done);
  581.             $npos += $xpos;
  582.  
  583.             foreach $typ (@types){
  584.             #print "item $item, adding type $type for position $pos-$npos\n";
  585.             #print "type is $typ , key is $pos - $npos  \n";
  586.             addType("$pos-$npos",$typ);    
  587.             #print "positions are $pos-$npos, typ is $typ\n";
  588.             }
  589.         }
  590.         }  #end if $item
  591.         $txt = 0;
  592.     }  #end if $txt
  593.     else {
  594.         $txt = 1;
  595.     }
  596.     $pos = $npos;
  597.    }
  598. }
  599. # End of lookhash()
  600. #***********************************************************************************************************
  601. #***********************************************************************************************************
  602. #***********************************************************************************************************
  603. sub bestmatch {
  604.    my ($tl,@txtlst) = @_;
  605.    my $pos = 0;
  606.    my $bestpos = 0;
  607.    my $bestrest = $#txtlst;
  608.    my @besttyp = ();
  609.    my ($type, $rest, $xpos, @nlst);
  610.  
  611.    foreach $ck (split '\|',$tl) {
  612.     ($type, $rest) = split ',',$ck;
  613.  
  614.     if($rest) {
  615.         ($xpos , @nlst) = &matchrest($rest,@txtlst);
  616.         if ($xpos) {
  617.         if($xpos > $bestpos) {
  618.  
  619.             $bestrest = $#nlst; $bestpos = $xpos;
  620.             @besttyp = ($typename[$type]);
  621.         }
  622.         elsif($xpos == $bestpos) {
  623.             push @besttyp,$typename[$type];
  624.         }
  625.         }
  626.     }
  627.     elsif ($bestpos == 0) {
  628.         push @besttyp,$typename[$type];
  629.     }
  630.    }
  631.    return ($bestpos, $bestrest, @besttyp);
  632. }
  633. # End of bestmatch()
  634. #***********************************************************************************************************
  635. #***********************************************************************************************************
  636. #***********************************************************************************************************
  637. sub matchrest {
  638.    my ($rest, @txtlst) = @_;
  639.    my @mlst = @{$extend[$rest]};
  640.    my $item;
  641.    my $pos = 0;
  642.    my $pm = 0;
  643.  
  644.    my $tmppos = 0;
  645.  
  646.    if ($mlst[0] eq '|') {
  647.     $pm = 1; shift(@mlst);
  648.    }
  649.    foreach $i (@mlst) {
  650.     if ($i !~ /([a-zA-Z\'\-]+)$/ig){
  651.         next;
  652.     }
  653.  
  654.     $item = shift(@txtlst);
  655.     $pos += length($item);
  656.  
  657.  
  658.     while ( $#txtlst >= 0 && ($item !~ /([a-zA-Z\'\-]+)/ig)){
  659.         $item = shift(@txtlst);
  660.         $pos += length ($item);
  661.         #print "item is $item, len is $#txtlst";
  662.     }
  663.  
  664.     if ($pm ? (uc($item) !~ /$i/) : ($i ne uc($item))) {
  665.         #print "pm is $pm, returning zero\n";
  666.         return 0;
  667.     }
  668.    }
  669.    return ($pos, @txtlst);
  670. }
  671. # End of matchrest()
  672. #***********************************************************************************************************
  673. #***********************************************************************************************************
  674. #***********************************************************************************************************
  675. # Function: setup()
  676. # Arguments: None
  677. # Returns: None
  678. # Called by: Topmost level of code
  679. # Description: Creates some lookup lists to have in memory
  680. sub setup {
  681.    
  682.    # This part is not necessary
  683.    open LP, $last_name_prefix_file or die "Cannot open $last_name_prefix_file";
  684.    while ($line = <LP>) {
  685.     chomp $line;
  686.     $prefixes{uc($line)} = 1;
  687.    }
  688.    close LP;
  689.  
  690.    #Added to reduce false positives
  691.    &setup_hash($medical_phrases_file,"MedicalPhrase");
  692.  
  693.    # Sets up hashes of some PHI lists for direct identification
  694.    if ($namefilter =~ /y/) {
  695.     &setup_hash($female_unambig_file,"Female First Name (un)");
  696.     &setup_hash($female_ambig_file,"Female First Name (ambig)");
  697.     &setup_hash($male_unambig_file,"Male First Name (un)");
  698.     &setup_hash($male_ambig_file,"Male First Name (ambig)");
  699.     &setup_hash($last_unambig_file,"Last Name (un)");
  700.     &setup_hash($last_popular_file,"Last Name (popular/ambig)");
  701.     &setup_hash($last_ambig_file,"Last Name (ambig)");
  702.     &setup_hash($female_popular_file, "Female First Name (popular/ambig)");
  703.     &setup_hash($male_popular_file, "Male First Name (popular/ambig)");
  704.    
  705.     if ($doctor_list =~ /y/) {
  706.         &setup_hash($doctor_first_unambig_file, "Doctor First Name");
  707.         &setup_hash($doctor_last_unambig_file, "Doctor Last Name");
  708.     }
  709.    
  710.    }
  711.  
  712.    if ($locfilter =~ /y/) {
  713.     if ($location_list =~ /y/) {
  714.         &setup_hash($locations_ambig_file,"Location (ambig)");
  715.         &setup_hash($locations_unambig_file,"Location (un)");
  716.      
  717.     } else {
  718.         @loc_unambig = ();
  719.         @more_loc_unambig = ();
  720.         @loc_ambig = ();
  721.     }
  722.  
  723.  
  724.     if ($hospital_list =~ /y/) {
  725.         &setup_hash($hospital_file,"Hospital");
  726.     }
  727.     if ($ethnicity_list =~ /y/) {      
  728.         &setup_hash($ethnicities_unambig_file, "Ethnicity");
  729.     }
  730.     if ($company_list =~ /y/) {
  731.         &setup_hash($companies_file, "Company");
  732.         &setup_hash($companies_ambig_file, "Company (ambig)");
  733.    
  734.     }
  735.     if ($country_list =~ /y/) {
  736.         &setup_hash($countries_file, "Country");
  737.     }
  738.  
  739.     if ($local_list =~ /y/){
  740.         &setup_hash($local_places_unambig_file, "Location (un)");
  741.         &setup_hash($local_places_ambig_file, "Location (ambig)");
  742.    
  743.     }
  744.    }
  745.  
  746.    # Preloads PHI in some lists into corresponding arrays    
  747.    @female_popular = &preload($female_popular_file);
  748.    @male_popular = &preload($male_popular_file);
  749.    #@last_name_prefixes = &preload_uc($last_name_prefixes_file);
  750.    @prefixes_unambig = &preload_uc($prefixes_unambig_file);
  751.    
  752.  
  753.    if ($hospital_list =~ /y/) {
  754.     @hospital = &preload($hospital_file);
  755.    } else {@hospital = ();}
  756.  
  757.    if ($state_list =~ /y/){
  758.     @us_states = &preload($us_states_file);
  759.     @us_states_abbre =  &preload($us_states_abbre_file);
  760.     @more_us_states_abbre =  &preload($more_us_states_abbre_file);
  761.    }
  762.  
  763.  
  764.    # Generates associations between PHI in some lists and PHI categories
  765.    &preload_assoc($common_words_file,"common_words");
  766.    &preload_assoc($medical_words_file,"common_words");
  767.  
  768.    &preload_assoc($very_common_words_file,"very_common_words");
  769.    &preload_assoc($unambig_common_words_file, "unambig_common_words");
  770.    &preload_assoc($male_unambig_file, "male_unambig");
  771.    &preload_assoc($female_unambig_file, "female_unambig");
  772.    &preload_assoc($female_ambig_file, "female_ambig");
  773.    &preload_assoc($male_ambig_file, "male_ambig");
  774.    &preload_assoc($last_ambig_file, "last_ambig");
  775.    &preload_assoc($male_popular_file, "male_popular");
  776.    &preload_assoc($female_popular_file, "female_popular");
  777.    &preload_assoc($us_area_code_file,"us_area_code");
  778.  
  779.     # Opens debug file for debugging
  780.    open D, ">".$debug_file;
  781.    close D;
  782. }
  783. # End of setup()
  784. #***********************************************************************************************************
  785. #***********************************************************************************************************
  786. #***********************************************************************************************************
  787. # Function: deid()
  788. # Arguments: None
  789. # Returns: None
  790. # Called by: Topmost level of code
  791. # Description: One of the 2 major branches of the code
  792. # This function takes over de-identification if no performance statistic is required
  793. # Function first loads the patient name list file.
  794. # Then reads the medical text line by line.
  795. #  If the line indicates START_OF_RECORD, read the Patient ID (PID), Note ID (NID), (and Note Date if any) info
  796. # Scan for PHI a paragraph at a time
  797. # output PHI locations into .phi file
  798. my $currentID;
  799. my @known_phi;
  800. my @known_first_name;
  801. my @known_last_name;
  802. my %pidPtNames; # key = pid, value = [0] first name [1] last name
  803.  
  804. #***********************************************************************************************************
  805. #***********************************************************************************************************
  806. #***********************************************************************************************************
  807.  
  808. sub deid {
  809.    
  810.    $allText = "";
  811.    $allallText = "";
  812.    open DF, $data_file or die "Cannot open $data_file";
  813.  
  814.    my $paraCount = 0;
  815.  
  816.    my $stpos = 0;
  817.  
  818.  
  819.    my %deids; #key = start index, value = array of (end index, ID)
  820.    my %phiTerms; #key = lc(word), value = # of occurrences
  821.    my %phiT;
  822.    my %phiTT;
  823.    my $noteDate;
  824.    my $line;
  825.  
  826.    $currentID = 0; #initialize current ID to a non-existent PID 0
  827.  
  828.    # Code runs through text by paragraph so things that extend over lines aren't missed
  829.     $para = "";
  830.  
  831.     #load the patient name file
  832.     if ($pid_patientname_list =~ /y/) {
  833.     open PF, $patient_file or die "Cannot open $patient_file";
  834.    
  835.     while ($pfline = <PF>) {
  836.         chomp $pfline;
  837.  
  838.         if ($pfline =~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig) {       
  839.         my $pid = $1;
  840.  
  841.         $known_first_names = $3;
  842.         $known_last_names = $5;        
  843.         $pidPtNames{$pid}[0] = $known_first_names;
  844.         $pidPtNames{$pid}[1] = $known_last_names;      
  845.        
  846.         }# end if pfline = ~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig)
  847.     } # end while $pfline = <PF>
  848.     } #end if pid_patient_name_list ~= y
  849.     ###End loading the patient names
  850.  
  851.     while ($line = <DF>) {
  852.    
  853.     #If this is a new record, set PID, Note ID
  854.     #we assume all notes for the same patient go together
  855.     if ( $line =~ /\ASTART_OF_RECORD=(([0-9]+)(\|)+([0-9]+)(\|)+(\d\d\/\d\d\/\d{4})?)/) {
  856.        
  857.        # $currentNote = $2;
  858.        # $thePT = $4;
  859.        # $noteDate =$6; #noteDate is empty in gold std
  860.  
  861.         #print "pt $thePT, currentNote $currentNote\n";
  862.  
  863.         #if it's gold standard, the header contains PID then NID
  864.        
  865.         $currentNote = $4;
  866.         $thePT = $2;
  867.         $noteDate =$6; #noteDate is empty in gold std
  868.  
  869.         my $label = "Patient $thePT\tNote $currentNote";
  870.         open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  871.         print DEST "$label\n";
  872.         close DEST;
  873.        
  874.             #Gold Standard corpus does not specify note date, so assign a default
  875.             #If you would like deid to date shift the notes on a per note basis, make sure you
  876.             #specify the record date in the header!
  877.         if (length($noteDate) ==0){
  878.             #$noteDate="01/01/2020";
  879.         $noteDate = $DEFAULT_DATE;
  880.         }
  881.  
  882.         #if this is a new patient, set the PID, and lookup patient name
  883.         if ($thePT != $currentID) {  #This is a new patient!   
  884.         $currentID = $thePT;
  885.         %phiTerms = ();  #clear the phiTerms on new pt
  886.  
  887.  
  888.         # Find the patient name for the current PID
  889.         #lookup first and last name of this patient
  890.         if ($pid_patientname_list =~ /y/ && exists $pidPtNames{$currentID} ) {
  891.            
  892.             $known_first_names = $pidPtNames{$currentID}[0];
  893.             $known_last_names = $pidPtNames{$currentID}[1];
  894.            
  895.             #print "Found patient names, first = $known_first_names   last = $known_last_names\n";
  896.  
  897.             if ($known_first_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
  898.                 @known_first_name = ($1, $2);}
  899.             else {
  900.             if ($known_first_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
  901.                 @known_first_name = ($3);
  902.             }
  903.                
  904.             elsif ($known_first_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
  905.                 @known_first_name = ($1);
  906.             }
  907.             else {
  908.                 @known_first_name = ($known_first_names);
  909.             }
  910.             }
  911.            
  912.             if ($known_last_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
  913.             @known_last_name = ($1, $2);}
  914.             else {
  915.             if ($known_last_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
  916.                 @known_last_name = ($3);
  917.             }
  918.             elsif ($known_last_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
  919.                 @known_last_name = ($1);
  920.             }
  921.            
  922.             else {
  923.                 @known_last_name = ($known_last_names);
  924.             }
  925.             }
  926.  
  927.  
  928.         } else {  #if no pid/patient name file, just set the first name and last name to null
  929.             @known_first_name = ();
  930.             @known_last_name = ();
  931.         }
  932.         } # end if this is a new patient
  933.  
  934.         #output the header to output file      
  935.             $allText = "";       #reset,
  936.         $allallText = "";    #reset
  937.         $stpos = 0;
  938.         $para = "";
  939.  
  940.         #if output mode, output the header line (with patient and note ID) to .res file
  941.             if ($comparison == 0) {
  942.           open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file";   #now open in append mode
  943.           print TF "\n$line";
  944.           close TF;
  945.             }  
  946.         next; #skip to next line
  947.       }  #end if this is start of a record
  948.     else {  #else this is not the start of a record, just append the line to the end of the current text
  949.         chomp $line;  
  950.         $allText .= $line."\n";
  951.         $allallText .= $line."\n";   
  952.            
  953.             #$myline = $line;
  954.             #chomp $myline;
  955.         #$allText .= $myline."\n";
  956.         #$allallText .= $myline."\n";    
  957.     }
  958.  
  959.     #Look for paragraph separator: if this is a line is entirely non-alphanumeric or
  960.         #if it starts with spaces, or if it is an empty line, or if this line marks the end of record,
  961.         #then call findPHI() for the current paragraph we have so far.
  962.         #If end of record is encoutnered, output the de-id text; else if
  963.         #it's not end of record yet, append the line to the paragraph.
  964. #   if ( (!($line =~ /[a-zA-Z\d]+/)) || $line =~ /^ +/ || $line eq "" || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {  
  965.     if (   (!($line =~ /[a-zA-Z\d]+/)) || ($line eq "") || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {  
  966.         if ($para =~ /\w/ ){  #if para contains alphanumeric
  967.                
  968.                 # Calls findPHI() with current paragraph; resulting PHI locations are stored in %phiT
  969.         #%phiT = findPHI("Para $paraCount", $date, $stpos, $para);
  970.         %phiT = findPHI("Para $paraCount", $noteDate, $stpos, $para);
  971.         $paraCount++;
  972.         # %phiT is copied over to %phiTT
  973.         foreach $x (sort numerically keys %phiT) {
  974.             @{$phiTT{$x}} = @{$phiT{$x}};
  975.         }
  976.        
  977.         #Sorts keys in %phiT; outputs text accordingly
  978.         foreach $k (keys %phiT) {        
  979.             my ($start, $end) = split '-', $k;
  980.             # $deids_end = ${@{$deids{$start}}}[0]; #does not work with perl v5.10
  981.             my @deidsval =  ${@{$deids{$start}}};
  982.             $deids_end = $deidsval[0];
  983.                
  984.             $found = $phiT{$k};
  985.             foreach $t (@{$phiT{$k}}) {
  986.             }
  987.             my $word = lc(substr $allallText, $start, ($end - $start));
  988.  
  989.             #print "Key in PhiT = $k, word = $word\n"; #DEBUG
  990.  
  991.            # if ($end > ${@{$deids{$start}}}[0]) {  
  992.             if ($end > $deidsval[0]) {
  993.             $deids{$start}[0] = $end;
  994.             $deids{$start}[1] = $currentID;
  995.             $deids{$start}[2] = $noteDate;                     
  996.             }
  997.  
  998.  
  999.             #############################################################################
  1000.             #Now remember the PHI terms that are important names for checking for repeated occurrences of PHIs
  1001.             #PHI Name Tags
  1002.                     #(NI)       Name indicators
  1003.                     #(LF)       Lastname Firstnames
  1004.                     #(PTitle)   plural titles
  1005.                     #(MD)       followed by  "MD" or "M.D"
  1006.                     #(PRE)      checks up to 3 words following "PCP Name" ("PCP", "physician", "provider", "created by", "name");
  1007.                     #(NameIs)   followed by pattern "name is"
  1008.                     #(Prefixes) for @prefixes_unambig)
  1009.                     #(STitle)   @specific_titles = ("MR", "MISTER", "MS");
  1010.                     #(Titles)       @titles
  1011.                     #(NamePattern)  all other name patterns in sub name3
  1012.             #remember all the PHI of type name (strict_titles) and name (indicators)
  1013.             #print "checking for repeated occurences of PHIs: word is $word, phitype is (@{$phiT{$k}})\n";
  1014.  
  1015.             if ( ($word !~ /\d/) && ((length $word) > 3) && !(isCommon($word)) &&
  1016.              ( isPHIType( "(NI)", @{$phiT{$k}}) ||
  1017.                isPHIType( "(PTitle)", (@{$phiT{$k}})) || isPHIType( "(LF)", (@{$phiT{$k}})) ||  
  1018.                isPHIType( "(NamePattern)", (@{$phiT{$k}})) ||
  1019.                isPHIType( "(MD)", (@{$phiT{$k}})) ||                         
  1020.                isPHIType( "(NameIs)", (@{$phiT{$k}})) || isPHIType("(STitle)",  (@{$phiT{$k}})) ||
  1021.                isPHIType("(Titles)", (@{$phiT{$k}}) ))
  1022.             ) {    
  1023.                 #$phiTerms{$word}++;
  1024.                 if (!(exists $phiTerms{$word})) {
  1025.                
  1026.                 $phiTerms{$word} = 1;}
  1027.                 else {
  1028.                
  1029.                 $phiTerms{$word} = $phiTerms{$word}+ 1;
  1030.                 }
  1031.  
  1032.  
  1033.             } #end if
  1034.         }  # end foreach $k (keys %phiT)
  1035.             } #end if ($para =~ /\W/)
  1036.  
  1037.         if ($line =~ /\|\|\|\|END_OF_RECORD/ ) {  
  1038.        
  1039.         open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  1040.         ####################################################
  1041.         #check for repeated occurences of PHIs for this note
  1042.         while ($allallText =~ /\b([A-Za-z]+)\b/g) {
  1043.             my $token = $1;
  1044.             my $start = (length ($`));  #$` is the string preceding what was matched by the last successful match
  1045.             my $end = $start + length($token);
  1046.  
  1047.            if (!(exists    $deids{$start})   ) {
  1048.             #if (!(exists ${@{$deids{$start}}}[0])) {
  1049.               L:
  1050.             foreach $word (keys %phiTerms) {
  1051.        
  1052.                 if ( (uc($token) eq uc($word))  ) {
  1053.                
  1054.                 $deids{$start}[0] = $end;
  1055.                 $deids{$start}[1] = $currentID;
  1056.                 $deids{$start}[2] = $noteDate;
  1057.                
  1058.                 $term = substr $text, $start, ($end - $start +1);
  1059.                 $outstr = "$start \t $end \t $term \t Name (Repeated Occurrence) \n";
  1060.                 print DEST $outstr;
  1061.                
  1062.                 next L;
  1063.                 } #end if
  1064.             } # end foreach
  1065.             } # end if
  1066.         } # end while
  1067.         #end checking for repeated occurences of PHIs
  1068.         close DEST;
  1069.         #####################################################
  1070.  
  1071.         ##output PHI locations to the .phi file
  1072.         open OUTF, ">>$output_file" or die "Cannot open $output_file";
  1073.         print OUTF "\nPatient $currentID\tNote $currentNote";
  1074.         foreach $k (sort numerically keys %deids) {
  1075.             my @deidvals = @{$deids{$k}};
  1076.             $thisend = $deidvals[0];
  1077.             if ($thisend ){
  1078.             print OUTF "\n$k\t$k\t$thisend";
  1079.             }
  1080.         }
  1081.         close OUTF;
  1082.  
  1083.         ###output de-ided text to .res file
  1084.         if ($comparison==0) {
  1085.             outputText(\%deids, \%phiTT);  
  1086.         }
  1087.  
  1088.         #now that we have output text for this record, we reset the
  1089.                 #variables to get ready for the next record
  1090.         $para = "";
  1091.         $paraCount = 0;
  1092.         $stpos = 0;
  1093.        
  1094.         %deids=(); #clear the deid hash
  1095.         %phiTT=(); #clear the phiTT hash
  1096.         %phiT=();
  1097.  
  1098.  
  1099.         $allText = "";       #reset,
  1100.         $allallText = "";    #reset
  1101.  
  1102.         } else {  # this is not end of record yet ...
  1103.         my $tmp = length($para);
  1104.         $stpos += length ($para);
  1105.         $para = $line.' ';
  1106.         }
  1107.  
  1108.                
  1109.     }  #end if line starts with empty spaces || empty line || end of record
  1110.     else { # else this line is still a part of the current paragraph
  1111.         #$para .= ' '.$line;  #just append to end of current paragraph
  1112.         #$para .= $line.' '; #just append to end of current paragraph
  1113.          if ($line eq "") {
  1114.         $para .= "\n";
  1115.         } else {
  1116.         $para .= $line.' '; #just append to end of current paragraph
  1117.         }
  1118.     }
  1119.     } #end while ($line=<DF>)
  1120.    
  1121.     close DF;
  1122.  
  1123. }
  1124. # End of deid()
  1125.  
  1126.  
  1127.  
  1128.  
  1129.  
  1130. #***********************************************************************************************************
  1131. #***********************************************************************************************************
  1132. #***********************************************************************************************************
  1133. # Function: findPHI()
  1134. # Arguments: string $label ("Para $paraCount"), string $date (e.g. "yyyy-mm-dd"), int $startIndex (normally 0), string $text (paragraph of text)
  1135. # Returns: hash %approved (key=start-end of each PHI, value=PHI types)
  1136. # Called by: deid()
  1137. # Description: Dispatched from deid() to perform de-identification
  1138. # Reads in a paragraph of text and runs the de-identification algorithm on it
  1139.  
  1140.  
  1141. sub findPHI {
  1142.     my ($label, $curr_date, $startIndex, $text) = @_;
  1143.  
  1144.  
  1145.     # Initializes the hash %phi which stores PHI locations and their PHI types
  1146.     %phi = (); #key = start-end, value = (type1, type2, ...)
  1147.     local %end = (); # key = start, value = end (dynamic scope so addType can do it)
  1148.     if ($text !~ /\w/) {   
  1149.     return %phi;
  1150.     } #[wjl] skip blank lines
  1151.    
  1152.     # Splits the text into separate items at spaces
  1153.     #my @txtlst = split (/([^a-zA-Z0-9_\'\-]+)/,$text); # used by lookhash and appx match
  1154.     my @txtlst = split (/([^a-zA-Z0-9_\']+)/,$text);
  1155.    
  1156.     # Performs exact matching with the hashes of PHI lists
  1157.     &lookhash(@txtlst);
  1158.  
  1159.     # Initializes hash %phik which stores only start and end indices of each PHI in %phi
  1160.     %phik = ();
  1161.  
  1162.     foreach $k (keys %phi) {
  1163.     ($st,$end) = split ('-',$k);
  1164.     $phik{$st} = $end;
  1165.     }  
  1166.  
  1167.  
  1168.     # Calls each filter module
  1169.  
  1170.     if ($datefilter =~ /y/) {
  1171.     &commonHoliday($text, 0);
  1172.     &date ($text, $curr_date);
  1173.     &dateWithContextCheck($text, $curr_date);
  1174.     &yearWithContextCheck($text, $curr_date);
  1175.     &seasonYear ($text, 0);
  1176.     }
  1177.  
  1178.     if ($telfilter =~ /y/) {
  1179.     &telephone($text, 0);
  1180.     &pager ($text, 0);
  1181.     }
  1182.  
  1183.     if ($locfilter =~ /y/) {
  1184.     &wardname($text, 0);
  1185.     &location1 ($text, 0);
  1186.     &location2 ($text, 0);
  1187.     }
  1188.  
  1189.     if ($emailfilter =~ /y/) {
  1190.     &email ($text, 0);
  1191.     }
  1192.  
  1193.     if ($urlfilter =~ /y/) {
  1194.     &url ($text, 0);
  1195.     }
  1196.  
  1197.     if ($ssnfilter =~ /y/) {   
  1198.     &ssn ($text, 0);
  1199.     }
  1200.  
  1201.     if ($agefilter =~ /y/) {
  1202.     &age ($text, 0);
  1203.     }
  1204.  
  1205.     if ($namefilter =~ /y/) {
  1206.  
  1207.     &name1 ($text, 0);
  1208.     &name2 ($text, 0);
  1209.     &name3 ($text, 0);
  1210.     &knownPatientName($text, 0);
  1211.     &problem ($text, 0);
  1212.     &signatureField ($text, 0);
  1213.     }
  1214.  
  1215.     if ($unitfilter =~ /y/) {
  1216.     &mrn ($text, 0);
  1217.     &unit ($text, 0);
  1218.     &providerNumber ($text, 0);
  1219.     }
  1220.  
  1221.  
  1222.    
  1223.     if ($ds_specific_filter =~ /y/){
  1224.     #discharge summary specific filters
  1225.     #filter not enabled for this version
  1226.     #&dischargeSummarySpecific ($text, 0);
  1227.     }
  1228.  
  1229.  
  1230.     # Call new function here >>>>>>>>>>
  1231.     # Follow format shown here if necessary
  1232.     # &functionName ($text, 0);
  1233.  
  1234.     open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  1235.     #print DEST "$label\n";
  1236.  
  1237.     # Sub-function: finalPHICheck()
  1238.     # After findPHI() has performed most PHI checks, goes through the identified PHI before adding them to the final PHI files
  1239.    
  1240.     my %approved;
  1241.     my ($startp, $endp) = (0, 0);
  1242.     my $notAmbig = 0;
  1243.     my $stg1 = "";
  1244.     my $stg2 = "";
  1245.     my $prevAmbig = 0;
  1246.     my $oldk = "";
  1247.     my $prevKey = "";
  1248.     my ($oldstartp, $oldendp,) = (0, 0);
  1249.     my $oldtext = "";
  1250.    
  1251.     # Prunes keys and checks whether each PHI is ambiguous or is an indicator (e.g. hospital indicator)
  1252.     foreach $k (&pruneKeys("phi",$text)) {
  1253.     ($startp, $endp) = split "-", $k;
  1254.  
  1255.     my $the_word = (substr $text, $startp, ($endp - $startp));
  1256.  
  1257.  
  1258.     $notAmbig = 0;  #so by default, the term is ambiguous
  1259.     foreach $tt (@{$phi{$k}}){
  1260.         #if(($tt !~ /ambig/) && ($tt !~ /Indicator/)    ) {
  1261.         if (($tt !~ /ambig/) && ($tt !~ /Indicator/)  && ($tt !~ /MedicalPhrase/)) {
  1262.         #so IF the term matches ANY type that's non-ambiguous, THEN set it as non-ambiguous
  1263.         $notAmbig = 1; last;}
  1264.     }  #end for each
  1265.  
  1266.        
  1267.     my $notIndicator = 1; #default to be not an indicator
  1268.    
  1269.     foreach $tt (@{$phi{$k}}){
  1270.         if ($tt =~ /Indicator/) {
  1271.         $notIndicator = 0; last;
  1272.         }
  1273.     }
  1274.    
  1275.     $prevText = (substr $text, $oldstartp, ($oldendp - $oldstartp));
  1276.     $newText = (substr $text, $startp, ($endp - $startp));
  1277.        
  1278.  
  1279.     $a = (isType($prevKey, "Male First Name", 1) && ($prevAmbig==1) && (!isCommon($prevText)));
  1280.     $b = (isType($k, "Last Name", 1) && ($notAmbig==0) && (!isCommon($newText)));
  1281.    
  1282.  
  1283.     #if (this is ambig) and (previous is ambig) and ...
  1284.     if ((($notAmbig==0) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1285.     #if (this is not-ambig) and (previous is ambig) and ...
  1286.         (($notAmbig==1) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1287.     #if (this is not-ambig) and (previous is ambig) and ...
  1288.         (($notAmbig==1) && ($prevAmbig==1) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3)) ||
  1289.         #commented out on 1/31/07
  1290.         (($notAmbig==0) && ($prevAmbig==0) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1291.  
  1292.     #if (this is ambig) and (previous is not ambig) and ...
  1293.         (($notAmbig==0) && ($prevAmbig==0) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3))) {
  1294.  
  1295.         print DEST ($startIndex + $oldstartp)."\t".($startIndex+$oldendp)."\t".(substr $text, $oldstartp, ($oldendp - $oldstartp +1));
  1296.  
  1297.  
  1298.         ###################
  1299.  
  1300.         my $oldtext = $text;
  1301.         my $newKey = ($startIndex + $oldstartp)."-".($startIndex + $oldendp);
  1302.         ###my $text = (substr $text, $oldstartp, ($oldendp - $oldstartp));
  1303.         foreach $tt (@{$phi{$prevKey}}) {
  1304.         print DEST "\t$tt";
  1305.         push @{$approved{$newKey}}, $tt;}
  1306.         print DEST "\n";
  1307.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $oldtext, $startp, ($endp - $startp +1));
  1308.        
  1309.         my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
  1310.         ###my $text = (substr $oldtext, $startp, ($endp - $startp));
  1311.         foreach $tt (@{$phi{$k}}) {
  1312.         print DEST "\t$tt";
  1313.         push @{$approved{$newKey}}, $tt;
  1314.         }
  1315.         print DEST "\n";
  1316.  
  1317.  
  1318.     }
  1319.    
  1320.     # If the PHI is not ambiguous and not an indicator, recognizes it as PHI; add it to PHI file
  1321.     elsif ($notAmbig && $notIndicator) {
  1322.  
  1323.         ###################
  1324.  
  1325.        
  1326.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $text, $startp, ($endp - $startp +1));
  1327.         my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
  1328.         ###my $text = (substr $text, $startp, ($endp - $startp));
  1329.         foreach $tt (@{$phi{$k}}){     
  1330.         print DEST "\t$tt";
  1331.         if (($tt !~ /ambig/) && ($tt !~ /Indicator/)) {
  1332.             push @{$approved{$newKey}}, $tt;
  1333.         }
  1334.         }
  1335.         print DEST "\n";
  1336.  
  1337.     } # Else ck keys discarded
  1338.    
  1339.     else {
  1340.         ###################
  1341.        
  1342.         # Otherwise keeps the remaining PHI as non-PHI
  1343.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t# ".(substr $text, $startp, ($endp - $startp +1));
  1344.         foreach $tt (@{$phi{$k}}) {
  1345.         print DEST "\t$tt";
  1346.         }
  1347.         print DEST "\n";}
  1348.  
  1349.     # Sets ambiguous variables for current PHI to be recognized as previous PHI for the next round
  1350.     if ($notAmbig==0) {
  1351.         $prevAmbig = 1;
  1352.         $prevKey = $k;
  1353.         ($oldstartp, $oldendp) = split "-", $prevKey;
  1354.         $oldtext = $text;
  1355.  
  1356.     }
  1357.     else {
  1358.         $prevAmbig = 0;
  1359.     }
  1360.     }    
  1361.     close DEST;
  1362.  
  1363.     # End of sub-function finalPHICheck()
  1364.     #***********************************************************************************************************
  1365.  
  1366.  
  1367.     return %approved;
  1368. }
  1369. # End of findPHI()
  1370.  
  1371.  
  1372.  
  1373. #***********************************************************************************************************
  1374. #***********************************************************************************************************
  1375. #***********************************************************************************************************
  1376. # Function: date ()
  1377. # Searches for date ranges following formats that appear most frequently in text
  1378. sub date {
  1379.     #$text = $_[0];    
  1380.     my ($text, $date) = @_;
  1381.     my $year = substr $date, 0, 4;  
  1382.     # Checks if dates should be filtered
  1383.     if ($datefilter =~ /y/) {
  1384.    
  1385.     # Searches for the pattern mm/dd-mm/dd where the items are valid dates
  1386.     while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?))\b/ig) {
  1387.         if (isValidDate($2,$3,-1) && isValidDate($4,$5,-1)) {
  1388.         $date_range = $1;
  1389.         $start = length($`);
  1390.         $end = $start + length($date_range);
  1391.         $key = "$start-$end";
  1392.         addType($key, "Date range (1)");
  1393.         }
  1394.     }      
  1395.  
  1396.     # Searches for mm/dd/yy-mm/dd/yy or mm/dd/yyyy-mm/dd/yyyy where the items are valid dates
  1397.     while ($text =~ /\b((\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
  1398.         if (isValidDate($2,$3,$4) && isValidDate($5,$6,$7)) {
  1399.         $date_range = $1;
  1400.         $start = length($`);
  1401.         $end = $start + length($date_range);
  1402.         $key = "$start-$end";
  1403.         addType($key, "Date range (2)");
  1404.         }
  1405.     }      
  1406.    
  1407.     # Searches for mm/dd-mm/dd/yy or mm/dd-mm/dd/yyyy where the items are valid dates
  1408.     while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
  1409.         if (isValidDate($6,$2,$3) && isValidDate($6,$4,$5)) {
  1410.         $date_range = $1;
  1411.         $start = length($`);
  1412.         $end = $start + length($date_range);
  1413.         $key = "$start-$end";
  1414.         addType($key, "Date range (3)");
  1415.         }
  1416.     }  
  1417.     } #end if date filter is on
  1418.     # End of sub-function date1()
  1419.  
  1420.     if ($datefilter =~ /y/) {  
  1421.     # Checks for month/day/year
  1422.     while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g) {
  1423.         my $startI = (length($`));
  1424.         my $endI = $startI + length($&);
  1425.         my $key = $startI."-".$endI;
  1426.         my $first_num = $1;
  1427.         my $second_num = $2;
  1428.         my $third_num = $3;
  1429.         my $beginr = substr $text, ($startI - 2), 2;
  1430.         my $endr = substr $text, $endI, 2;
  1431.  
  1432.  
  1433.         if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
  1434.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1435.  
  1436.             #print "checking M/D/Y, $first_num, $second_num, $third_num\n";
  1437.  
  1438.             if (isValidDate ($first_num, $second_num, $third_num)) {
  1439.             addType ($key, "Month/Day/Year");
  1440.             }
  1441.         }
  1442.         }
  1443.     } #end while
  1444.    
  1445.     # Checks for month/day/year
  1446.     while ($text =~ /\b(\d\d?)\.(\d\d?)\.(\d\d|\d{4})\b/g) {
  1447.         my $startI = (length($`));
  1448.         my $endI = $startI + length($&);
  1449.         my $key = $startI."-".$endI;
  1450.         my $first_num = $1;
  1451.         my $second_num = $2;
  1452.         my $third_num = $3;
  1453.         my $beginr = substr $text, ($startI - 2), 2;
  1454.         my $endr = substr $text, $endI, 2;
  1455.        
  1456.         #print "2. checking M/D/Y, $first_num, $second_num, $third_num\n";
  1457.  
  1458.         if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
  1459.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1460.             if (isValidDate ($first_num, $second_num, $third_num)) {
  1461.             addType ($key, "Month/Day/Year");
  1462.             }
  1463.         }
  1464.         }
  1465.     }            
  1466.  
  1467.     # Checks for day/month/year
  1468.     while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g){
  1469.         my $startI = (length($`));
  1470.         my $endI = $startI + length($&);
  1471.         my $key = $startI."-".$endI;
  1472.         my $first_num = $1;
  1473.         my $second_num = $2;
  1474.         my $third_num = $3;
  1475.         my $beginr = substr $text, ($startI - 2), 2;
  1476.         my $endr = substr $text, $endI, 2;
  1477.        
  1478.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1479.         if (isValidDate ($second_num, $first_num, $third_num)) {
  1480.         }
  1481.         }
  1482.     }
  1483.    
  1484.     # Checks for year/month/day
  1485.     while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
  1486.         my $startI = (length($`));
  1487.         my $endI = $startI + length($&);
  1488.         my $key = $startI."-".$endI;
  1489.         my $yr = $1;
  1490.         $nn_year = $yr;
  1491.         my $mo = $2;
  1492.         my $da = $3;
  1493.         my $beginr = substr $text, ($startI - 2), 2;
  1494.         my $endr = substr $text, $endI, 2;
  1495.        
  1496.        
  1497.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1498.         if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1499.             $prevChars = (substr $text, ($startI-4), 4);
  1500.             $nextChars = (substr $text, $endI, 11);
  1501.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1502.             addType ($key, "Header Date");
  1503.             $longyear = $yr;
  1504.             }
  1505.             else {
  1506.             addType ($key, "Year/Month/Day");
  1507.             }
  1508.         }
  1509.         }
  1510.     } #end while
  1511.    
  1512.    
  1513.    
  1514.     # Checks for year/month/day
  1515.     while ($text =~ /\b(\d\d|\d{4})\.(\d\d?)\.(\d\d?)\b/g){
  1516.         my $startI = (length($`));
  1517.         my $endI = $startI + length($&);
  1518.         my $key = $startI."-".$endI;
  1519.         my $yr = $1;
  1520.         $nn_year = $yr;
  1521.         my $mo = $2;
  1522.         my $da = $3;
  1523.         my $beginr = substr $text, ($startI - 2), 2;
  1524.         my $endr = substr $text, $endI, 2;
  1525.        
  1526.        
  1527.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1528.         #if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1529.         if (isValidDate ($mo, $da, $yr)) {
  1530.             $prevChars = (substr $text, ($startI-4), 4);
  1531.             $nextChars = (substr $text, $endI, 11);
  1532.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1533.             addType ($key, "Header Date");
  1534.             $longyear = $yr;
  1535.             }
  1536.             else {
  1537.             addType ($key, "Year/Month/Day");
  1538.             }
  1539.         }
  1540.         }
  1541.     }
  1542.    
  1543.    
  1544.    
  1545.     # Checks for year/day/month
  1546.     while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
  1547.         my $startI = (length($`));
  1548.         my $endI = $startI + length($&);
  1549.         my $key = $startI."-".$endI;
  1550.         my $yr = $1;
  1551.         $nn_year = $yr;
  1552.         my $mo = $3;
  1553.         my $da = $2;
  1554.         my $beginr = substr $text, ($startI - 2), 2;
  1555.         my $endr = substr $text, $endI, 2;
  1556.        
  1557.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1558.         #if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1559.         if (isValidDate ($mo, $da, $yr)) {
  1560.             $prevChars = (substr $text, ($startI-4), 4);
  1561.             $nextChars = (substr $text, $endI, 11);
  1562.            
  1563.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1564.             addType ($key, "Header Date");
  1565.             $longyear = $yr;
  1566.             }
  1567.             else {
  1568.             }
  1569.         }
  1570.         }
  1571.     } #end while
  1572.  
  1573.     # Checks for month/4-digit year
  1574.     while ($text =~ /\b((\d\d?)[\-\/](\d{4}))/g) {
  1575.         my $startI = (length($`));
  1576.         my $endI = $startI + length($&);
  1577.         my $beginr = substr $text, ($startI - 2), 2;
  1578.         my $endr = substr $text, $endI, 2;
  1579.         my $first_num = $2;
  1580.         my $second_num = $3;
  1581.         my $st = length($`);
  1582.         my $endb = $st + length ($2) + length($3) + 1;
  1583.         my $key = "$st-$endb";
  1584.         if (($beginr !~ /\|\|/) && ($endr !~ /\|\|/)) {
  1585.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /[\/\.\%]/)) {
  1586.             #if (($first_num <= 12) && ($first_num > 0) && ($second_num>=1900)) {
  1587.             if (($first_num <= 12) && ($first_num > 0) &&
  1588.             ( $second_num >= $VALID_YEAR_LOW &&  $second_num <= $VALID_YEAR_HIGH  )  ) {
  1589.             addType ($key, "Month/Year 1"); }}}
  1590.     } #end while
  1591.    
  1592.         # Checks for 4-digit year/month
  1593.     while ($text =~ /\b((\d{4})[\-\/](\d\d?))\b/g) {
  1594.         my $first_num = $2;
  1595.         my $second_num = $3;
  1596.         my $st = length($`);
  1597.         my $endb = $st + length ($2) + length($3) + 1;
  1598.         my $key = "$st-$endb";
  1599.         if (($begin !~ /\d[\/\.\-]/) && ($end !~ /[\/\.\%]/)) {
  1600.         #if (($second_num <= 12) && ($second_num > 0) && ($first_num>=1900) && ($first_num<2010)) {
  1601.        
  1602.         if (($second_num <= 12) && ($second_num > 0) && ($first_num>=$VALID_YEAR_LOW) && ($first_num <= $VALID_YEAR_HIGH)) {
  1603.             addType ($key, "Year/Month"); }}
  1604.     } #end while
  1605.  
  1606.    
  1607.     # Checks for spelled-out months
  1608.     # Accounts for ambiguity around the dates, e.g. acronyms for measurements, spelled out months and such
  1609.     foreach $m (@months) {
  1610.        
  1611.         while ($text =~ /\b((\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1612.         my $day = $2;
  1613.         my $completeDate = $1;
  1614.         my $st = length($`);
  1615.         my $key = "$st-".($st + length($1));
  1616.         if (($day < 32) && ($day > 0)) {
  1617.             addType ($key, "Day Month Year");
  1618.         }
  1619.         }
  1620.  
  1621.         while ($text =~ /\b((\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1622.         my $day1 = $2;
  1623.         my $day2 = $4;
  1624.         my $completeDate = $1;
  1625.         my $st = length($`);
  1626.         my $key = "$st-".($st + length($1));
  1627.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 >0)) {
  1628.             addType ($key, "Date range (4)");
  1629.         }
  1630.         }
  1631.        
  1632.         while ($text =~ /\b((\d{1,2}) ?\-\> ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1633.         my $day1 = $2;
  1634.         my $day2 = $3;
  1635.         my $completeDate = $1;
  1636.         my $st = length($`);
  1637.         my $key = "$st-".($st + length($1));
  1638.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1639.             addType ($key, "Date range (5)");
  1640.         }
  1641.         }
  1642.        
  1643.        
  1644.         while ($text =~ /\b($m\b\.? (\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1645.    
  1646.         my $day = $2;
  1647.         my $completeDate = $1;
  1648.         my $st = length($`);
  1649.         my $key = "$st-".($st + length($1));
  1650.         if (($day < 32) && ($day > 0)) {
  1651.             addType ($key, "Month Day Year");
  1652.         }
  1653.         }
  1654.  
  1655.         while ($text =~ /\b($m\b\.? (\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1656.         my $day1 = $2;
  1657.         my $day2 = $4;
  1658.         my $completeDate = $1;
  1659.         my $st = length($`);
  1660.         my $key = "$st-".($st + length($1));
  1661.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
  1662.             addType ($key, "Date range (6)");
  1663.         }
  1664.         }
  1665.  
  1666.         while ($text =~ /\b($m\b\.? (\d{1,2}) ?\-\> ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1667.    
  1668.         my $day1 = $2;
  1669.         my $day2 = $3;
  1670.         my $completeDate = $1;
  1671.         my $st = length($`);
  1672.         my $key = "$st-".($st + length($1));
  1673.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
  1674.             addType ($key, "Date range (7)");
  1675.         }
  1676.         }
  1677.                
  1678.         #while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12 2000
  1679.        while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|) ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12th 2000
  1680.         my $day = $2;
  1681.         my $completeDate = $1;
  1682.         my $st = length($`);
  1683.         my $key = "$st-".($st + length($1));
  1684.         if (($day < 32) && ($day > 0)) {
  1685.             addType ($key, "Month Day Year (2)");
  1686.             #addType ($key, "Month Day Year");
  1687.         }
  1688.         }
  1689.        
  1690.        # while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1691.         while ($text =~ /\b($m\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1692.         my $day = $2;
  1693.         my $completeDate = $1;
  1694.         my $st = length($`);
  1695.         my $key = "$st-".($st + length($1));
  1696.  
  1697.         if (($day < 32) && ($day > 0)) {
  1698.             addType ($key, "Month Day");
  1699.         }
  1700.         }
  1701.        
  1702.         while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1703.         my $day1 = $2;
  1704.         my $day2 = $4;
  1705.         my $completeDate = $1;
  1706.         my $st = length($`);
  1707.         my $key = "$st-".($st + length($1));
  1708.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1709.             addType ($key, "Date range (8)");
  1710.         }
  1711.         }
  1712.        
  1713.         while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12th
  1714.         my $day1 = $2;
  1715.         my $day2 = $4;
  1716.         my $completeDate = $1;
  1717.         my $st = length($`);
  1718.         my $key = "$st-".($st + length($1));
  1719.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1720.             addType ($key, "Date range (9)");
  1721.         }
  1722.         }
  1723.        
  1724.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr, or Second of April
  1725.         my $day = $2;
  1726.         my $completeDate = $1;
  1727.         my $st = length($`);
  1728.         my $key = "$st-".($st + length($1));
  1729.         if (($day < 32) && ($day > 0)) {
  1730.             addType ($key, "Day Month");
  1731.         }
  1732.         }
  1733.  
  1734.         ###
  1735.       #  while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?\s+(of)?\s+[\-]\b$m\.?,?)\s*(\d{2,4})\b/ig) { # 12-Apr, or Second of April
  1736. while ($text =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b($m)\.?,?)\s+(\d{2,4}))\b/ig) { # 12-Apr, or Second of April
  1737.         my $day = $3;
  1738.         my $month = $6;
  1739.         my $year = $7;
  1740.  
  1741.         my $completeDate = $1;
  1742.         my $st = length($`);
  1743.         my $key = "$st-".($st + length($1));
  1744.         if (($day < 32) && ($day > 0)) {
  1745.             addType ($key, "Day Month Year 2");
  1746.         }
  1747.         }
  1748.            ###
  1749.        
  1750.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
  1751.         my $day1 = $2;
  1752.         my $day2 = $5;
  1753.         my $completeDate = $1;
  1754.         my $st = length($`);
  1755.         my $key = "$st-".($st + length($1));
  1756.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1757.             addType ($key, "Date range (10)");
  1758.         }
  1759.         }
  1760.        
  1761.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
  1762.         my $day1 = $2;
  1763.         my $day2 = $5;
  1764.         my $completeDate = $1;
  1765.         my $st = length($`);
  1766.         my $key = "$st-".($st + length($1));
  1767.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1768.             addType ($key, "Date range (11)");
  1769.         }
  1770.         }
  1771.        
  1772.        
  1773.         while ($text =~ /\b($m\.?,? ?(of )?\d{2}\d{2}?)\b/ig) { # Apr. 2002
  1774.         my $year = $2;
  1775.         my $completeDate = $1;
  1776.         my $st = length($`);
  1777.         my $key = "$st-".($st + length($1));
  1778.         addType ($key, "Month Year");
  1779.         }
  1780.     }
  1781.     }  
  1782. }
  1783. # End of function date()
  1784.  
  1785. #***********************************************************************************************************
  1786. #***********************************************************************************************************
  1787. #***********************************************************************************************************
  1788. # Function: age()
  1789. # Checks for ages that are >=90 (assuming that no one is over 125 years old, just as a sanity check)
  1790. # When ages are spelled-out, assumes that the number will not be over hundred, i.e. highest spelled-out age="hundred"
  1791.  
  1792. sub age {
  1793.     $text = $_[0];  
  1794.     if ($agefilter =~ /y/) {
  1795.    
  1796.     foreach $i (@age_indicators_suff) {
  1797.         if (($text =~ /\b(ninety)[\s\-]+ *$i\b/ig) || ($text =~ /\b(hundred)[\s\-]+ *$i\b/ig)) {
  1798.         my $age = $1;
  1799.         my $st = length($`);
  1800.         my $key = "$st-".((length $age) + $st);
  1801.         addType ($key, "Age over 90");
  1802.         }
  1803.         while ($text =~ /\b(([A-Za-z]+)([\s \-])([A-Za-z]+)) ? *$i\b/ig) {           
  1804.         foreach $j (@digits) {
  1805.             $first = $2;
  1806.             $second = $4;
  1807.             my $age1 = $1;
  1808.             my $st1 = length($`);
  1809.             my $end1 = (length $age1) + $st1;
  1810.             my $key1 = "$st1-$end1";
  1811.             $st2 = $st1+length($2)+length($3);
  1812.             $end2 = $st2+(length($second));
  1813.             my $key2 = "$st2-$end2";
  1814.             if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig))) {
  1815.             addType ($key1, "Age over 90");    
  1816.             }
  1817.             else {
  1818.             if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
  1819.                 if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
  1820.                 addType ($key2, "Age over 90");
  1821.                 }
  1822.             }
  1823.             }
  1824.         }
  1825.         }
  1826.                
  1827.         while ($text =~ /\b(\d+) *$i/ig) {    
  1828.         my $age = $1;
  1829.         my $st = length($`);
  1830.         my $key = "$st-".((length $age) + $st);
  1831.         if (($age >= 90) && ($age <= 125)) {
  1832.             addType ($key, "Age over 90");
  1833.         }
  1834.         }
  1835.     }
  1836.    
  1837.     foreach $i (@age_indicators_pre) {
  1838.         while ($text =~ /\b($i + *)(([A-Za-z]+)([\s \-])([A-Za-z]+))\b/ig) {         
  1839.         foreach $j (@digits) {
  1840.             $first = $3;
  1841.             $second = $5;
  1842.             my $age1 = $2;
  1843.             my $st1 = length($`)+length($1);
  1844.             my $end1 = (length $age1) + $st1;
  1845.             my $key1 = "$st1-$end1";
  1846.             $st2 = $st1;
  1847.             $end2 = $st2+length($first);
  1848.             my $key2 = "$st2-$end2";
  1849.             if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig) || (length($second)))) {
  1850.             addType ($key1, "Age over 90");    
  1851.             }
  1852.             else {
  1853.             if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
  1854.                 if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
  1855.                 addType ($key2, "Age over 90");
  1856.                 }
  1857.             }
  1858.             }
  1859.         }
  1860.         }
  1861.        
  1862.        
  1863.         while ($text =~ /\b($i + *)(\d+)\b/ig) {    
  1864.         my $age = $2;
  1865.         my $st = length($`)+length($1);
  1866.         my $key = "$st-".((length $age) + $st);
  1867.         if (($age >= 90) && ($age <= 125)) {
  1868.             addType ($key, "Age over 90");
  1869.         }
  1870.         }
  1871.     }
  1872.     }
  1873. }
  1874. # End of function age()
  1875. #***********************************************************************************************************
  1876. #***********************************************************************************************************
  1877. #***********************************************************************************************************
  1878. # Function: pager()
  1879. # Checks for pager numbers
  1880.  
  1881. sub pager {
  1882.     $text = $_[0];
  1883.     if ($telfilter =~ /y/) {
  1884.     foreach $i ("Pager", "Pg", "Pgr", "Page", "Beeper", "Beeper number", "Pager number") {  #removed "P"
  1885.  
  1886.         while ($text =~ /\b$i[\s\#\:\-\=]*([a-zA-Z]\s+)*[a-zA-Z]*\s*(\d\d\d+)\b/gi) {
  1887.         my $num = $2;
  1888.         my $end = length($`)+length($&);
  1889.         my $key = ($end - (length $num))."-$end";
  1890.         addType ($key, "Pager number");
  1891.         }
  1892.         while ($text =~ /\b$i[\s\#\:\-\=]*/gi){
  1893.         my $startp = length($`);
  1894.                 my $endp =  length($`)+length($&);
  1895.         #get the next 30 characters
  1896.         my $the_chunck = (substr $text, $endp, 30);
  1897.                 #now look for a 5-digit number
  1898.         while ($the_chunck =~ /(\D)(\d{5})(\D)/gi){
  1899.             my $pager_startp = $endp + length($`) + length($1);
  1900.             my $pager_endp = $pager_startp + length($2);   
  1901.             my $key = "$pager_startp-$pager_endp";
  1902.             addType ($key, "Pager number");
  1903.            
  1904.         } #end while
  1905.         } #end while
  1906.     }
  1907.     }
  1908. }
  1909. # End of function pager()
  1910. #***********************************************************************************************************
  1911. #***********************************************************************************************************
  1912. #***********************************************************************************************************
  1913. # Function: email()
  1914. # Looks for e-mail addresses
  1915.  
  1916. sub email {
  1917.     $text = $_[0];  
  1918.     if ($emailfilter =~ /y/) {
  1919.     while ($text =~ /\b([\w\.]+\w ?@ ?\w+[\.\w+]\.\w{2,3})\b/g) {
  1920.         my $st = length($`);
  1921.         my $key = "$st-".($st+length($&));
  1922.         addType ($key, "E-mail address");
  1923.     }
  1924.     }
  1925. }
  1926. # End of function email()
  1927. #***********************************************************************************************************
  1928. #***********************************************************************************************************
  1929. #***********************************************************************************************************
  1930. # Function: url()
  1931. # Checks for URLs of different types
  1932.  
  1933. sub url {
  1934.     $text = $_[0];  
  1935.     if ($urlfilter =~ /y/) {
  1936.    
  1937.     while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
  1938.         my $st = length($`);
  1939.         my $key = "$st-".($st+length($&));
  1940.         addType ($key, "URL");
  1941.     }
  1942.     while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
  1943.         my $st = length($`);
  1944.         my $key = "$st-".($st+length($&));
  1945.         addType ($key, "URL");
  1946.     }
  1947.     while ($text =~ /\bwww\.[\w\.]+\w{2,4}\/\S+\b/gi) {
  1948.         my $st = length($`);
  1949.         my $key = "$st-".($st+length($&));
  1950.         addType ($key, "URL");
  1951.     }
  1952.     while ($text =~ /\bwww\.[\w\.]+\w{2,4}\b/gi) {
  1953.         my $st = length($`);
  1954.         my $key = "$st-".($st+length($&));
  1955.         addType ($key, "URL");
  1956.     }
  1957.     while ($text =~ /\bweb\.[\w\.]+\w{2,4}\/\S+\b/gi) {
  1958.         my $st = length($`);
  1959.         my $key = "$st-".($st+length($&));
  1960.         addType ($key, "URL");
  1961.     }
  1962.     while ($text =~ /\bweb\.[\w\.]+\w{2,4}\b/gi) {
  1963.         my $st = length($`);
  1964.         my $key = "$st-".($st+length($&));
  1965.         addType ($key, "URL");
  1966.     }
  1967.  
  1968.     while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\b/g) {
  1969.         my $st = length($`);
  1970.         my $key = "$st-".($st+length($&));
  1971.         addType ($key, "URL");
  1972.     }
  1973.     while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\b/g) {
  1974.         my $st = length($`);
  1975.         my $key = "$st-".($st+length($&));
  1976.         addType ($key, "URL");
  1977.     }
  1978.     }
  1979. }
  1980. # End of function url()
  1981. #***********************************************************************************************************
  1982. #***********************************************************************************************************
  1983. #***********************************************************************************************************
  1984. # Function: ssn()
  1985. # Checks for social security numbers (SSNs)
  1986.  
  1987. sub ssn {
  1988.     $text = $_[0];  
  1989.      
  1990.     if ($ssnfilter =~ /y/) {
  1991.     while ($text =~ /\b\d\d\d([- \/]?)\d\d\1\d\d\d\d\b/g) {
  1992.         my $st = length($`);
  1993.         my $key = "$st-".($st+length($&));
  1994.         addType ($key, "Social Security Number");
  1995.     }
  1996.     }
  1997. }
  1998. # End of function ssn()
  1999. #***********************************************************************************************************
  2000. #***********************************************************************************************************
  2001. #***********************************************************************************************************
  2002. # Function: name1()
  2003. # Uses name prefixes to make last names
  2004.  
  2005. sub name1 {
  2006.     $text = $_[0];  
  2007.     if ($namefilter =~ /y/) {
  2008.  
  2009.     my @keylst = sort numerically keys %phi;
  2010.     my $key;
  2011.     for($i=0;$i<$#keylst;$i++){
  2012.         $key = $keylst[$i];
  2013.         if (isType($key, "Last Prefix", 0)){
  2014.         ($f1,$t1) = split ('-',$key);
  2015.         ($f2,$t2) = split ('-',$keylst[$i+1]);
  2016.         if ($f2 < $t1+3){
  2017.             if (isType($keylst[$i+1], "Last Name", 1)){
  2018.             print "Found Last Prefix match, Adding $keylst[$i+1] type for last name!!";
  2019.             addType("$f1-$t2","Last Name");
  2020.             }
  2021.         }
  2022.         }
  2023.     }
  2024.     #####################################################  
  2025.     # Uses common-sense heuristics to try to find more names
  2026.     foreach $i (@name_indicators){
  2027.      
  2028.         #while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)(\s+)(and )?( *)([A-Za-z]+)\b/ig) {
  2029.         while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)\b/ig) {
  2030.  
  2031.         $potential_name = $6;
  2032.    
  2033.         $start = length($`)+length($1) + length ($2) + length($3) + length($4) + length($5);
  2034.         $end = $start + length($potential_name);
  2035.         $key = "$start-$end";
  2036.         my $tmpstr = substr $text, $start, $end-$start;
  2037.        
  2038.         my $tmp = isType($key, "Name",1);
  2039.          
  2040.  
  2041.         if (isProbablyName($key, $potential_name)){    
  2042.             addType ($key, "Name (NI)");
  2043.         } # end if the first word after the name indicator is a name
  2044.  
  2045.          my $new_start = $end + length($6) + length($7);
  2046.          my $new_end = $new_start + length($8);
  2047.  
  2048.  
  2049.          #########now check the next word
  2050.          my $rest = substr $text, $end+1, 20;
  2051.         if (($rest =~ /\b(and )?( *)([A-Za-z]+)\b/ig)){
  2052.             my $new_start = $end + 1 + length($`) + length($1)+length($2);
  2053.             my $new_end = $new_start + length ($3);
  2054.  
  2055.             my $keyAfter = "$new_start-$new_end";
  2056.             my $wordAfter = substr $rest,   (length ($`)+ length($1) + length($2)) , length ($3);
  2057.          
  2058.  
  2059.             if ( !isNameIndicator($wordAfter) && ( (  !isCommon($wordAfter) ||
  2060.             ((isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)"))  ||
  2061.             (isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ||
  2062.             (!isCommonest($wordAfter) && isType($keyAfter, "Name", 1)) ||
  2063.             (isType($keyAfter, "popular",1)) ) )  ) ) {
  2064.    
  2065.             if ($rest =! /\b[\d]\b/ig) { #disregard if the rest contains numeric patterns
  2066.                 addType ($keyAfter, "Name2 (NI)");
  2067.             }
  2068.             }
  2069.  
  2070.             elsif ($1 =~ /and/ig){
  2071.             if (! (isCommon($wordAfter) || isNameIndicator($wordAfter))) {
  2072.                 addType ($keyAfter, "Name2 (NI)");
  2073.             }
  2074.             }
  2075.         } #end if rest
  2076.          
  2077.        
  2078.         } # end of while
  2079.  
  2080.     } #end for each name indicator
  2081.  
  2082.     #**********************************************************************************************************
  2083.     # Searches for the name pattern LASTNAME, FIRSTNAME
  2084.     # First checks if word2 is a firstname
  2085.     # If it is, then if word1 is not a common or commonest word, identifies word1 as a lastname
  2086.     while ($text =~ /\b([A-Za-z]+)( ?\, ?)([A-Za-z]+)\b/ig) {
  2087.         $input1 = $1;
  2088.         $input2 = $3;
  2089.         my $st1 = length($`);
  2090.         my $end1 = $st1 + length($input1);
  2091.         my $key = $key1;
  2092.         my $key1 = "$st1-$end1";
  2093.         my $st2 = $end1+length($2);
  2094.         my $end2 = $st2 + length($input2);
  2095.         my $key = $key2;
  2096.         my $key2 = "$st2-$end2";
  2097.        
  2098.         if ((isType($key2, "Name", 1)) && (isType($key1, "Name (ambig)", 1)) && (!isNameIndicator($input1)) ) {
  2099.         addType ($key1, "Last Name (LF)");
  2100.         addType ($key2, "First Name1 (LF)");
  2101.         }
  2102.        
  2103.         if ((isType($key1, "Name", 1)) && (isType($key2, "Name (ambig)", 1)) && (!isNameIndicator($input1))  ) {
  2104.         addType ($key2, "Last Name (LF)");
  2105.         addType ($key1, "First Name2 (LF)");
  2106.         }
  2107.        
  2108.         if (isType($key2, "First Name", 1)) {
  2109.         if (   (isType($key1, "Last Name", 1) && (!isCommonest($input1)) &&  (!isNameIndicator($input1))) ||
  2110.                ((!isCommon($input1)) && (!isCommonest($input1)))   ) {
  2111.             addType ($key1, "Last Name (LF)");
  2112.             addType ($key2, "First Name3 (LF)");
  2113.         }
  2114.         }  
  2115.     }
  2116.     }
  2117. }
  2118. # End of function name1()
  2119. #***********************************************************************************************************
  2120. #***********************************************************************************************************
  2121. #***********************************************************************************************************
  2122. # Function: seasonYear()
  2123. # Checks for both season and year from patterns like "<season> of? <year>"
  2124.  
  2125. sub seasonYear {
  2126.     $text = $_[0];  
  2127.    
  2128.     @seasons = ("winter", "spring", "summer", "autumn", "fall");
  2129.    
  2130.     foreach $i (@seasons) {
  2131.         while ($text =~ /\b(($i)(( +)of( +))? ?\,?( ?)\'?(\d{2}|\d{4}))\b/gi) {
  2132.         $f2=$2;
  2133.         $f3=$3;
  2134.         $f6=$6;
  2135.         $f7=$7;
  2136.         if (length($f7)==4) {
  2137.             if (($f7<=$longyear) && ($f7>1900)) {  
  2138.             my $st1 = length($`);
  2139.             my $end1 = (length $f2) + $st1;
  2140.             my $key1 = "$st1-$end1";
  2141.            
  2142.             my $st2 = $end1+(length $f3)+(length $f6);
  2143.             my $key2 = "$st2-".((length $f7) + $st2);
  2144.             addType ($key2, "Year (4 digits)");    
  2145.             }
  2146.         }
  2147.         else {                   
  2148.             my $st1 = length($`);
  2149.             my $end1 = (length $f2) + $st1;
  2150.             my $key1 = "$st1-$end1";
  2151.            
  2152.             my $st2 = $end1+(length $f3)+(length $f6);
  2153.             my $key2 = "$st2-".((length $f7) + $st2);
  2154.             addType ($key2, "Year (4 digits)");                    
  2155.         }
  2156.         }
  2157.     }
  2158.    
  2159. }
  2160. # End of function seasonYear()
  2161. #***********************************************************************************************************
  2162. #***********************************************************************************************************
  2163. #***********************************************************************************************************
  2164. # Function: name2()
  2165. # Checks for more than 1 name following plural drs, drs., doctors, professors,
  2166. # Checks up to 3 names
  2167.  
  2168. sub name2 {
  2169.     $text = $_[0];  
  2170.     if ($namefilter =~ /y/) {
  2171.  
  2172.     @plural_titles = ("doctors", "drs", "drs\.", "professors");
  2173.    
  2174.     foreach $p (@plural_titles) {
  2175.     while ($text =~ /\b((($p +)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+)?\b/gi) {  
  2176.         my %names = ();
  2177.         $st3 = length($`);
  2178.         $st4 = $st3+length($3);
  2179.         $end4 = $st4+length($4);
  2180.         $key4 = "$st4-$end4";
  2181.         $st6 = $st3+length($2);
  2182.         $end6 = $st6+length($6);
  2183.         $key6 = "$st6-$end6";
  2184.         $st8 = $st3+length($1);
  2185.         $end8 = $st8+length($8);
  2186.         $key8= "$st8-$end8";
  2187.         $names{$4} = $key4;
  2188.         $names{$6} = $key6;
  2189.         $names{$8} = $key8;
  2190.         foreach $i (keys %names) {
  2191.         $val = $names{$i};
  2192.         if (length($i)>0) {
  2193.             if (!(isCommonest($i)) || (isType($val, "Name", 1))) {
  2194.             #print "addtype, val is $val\n";
  2195.             addType ($val, "Name5 (PTitle)");
  2196.             }
  2197.         }
  2198.         }
  2199.     }
  2200.     }
  2201.  
  2202.  
  2203.  
  2204.  
  2205. #****************************************************************************************
  2206. # Checks for names followed by "MD" or "M.D"
  2207. # Checks up to 3 previous words
  2208.     while ($text =~ /\b((([A-Za-z\']+) +)?(([A-Za-z])\. +)?([A-Za-z\-\']+)((\, *)|(\s+))(rrt|md|m\.d\.|crt|np|rn|nnp|msw|r\.n\.)(\.|\,)*\b)/ig) {
  2209.    
  2210.     $name = $1;
  2211.     $start = length($`);
  2212.     $end = $start + length($name);
  2213.     $key = "$start-$end";
  2214.    
  2215.     $name1 = $3; #if present, would be first name
  2216.     $name2 = $4;  #if present, would be initial
  2217.     $name3 = $6;  #if present would be last name
  2218.  
  2219.     $st1 = length($`);
  2220.     $end1 = $st1 + (length $name1);
  2221.     $key1 = "$st1-$end1";
  2222.  
  2223.     $st2 = $st1 + length($2);
  2224.     $end2 = $st2 + (length $name2);
  2225.     $key2 = "$st2-$end2";  
  2226.  
  2227.     $st3 = length($`) + (length $2) + (length $4);
  2228.     $end3 = $st3 + (length $name3);
  2229.     $key3 = "$st3-$end3";
  2230.    
  2231.     if ((!($text =~ /(m\.?d\.?\')/)) && (!($text =~ /(m\.?d\.?s)/))) {
  2232.    
  2233.         if (length($name1)>0) {
  2234.             if ((length($name1)==1) || ((length($name1)==2) && ($name1 =~ /\b([A-Za-z])\.+\b/ig))) {  
  2235.             addType($key1, "Name Initial (MD)");
  2236.         }  
  2237.         else {
  2238.             if (isProbablyName($key1, $name1)) {
  2239.             addType($key1,"Name6 (MD)" );
  2240.             }
  2241.         }
  2242.         }
  2243.  
  2244.         if (length($name2)>0 && length($name3)>0) {
  2245.         if ((length($name2)==1) || ((length($name2)==2) && ($name2 =~ /\b([A-Za-z])\.+\b/ig))) {    
  2246.              #addType($key2, "Name Initial (MD)");
  2247.         }  
  2248.         else {
  2249.             #if (!(isCommon($name2) && !(isType($key2, "Name", 1)))) {
  2250.             if (isProbablyName($key2, $name2)) {
  2251.             addType($key2,"Name7 (MD)" );
  2252.             }
  2253.         }
  2254.         }
  2255.  
  2256.         if (length($name3)>0) {
  2257.         if ((length($name3)==1) || ((length($name3)==2) && ($name3 =~ /\b([A-Za-z])\.\b/ig))) {  
  2258.             #addType($key3, "Name Initial (MD)");
  2259.         }  
  2260.         else {
  2261.             if (isProbablyName($key3, $name3)) {   
  2262.            
  2263.             addType($key3,"Name8 (MD)" );
  2264.  
  2265.             }
  2266.         } # end else
  2267.         } #endif
  2268.     } #end if text does not have M.D.' or M.D.s
  2269.     }
  2270.  
  2271.  
  2272. #****************************************************************************************
  2273. # Removes PCP name field, leaving "PCP name:" intact, from discharge summaries
  2274. # Does not check for name patterns, since these should be caught by the other methods
  2275. # Required mainly for unknown names, checks up to 3 words following "PCP Name"
  2276. # Follows the pattern seen in discharge summaries, may not work well in nursing notes
  2277.  
  2278.     #@name_pre = ("PCP", "physician", "provider", "created by", "name");
  2279.     @name_pre = ("PCP", "physician", "provider", "created by");
  2280.    
  2281.     foreach $l (@name_pre) {
  2282.     while ($text =~/\b(($l( +name)?( +is)?\s\s*)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)(((\s*\,*\s*)? *)([A-Za-z\-]+))?)\b/ig) {  
  2283.         my $key1 = $5;
  2284.         my $st1 = length($`)+(length $2);
  2285.         my $end1 = $st1+(length $5);
  2286.         my $keyloc1 = "$st1-$end1";
  2287.         my $key2 = $8;
  2288.         my $st2 = $end1+(length $6);
  2289.         my $end2 = $st2+(length $8);
  2290.         my $keyloc2 = "$st2-$end2";
  2291.         my $key3 = $12;
  2292.         my $st3 = $end2+(length $10);
  2293.         my $end3 = $st3+(length $12);
  2294.         my $keyloc3 = "$st3-$end3";    
  2295.         my %pcp = ();
  2296.         $pcp{$keyloc1} = $key1;
  2297.         $pcp{$keyloc2} = $key2;
  2298.         $pcp{$keyloc3} = $key3;
  2299.        
  2300.         foreach my $keyloc (keys %pcp ) {
  2301.         my $val = $pcp{$keyloc};
  2302.         if (length($val)>0) {
  2303.             if ((length($val)==1) || ($val =~ /\b([A-Za-z])\.\b/ig)) {    
  2304.             addType($keyloc, "Name Initial (PRE)");
  2305.             }  
  2306.             else {
  2307.             #if (!(isCommonest($val) && !(isType($keyloc, "Name", 1)))) {
  2308.             if (isProbablyName($keyloc, $val)){
  2309.            
  2310.                 addType($keyloc,"Name9 (PRE)" );
  2311.             }
  2312.             }
  2313.         }
  2314.         }
  2315.     }
  2316.        
  2317.         #followed by pattern "name is"
  2318.     while ($text =~ /\b(($l( +name)?( +is)? ?([\#\:\-\=\.\,])+ *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)?)\b/ig) {
  2319.         my $key1 = $6;
  2320.         my $st1 = length($`)+(length $2);
  2321.         my $end1 = $st1+(length $6);
  2322.         my $keyloc1 = "$st1-$end1";
  2323.         my $key2 = $9;
  2324.         my $st2 = $end1+(length $7);
  2325.         my $end2 = $st2+(length $9);
  2326.         my $keyloc2 = "$st2-$end2";
  2327.         my $key3 = $12;
  2328.         my $st3 = $end2+(length $10);
  2329.         my $end3 = $st3+(length $12);
  2330.         my $keyloc3 = "$st3-$end3";
  2331.         my %pcp = ();
  2332.         my $firstfound = 0;
  2333.         my $secondfound = 0;
  2334.         $pcp{$keyloc1} = $key1;
  2335.         $pcp{$keyloc2} = $key2;
  2336.         $pcp{$keyloc3} = $key3;
  2337.         $blah = isCommonest($key3);
  2338.         $blah2 = isType($keyloc3, "Name", 1);
  2339.        
  2340.         if (length($key1)>0) {
  2341.         if ((length($key1)==1) || ($key1 =~ /\b([A-Za-z])\.\b/ig)) {
  2342.             addType($keyloc1, "Name Initial (NameIs)");
  2343.             $firstfound = 1;
  2344.         }
  2345.         else {
  2346.             if (isProbablyName($keyloc1, key1)){
  2347.             addType($keyloc1,"Name10 (NameIs)" );
  2348.             $firstfound = 1;
  2349.             }
  2350.         }
  2351.         }
  2352.         if ($firstfound == 1) {
  2353.         if (length($key2)>0) {
  2354.             if ((length($key2)==1) || ($key2 =~ /\b([A-Za-z])\.\b/ig)) {
  2355.             addType($keyloc2, "Name Initial (NameIs)");
  2356.             $secondfound = 1;
  2357.             }
  2358.             else {
  2359.             if (isProbablyName($keyloc2, $key2)){
  2360.                 addType($keyloc2,"Name11 (NameIs)" );
  2361.                 $secondfound = 1;
  2362.             }
  2363.             }
  2364.         }
  2365.         }
  2366.         if ($secondfound == 1) {
  2367.         if (length($key3)>0) {
  2368.             if ((length($key3)==1) || ($key3 =~ /\b([A-Za-z])\.\b/ig)) {
  2369.             addType($keyloc3, "Name Initial (NameIs)");
  2370.             }
  2371.             else {
  2372.             if (isProbablyName ($keyloc3, $key3)){
  2373.                 addType($keyloc3,"Name12 (NameIs)" );
  2374.             }
  2375.             }
  2376.         }
  2377.         }
  2378.     }
  2379.     }
  2380. }
  2381. }
  2382. # End of function name2()
  2383. #***********************************************************************************************************
  2384. #***********************************************************************************************************
  2385. #***********************************************************************************************************
  2386. # Function: providerNumber()
  2387. # Removes "provider number", whole field, from discharge summaries
  2388. # Does not remove the field if there is no number following it
  2389.  
  2390. sub providerNumber {
  2391.     $text = $_[0];      
  2392.     if ($unitfilter =~ /y/) {
  2393.    
  2394.     while ($text =~ /\b(provider(( +)number)?( ?)[\#\:\-\=\s\.]?( ?)(\d+)([\/\-\:](\d+))?)\b/gi) {  
  2395.         my $unit = $1;
  2396.         my $st = length($`);
  2397.         my $key = "$st-".((length $unit) + $st);
  2398.         addType ($key, "Provider Number");
  2399.     }
  2400.     }
  2401. }
  2402. # End of function providerNumber()
  2403. #***********************************************************************************************************
  2404. #***********************************************************************************************************
  2405. #***********************************************************************************************************
  2406. # Function: signatureField()
  2407. # Removes signature fields from discharge summaries
  2408. # signature field taken as 3 or more underscores
  2409. # does not remove doctor names, since these are handled by other name handlers
  2410.    
  2411. sub signatureField {
  2412.     $text = $_[0];  
  2413.  
  2414.     while ($text =~ /\b(\_\_+)\b/gi) {
  2415.    
  2416.     my $sigfield = $1;
  2417.     my $st = length($`);
  2418.     my $key = "$st-".((length $sigfield) + $st);
  2419.     addType ($key, "Signature");
  2420.     }
  2421. }
  2422. # End of function signatureField()
  2423. #***********************************************************************************************************
  2424. #***********************************************************************************************************
  2425. #***********************************************************************************************************
  2426. # Function: location1()
  2427. # Removes complete street addresses only when including one of the terms in @street_add_suff
  2428.  
  2429. sub location1 {
  2430.     $text = $_[0];    
  2431.  
  2432.     if ($locfilter =~ /y/) {
  2433.     #check if ambiguous locations are preceded by any of the location indicators,
  2434.         #if so, add in PHI list
  2435.     foreach $i (@location_indicators){
  2436.      
  2437.         while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
  2438.         #print "Location Match 1 is $1, 2 is $2, 3 is $3\n";
  2439.         my $st = length ($`) +  length ($1) + length ($2);
  2440.                 my $end = $st + length ($3);
  2441.         my $key = "$st-$end";
  2442.         my $word = substr $text, $st, ($end-$st);
  2443.         #print "word is $word\n";
  2444.            
  2445.         if (isType($key, "Location",1)  || (length ($word) > 1 && !(isCommon($word))) ) {  
  2446.            
  2447.             addType ($key, "Location");
  2448.         }
  2449.         }
  2450.     } #end for each i in location indicators
  2451.  
  2452.  
  2453.  
  2454.     #if the company dictionary is loaded, check if any of the ambiguous company names
  2455.     #are preceded by the employment indicators
  2456.     if ($company_list =~/y/){
  2457.         #print "company list is on\n";
  2458.         foreach $i (@employment_indicators_pre){
  2459.         while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
  2460.            
  2461.             $st = length ($`) +  length ($1) + length ($2);
  2462.             $end = $st + length ($3);
  2463.             $key = "$st-$end";
  2464.             my $word = substr $text, $st, ($end-$st);
  2465.            
  2466.             my $tmp = isCommon($word);
  2467.            
  2468.  
  2469.             if (isType($key, "Company",1)|| (length ($word) > 1 && !(isCommon($word)))  ) {
  2470.             #print "adding  $3 key $key as company unambig";
  2471.             addType ($key, "Company");
  2472.             }  
  2473.         }
  2474.         } #end for each i in location indicators
  2475.     }
  2476.  
  2477.  
  2478.  
  2479.     #strict address suffix, case-sensitive match, PHI regardless of ambiguity
  2480.     foreach $i (@strict_street_add_suff) { 
  2481.         #make it a case-sensitive match for street address suffix
  2482.         while ($text =~ /\b(([0-9]+ +)?(([A-Za-z\.\']+) +)?([A-Za-z\.\']+) +\b$i\.?\b)\b/g) {
  2483.  
  2484.         $st = length($`);
  2485.         $end = $st + length($1);
  2486.  
  2487.         #check next segment for apartment, suite, floor #s
  2488.         my $nextSeg = substr $text, $end, 30;
  2489.         #print "check nextSeg for apt and sutie #, seg is  $nextSeg\n";
  2490.         foreach $k (@apt_indicators){
  2491.             if ($nextSeg =~ /\b($k\.?\#? +[\w]+)\b/gi) {
  2492.             $end += length ($`) + length($1);
  2493.             }
  2494.         }
  2495.         $key = "$st-$end";
  2496.         #addType ($key, "Street Address");
  2497.  
  2498.         if (length($3) == 0) {
  2499.             if (!isUnambigCommon($5)) {
  2500.             addType ($key, "Street Address");
  2501.             }
  2502.         }
  2503.         elsif (!((isUnambigCommon($4)) && (isUnambigCommon($5)))) {
  2504.             addType($key, "Street Address");
  2505.         }
  2506.         } # end while
  2507.     } #end foreach
  2508.     }#end if
  2509.  
  2510.     #Non-strict address suffix, case-insensitive match, PHI if no ambiguity
  2511.     if ($locfilter =~ /y/) {
  2512.    
  2513.     foreach $i (@street_add_suff) {
  2514.  
  2515.         while ($text =~ /\b(([0-9]+) +(([A-Za-z]+) +)?([A-Za-z]+) +$i)\b/gi) {
  2516.         $st = length($`);
  2517.         $end = $st + length($1);
  2518.         $key = "$st-$end";
  2519.  
  2520.         if (length($3) == 0) {
  2521.         if (!isUnambigCommon($word)){
  2522.             addType ($key, "Street Address");
  2523.         }
  2524.         }
  2525.         elsif ( ! (isUnambigCommon($4) || isUnambigCommon($5))){
  2526.         addType($key, "Street Address");
  2527.         }
  2528.     }
  2529.     }
  2530. }
  2531. #****************************************************************************************
  2532.     # Removes 2-word location PHI ending with @loc_indicators_suff or preceded by @loc_indicators_pre
  2533.  
  2534.     # Words potentially PHI
  2535.     if ($locfilter =~ /y/) {
  2536.    
  2537.     foreach $i (@loc_indicators_suff) {
  2538.         while ($text =~ /\b(([A-Za-z\-]+)? +)?(([A-Za-z\-]+) + *$i +)\b/ig) {
  2539.         if (!isCommon($4)) {       
  2540.             $st2 = length($`)+length($1);
  2541.             $end2 = $st2 + length($3);
  2542.             $key2 = "$st2-$end2";
  2543.             addType ($key2, "Location");
  2544.            
  2545.             if (length $2>0) {
  2546.             if (!isCommon($2)) {
  2547.                 $st1 = length($`);
  2548.                 $end1 = $st1 + length($2);
  2549.                 $key1 = "$st1-$end1";
  2550.                 addType ($key1, "Location");}
  2551.             }
  2552.            
  2553.         }
  2554.         }  
  2555.     }
  2556.     }
  2557.     if ($locfilter =~ /y/) {
  2558.    
  2559.     # Words most likely PHI
  2560.     foreach $i (@loc_ind_suff_c) {
  2561.        
  2562.         while ($text =~ /\b(([A-Za-z]+ +)?)(([A-Za-z]+)$i+)\b/ig) {
  2563.         if (!isCommon($3)) {       
  2564.             $st2 = length($`)+length($1);
  2565.             $end2 = $st2 + length($3);
  2566.             $key2 = "$st2-$end2";
  2567.             addType ($key2, "Location");
  2568.         }
  2569.         }
  2570.     }
  2571.     }
  2572.    
  2573.     if ($locfilter =~ /y/) {
  2574.    
  2575.     # Words potentially PHI
  2576.     foreach $i (@loc_indicators_pre) {
  2577.         while ($text =~ /\b((($i + *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
  2578.         if (!isCommon($4)) {       
  2579.             $st2 = length($`);
  2580.             $end2 = $st2 + length($3);
  2581.             $key2 = "$st2-$end2";
  2582.             addType ($key2, "Location");
  2583.            
  2584.             if (length $5>0) {
  2585.             if (!isCommon($5)) {
  2586.                 $st1 = length($`)+length($2);
  2587.                 $end1 = $st1 + length($5);
  2588.                 $key1 = "$st1-$end1";
  2589.                 addType ($key1, "Location");
  2590.             }
  2591.             }              
  2592.         }
  2593.         }
  2594.     }
  2595.     } # end if locfilter =~ /y/
  2596.  
  2597.  
  2598.     @universities_pre = ("University", "U", "Univ", "Univ.");
  2599.  
  2600.     #catches "University of", "U of", "Univ of", "Univ. of"
  2601.     if ($locfilter =~ /y/) {
  2602.    
  2603.     # Words potentially PHI
  2604.     foreach $i (@universities_pre) {
  2605.         while ($text =~ /\b((($i +of *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
  2606.         my $tmp = isUSStateAbbre($4);
  2607.  
  2608.         if (isUSStateAbbre($4) || isUSState($4) ||  !isCommon($4) ) {      
  2609.             $st2 = length($`);
  2610.             $end2 = $st2 + length($3);
  2611.             $key2 = "$st2-$end2";
  2612.             addType ($key2, "Location");
  2613.            
  2614.             if (length $5>0) {
  2615.             if (!isCommon($5)) {
  2616.                 $st1 = length($`)+length($2);
  2617.                 $end1 = $st1 + length($5);
  2618.                 $key1 = "$st1-$end1";
  2619.                 addType ($key1, "Location (Universities)");
  2620.             }
  2621.             }              
  2622.         }
  2623.         }
  2624.     }
  2625.     } # end if locfilter =~ /y/
  2626.  
  2627.    
  2628.  
  2629. }
  2630. # End of function location1()
  2631.  
  2632. #***********************************************************************************************************
  2633. #***********************************************************************************************************
  2634. #***********************************************************************************************************
  2635. # Function: location2()
  2636. # Searches for multiple hospital and location terms
  2637.  
  2638. sub location2 {
  2639.     $text = $_[0];
  2640.     if ($locfilter =~ /y/) {
  2641.    
  2642.     foreach $hos (@hospital) {
  2643.        
  2644.         my @hospital_terms = split " ", $hos;
  2645.         $len = 0;
  2646.         foreach $h (@hospital_terms) {
  2647.         if (length($h) != 0) {
  2648.             $len = $len+1;
  2649.             $hos[$len] = $h;
  2650.         }
  2651.         }
  2652.        
  2653.         if ($len == 1) {
  2654.         while ($text =~ /\b($hos[1])\b/ig) {
  2655.             $hospital = $1;
  2656.             $st = length($`);
  2657.             $end = $st + length($hospital);
  2658.             $key = "$st-$end";
  2659.             addType($key, "Hospital1");    
  2660.         }
  2661.         }
  2662.        
  2663.         if ($len == 2) {
  2664.         while ($text =~ /\b($hos[1])( )($hos[2])\b/ig) {
  2665.            
  2666.             $hos1 = $1;
  2667.             $hos2 = $3;
  2668.             $space = $2;
  2669.             $st1 = length($`);
  2670.             $end1 = $st1 + length($hos1);
  2671.             $key1 = "$st1-$end1";
  2672.             addType($key1, "Hospital2");
  2673.             $st2 = $end1 + length($space);
  2674.             $end2 = $st2 + length($hos2);
  2675.             $key2 = "$st2-$end2";
  2676.             addType($key2, "Hospital3");       
  2677.         }
  2678.         }
  2679.        
  2680.         if ($len == 3) {
  2681.         while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])\b/ig) {
  2682.             $hos1 = $1;
  2683.             $hos2 = $3;
  2684.             $hos3 = $5;
  2685.             $st1 = length($`);
  2686.             $end1 = $st1 + length($hos1);
  2687.             $key1 = "$st1-$end1";
  2688.             addType($key1, "Hospital4");
  2689.             $st2 = $end1 + length($2);
  2690.             $end2 = $st2 + length($hos2);
  2691.             $key2 = "$st2-$end2";
  2692.             addType($key2, "Hospital5");
  2693.             $st3 = $end2 + length($4);
  2694.             $end3 = $st3 + length($hos3);
  2695.             $key3 = "$st3-$end3";
  2696.             addType($key3, "Hospital6");
  2697.         }
  2698.         }
  2699.                        
  2700.         if ($len == 4) {
  2701.         while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])( )($hos[4])\b/ig) {
  2702.             $hos1 = $1;
  2703.             $hos2 = $3;
  2704.             $hos3 = $5;    
  2705.             $st1 = length($`);
  2706.             $end1 = $st1 + length($hos1);
  2707.             $key1 = "$st1-$end1";
  2708.             addType($key1, "Hospital");
  2709.             $st2 = $end1 + length($2);
  2710.             $end2 = $st2 + length($hos2);
  2711.             $key2 = "$st2-$end2";
  2712.             addType($key2, "Hospital");
  2713.             $st3 = $end2 + length($4);
  2714.             $end3 = $st3 + length($hos3);
  2715.             $key3 = "$st3-$end3";
  2716.             addType($key3, "Hospital");
  2717.             $st4 = $end3 + length($6);
  2718.             $end4 = $st4 + length($hos4);
  2719.             $key4 = "$st4-$end4";
  2720.             addType($key4, "Hospital");
  2721.         }
  2722.         }          
  2723.     }
  2724.        
  2725.     foreach $loc (@loc_unambig) {
  2726.        
  2727.         my @loc_terms = split " ", $loc;
  2728.         $len = 0;
  2729.         foreach $h (@loc_terms) {
  2730.         if (length($h) != 0) {
  2731.             $len = $len+1;
  2732.             $loc[$len] = $h;
  2733.         }
  2734.         }
  2735.        
  2736.         if ($len == 1) {
  2737.         while ($text =~ /\b($loc[1])\b/ig) {
  2738.             $location = $1;
  2739.             $st = length($`);
  2740.             $end = $st + length($location);
  2741.             $key = "$st-$end";
  2742.             addType($key, "Location");     
  2743.         }
  2744.         }
  2745.        
  2746.         if ($len == 2) {
  2747.         while ($text =~ /\b($loc[1])( )($loc[2])\b/ig) {
  2748.             $loc1 = $1;
  2749.             $loc2 = $3;
  2750.             $st1 = length($`);
  2751.             $end1 = $st1 + length($loc1);
  2752.             $key1 = "$st1-$end1";
  2753.             addType($key1, "Location");
  2754.             $st2 = $end1 + length($2);
  2755.             $end2 = $st2 + length($loc2);
  2756.             $key2 = "$st2-$end2";
  2757.             addType($key2, "Location");    
  2758.         }
  2759.         }
  2760.        
  2761.         if ($len == 3) {
  2762.         while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])\b/ig) {
  2763.             $loc1 = $1;
  2764.             $loc2 = $3;
  2765.             $loc3 = $5;
  2766.             $st1 = length($`);
  2767.             $end1 = $st1 + length($loc1);
  2768.             $key1 = "$st1-$end1";
  2769.             addType($key1, "Location");
  2770.             $st2 = $end1 + length($2);
  2771.             $end2 = $st2 + length($loc2);
  2772.             $key2 = "$st2-$end2";
  2773.             addType($key2, "Location");
  2774.             $st3 = $end2 + length($4);
  2775.             $end3 = $st3 + length($loc3);
  2776.             $key3 = "$st3-$end3";
  2777.             addType($key3, "Location");
  2778.         }
  2779.         }
  2780.                    
  2781.         if ($len == 4) {
  2782.         while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])( )($loc[4])\b/ig) {
  2783.             $loc1 = $1;
  2784.             $loc2 = $3;
  2785.             $loc3 = $5;    
  2786.             $st1 = length($`);
  2787.             $end1 = $st1 + length($loc1);
  2788.             $key1 = "$st1-$end1";
  2789.             addType($key1, "Location");
  2790.             $st2 = $end1 + length($2);
  2791.             $end2 = $st2 + length($loc2);
  2792.             $key2 = "$st2-$end2";
  2793.             addType($key2, "Location");
  2794.             $st3 = $end2 + length($4);
  2795.             $end3 = $st3 + length($loc3);
  2796.             $key3 = "$st3-$end3";
  2797.             addType($key3, "Location");
  2798.             $st4 = $end3 + length($6);
  2799.             $end4 = $st4 + length($loc4);
  2800.             $key4 = "$st4-$end4";
  2801.             addType($key4, "Location");
  2802.         }
  2803.         }
  2804.     }
  2805.  
  2806.         #######
  2807.         #PO Box number
  2808.    
  2809.         while ($text =~ /\b(P[\.]?O[\.]? *Box *[\#]? *[0-9]+)\b/gi) {
  2810.           $location = $1;
  2811.           $st = length($`);
  2812.           $end = $st + length($location);
  2813.           $key = "$st-$end";
  2814.           addType($key, "PO Box");   
  2815.     }
  2816.  
  2817.    
  2818.  
  2819.  
  2820.  
  2821.         ######
  2822.         #Zipcodes
  2823.     foreach $loc (@us_states_abbre) {
  2824.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2825.             $location = $1;
  2826.             $st = length($`);
  2827.             $end = $st + length($location);
  2828.             $key = "$st-$end";
  2829.             addType($key, "State/Zipcode");  
  2830.         }
  2831.  
  2832.     }
  2833.  
  2834.         #Zipcodes with more US states abbreviations
  2835.     foreach $loc (@more_us_states_abbre) {
  2836.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2837.             $location = $1;
  2838.             $st = length($`);
  2839.             $end = $st + length($location);
  2840.             $key = "$st-$end";
  2841.             addType($key, "State/Zipcode");  
  2842.         }
  2843.  
  2844.     }      
  2845.     #Zipcodes with full US state names
  2846.     foreach $loc (@us_states) {
  2847.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2848.             $location = $1;
  2849.             $st = length($`);
  2850.             $end = $st + length($location);
  2851.             $key = "$st-$end";
  2852.             addType($key, "State/Zipcode");  
  2853.         }
  2854.  
  2855.     }
  2856.         ##########
  2857.         #remove US states if filter flag for State is on
  2858.  
  2859.     if ($us_state_filter =~ /y/) {
  2860.  
  2861.       foreach $loc (@us_states) {
  2862.        
  2863.         my @loc_terms = split " ", $loc;
  2864.         $len = 0;
  2865.         foreach $h (@loc_terms) {
  2866.         if (length($h) != 0) {
  2867.             $len = $len+1;
  2868.             $loc[$len] = $h;
  2869.         }
  2870.         }
  2871.        
  2872.         if ($len == 1) {
  2873.         while ($text =~ /\b($loc[1])\b/ig) {
  2874.             $location = $1;
  2875.             $st = length($`);
  2876.             $end = $st + length($location);
  2877.             $key = "$st-$end";
  2878.             addType($key, "State");    
  2879.         }
  2880.         }
  2881.        
  2882.         if ($len == 2) {
  2883.         while ($text =~ /\b(($loc[1])( )($loc[2]))\b/ig) {
  2884.             $location = $1;
  2885.             $st = length($`);
  2886.             $end = $st + length($location);
  2887.             $key = "$st-$end";
  2888.             addType($key, "State");      
  2889.         }
  2890.         }
  2891.        
  2892.         if ($len == 3) {
  2893.         while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3]))\b/ig) {
  2894.             $location = $1;
  2895.             $st = length($`);
  2896.             $end = $st + length($location);
  2897.             $key = "$st-$end";
  2898.             addType($key, "State");
  2899.         }
  2900.         }
  2901.                    
  2902.         if ($len == 4) {
  2903.         while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3])( )($loc[4]))\b/ig) {
  2904.             $location = $1;
  2905.             $st = length($`);
  2906.             $end = $st + length($location);
  2907.             $key = "$st-$end";
  2908.             addType($key, "State");
  2909.            
  2910.         }
  2911.         }
  2912.     }
  2913.       } #end if us_state_filter is on
  2914.  
  2915.  
  2916.  
  2917.  
  2918.  
  2919.  
  2920.  
  2921.  
  2922.         #######
  2923.     # Sub-function: hospitalIndicators()
  2924.     # Searches for hospital indicators and checks if previous and following words are hospitals
  2925.    
  2926.     foreach $h (@hospital_indicators) {
  2927.  
  2928.         while ($text =~ /((([A-Za-z\-\']+)( + *))?(([A-Za-z\-\']+)( + *))?($h\b)(( + *)([A-Za-z\-\']+))?(( + *)([A-Za-z\-\']+))?\b)/ig) {  
  2929.  
  2930.        
  2931.         my $typeadded = 0;
  2932.         $st1 = length($`);
  2933.         $end1 = $st1 + length($3);
  2934.         $key1 = "$st1-$end1";
  2935.         $st2 = $st1 + length($2);
  2936.         $end2 = $st2 + length($6);
  2937.         $key2 = "$st2-$end2";
  2938.         $st3 = $st1 + length($2) + length ($5) + length($8) + length($10);
  2939.         $end3 = $st3 + length($11);
  2940.         $key3 = "$st3-$end3";
  2941.         $st4 = $end3 + length($13);
  2942.         $end4 = $st4 + length($14);
  2943.         $key4 = "$st4-$end4";
  2944.         $st5 = $end2 + length($7);
  2945.         $end5 = $st5 + length($8);
  2946.         $key5 = "$st5-$end5";
  2947.  
  2948.        
  2949.         if (length($5)==0) {
  2950.        
  2951.             if ((length($3) > 1) &&  (!isUnambigCommon($3)) && (!(isCommon ($3)) || (isType ($key1, "Hospital", 1)))) {
  2952.             addType ($key1, "Hospital");
  2953.             #addType ($key5, "Hospital-Ind");
  2954.             $typeadded = 1;
  2955.             }
  2956.         }
  2957.  
  2958.          elsif ((length($6) > 1) && (!isUnambigCommon($6)) && (!(isCommon ($6)) || isUSState($6) || isUSStateAbbre($6) || (isType ($key2, "Hospital", 1)))) {
  2959.    
  2960.             addType ($key2, "Hospital");
  2961.                     #addType ($key5, "Hospital-Ind");
  2962.             $typeadded = 1;
  2963.            
  2964.             if ((length($3) > 1) &&  (!isUnambigCommon($3)) &&  (!(isCommon ($3)) || isUSState($3) || isUSStateAbbre($3) || (isType ($key1, "Hospital", 1)))) {
  2965.             addType ($key1, "Hospital");
  2966.             #addType ($key5, "Hospital-Ind");
  2967.             $typeadded = 1;
  2968.             }
  2969.         }
  2970.  
  2971.         #   #Generating too many false positives.
  2972.         #   #Need a better common word dictionary to enable this.
  2973.         #if ($typeadded == 0) {
  2974.         #    if ((length($11) > 1) && (!(isCommonest ($11)) || (isType ($key3, "Hospital", 1)))) {
  2975.         #   #addType ($key3, "Hospital");
  2976.                 #       #addType ($key5, "Hospital-Ind");
  2977.         #   #if ((length($14) > 1) && (!(isCommonest ($14)) || (isType ($key4, "Hospital", 1)))) {
  2978.         #   #      # addType ($key4, "Hospital");
  2979.         #   #       #addType ($key5, "Hospital-Ind");          
  2980.         #   #   }
  2981.         #    }
  2982.         #} #end if (typeadded == 0)
  2983.         }
  2984.     }
  2985.     }
  2986. }
  2987. # End of function location2()
  2988.  
  2989.  
  2990. #***********************************************************************************************************
  2991. #***********************************************************************************************************
  2992. #***********************************************************************************************************
  2993. # Function: problem()
  2994. # Checks for names preceded by "problem", pattern found in discharge summaries
  2995.  
  2996. sub problem {
  2997.     $text = $_[0];  
  2998.  
  2999.     $k = "problem";
  3000.     $l = ":";
  3001.     while ($text =~ /\b(([A-Za-z\-]+) + *($k))\b/ig) {
  3002.     if ((!isCommon($2)) || (isNameAmbig($2))) {
  3003.         $st = length($`);
  3004.         $end = $st + length($2);
  3005.         $key = "$st-$end";    
  3006.         addType ($key, "Last Name");
  3007.     }
  3008.     }
  3009. }
  3010. # End of function problem()
  3011. #***********************************************************************************************************
  3012. #***********************************************************************************************************
  3013. #***********************************************************************************************************
  3014. # Function: mrn()
  3015. # Checks for medical record numbers, i.e. numbers preceded by "mrn" or "medical record number"
  3016.  
  3017. sub mrn {
  3018.     $text = $_[0];      
  3019.     if ($unitfilter =~ /y/) {
  3020.    
  3021.     while ($text =~ /\b(mrn( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3022.         my $unit = $1;
  3023.         my $st = length($`);
  3024.         my $key = "$st-".((length $unit) + $st);
  3025.         addType ($key, "Medical Record Number");
  3026.     }
  3027.    
  3028.     @numbers = ("number", "no", "num", "");
  3029.    
  3030.     foreach $i (@numbers) {
  3031.         while ($text =~ /\b(medical record( *)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3032.         my $unit = $1;
  3033.         my $st = length($`);
  3034.         my $key = "$st-".((length $unit) + $st);
  3035.         addType ($key, "Medical Record Number");
  3036.         }
  3037.     }
  3038.     }
  3039. }
  3040. # End of function mrn()
  3041. #***********************************************************************************************************
  3042. #***********************************************************************************************************
  3043. #***********************************************************************************************************
  3044. # Function: unit()
  3045. # Checks for unit numbers in discharge summaries
  3046. # Removes the entire field of "unit <number>" or unit <number>/<number>
  3047. # If "unit" is not followed by a number, does not remove it
  3048.  
  3049. sub unit {
  3050.     $text = $_[0];      
  3051.     if ($unitfilter =~ /y/) {
  3052.    
  3053.     @numbers = ("number", "no", "num", "");
  3054.    
  3055.     foreach $i (@numbers) {
  3056.         while ($text =~ /\b(unit( ?)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3057.         my $unit = $1;
  3058.         my $st = length($`);
  3059.         my $key = "$st-".((length $unit) + $st);
  3060.         addType ($key, "Unit Number");
  3061.         }
  3062.     }
  3063.     }
  3064. }
  3065. # End of function unit()
  3066. #***********************************************************************************************************
  3067. #***********************************************************************************************************
  3068. #***********************************************************************************************************
  3069. # Function: name3()
  3070. # Checks every lastnameprefix
  3071. # If the following word is either a name or not commonest, identifies it as lastname
  3072.  
  3073. sub name3 {
  3074.     $text = $_[0];  
  3075.     if ($namefilter =~ /y/) {
  3076.  
  3077.     foreach $line (@prefixes_unambig) {
  3078.     while ($text =~ /\b(($line)([\s\'\-])+ *)([A-Za-z]+)\b/ig) {
  3079.         my $pre = $2;
  3080.         my $prestart = length($`);
  3081.         my $preend = $prestart+(length $pre);
  3082.         my $prekey = "$prestart-$preend";
  3083.         my $lname = $4;
  3084.         my $lstart = $prestart+length($1);
  3085.         my $lend = $lstart+length($4);
  3086.         my $lnamekey = "$lstart-$lend";
  3087.         if ((!(isCommonest ($lname))) || (isType ($lnamekey, "Name", 1))) {
  3088.         addType ($prekey, "Name Prefix (Prefixes)");
  3089.         addType ($lnamekey, "Last Name (Prefixes)");
  3090.         }
  3091.     }
  3092.     }
  3093.        #****************************************************************************************
  3094.    
  3095.     @specific_titles = ("MR", "MISTER", "MS");
  3096.     foreach $i (@specific_titles) {
  3097.        
  3098.         while ($text =~ /\b($i\.( *))([A-Za-z\'\-]+)\b/ig) {
  3099.         $potential_name = $3;
  3100.         $start = length($`)+length($1);
  3101.         $end = $start + length($potential_name);
  3102.         $key = "$start-$end";
  3103.         if (isType($key, "Name", 1)) {
  3104.             addType ($key, "Name13 (STitle)");
  3105.         }
  3106.        
  3107.         elsif (!(isCommon($potential_name))) {
  3108.             addType ($key, "Name14 (STitle)");
  3109.         }
  3110.         }
  3111.     }
  3112.         #****************************************************************************************
  3113.     # Goes through word by word looking for unspotted names
  3114.     # All words have already been marked as potential names (where appropriate) by the previous routines
  3115.     # Looks for last names following titles (Dr. Smith)
  3116.     # Also should pick up "Dr. S"
  3117.     # Looks for the last name prefixes
  3118.    
  3119.     foreach $i (@strict_titles) {      
  3120.       L:
  3121.  
  3122.         while ($text =~ /\b($i\b\.? *)([A-Za-z\'\-]+)\b/ig) { # added ' -
  3123.  
  3124.         my $tt = $1;
  3125.             my $word = $2;
  3126.            
  3127.         my $st = length($`) + length($1);
  3128.         my $fi = $st + length($2);
  3129.         my $key = "$st-$fi";
  3130.         if (exists $prefixes{uc($word)}) {
  3131.    
  3132.             addType ($key, "Last Name (STitle)");
  3133.             my $start = $fi;
  3134.             my $nextWord = substr $text, $start;
  3135.  
  3136.             if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
  3137.            
  3138.             my $token = $4;
  3139.             my $lstart = $start+length($1)+length($2)+length($3);
  3140.             my $lend = $lstart+length($4);
  3141.             my $fi += length($1)+length($4);
  3142.             if (exists $prefixes{uc($token)}){
  3143.                 addType ("$start-$fi", "Last Name (STitle)");
  3144.                 my $start = $fi;
  3145.                 my $nextWord = $nextWord;
  3146.                 my $token = $token;
  3147.                 if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
  3148.                 $word = $3;
  3149.                 $key = "$fi-".($fi + length($2) + length($3));
  3150.                 } else {
  3151.                 next L;
  3152.                 }
  3153.             }
  3154.             else {
  3155.                 # Has already identified one prefix, should now check to see if next word is name or is not commonest  
  3156.                 $word = $token;
  3157.                 $key = "$lstart-$lend";
  3158.            
  3159.                 if (isProbablyName($key, $word)){              
  3160.                 addType ($key, "Last Name (STitle)");
  3161.                 }
  3162.             }
  3163.             } else {
  3164.             next L;
  3165.             }
  3166.         } else { #else $word is not a prefix
  3167.  
  3168.             if ($word =~ /\'([A-Za-z]+)/) {
  3169.             $word = $1;
  3170.             $st--;
  3171.             $key = $st."-".$fi;
  3172.             }
  3173.             if ($word =~ /([A-Za-z]+)\'/) {
  3174.             $word = $1;
  3175.             $key =  $st."-".($fi-1);   
  3176.             }
  3177.         }
  3178.        
  3179.  
  3180.         ###########################################################
  3181.         if (exists $phi{$key}) {
  3182.        
  3183.             addType($key, "Last Name (STitle)");
  3184.             if ((isType($key, "First Name", 1))) {
  3185.        
  3186.                addType($key, "First Name (STitle)");
  3187.             }
  3188.         } else {
  3189.             if (isProbablyName($key, $word)){
  3190.             addType ($key, "Last Name (STitle)");
  3191.        
  3192.             }
  3193.             else {
  3194.             addType ($key, "Last Name (STitle)");
  3195.  
  3196.             }
  3197.         } #end else (!exists $phi{$key})
  3198.  
  3199.  
  3200.  
  3201.         ################################
  3202.                 #added to catch Dr. <firstname> <lastname>
  3203.         #check the word after $word
  3204.         my ($tmpStart,$tmpEnd) = split '-', $key;
  3205.         my $following = substr $text, $tmpEnd;
  3206.  
  3207.         if($following =~/\A(\s+)([A-Za-z\-\']{2,})\b/g){
  3208.             my $fword = $2;
  3209.             my $newst = $tmpEnd + (length ($1));
  3210.             my $nextKey = "$newst-".($newst + length($2));
  3211.    
  3212.             if (isProbablyName($nextKey, $fword)){
  3213.             addType($nextKey, "Name (STitle)");
  3214.             }
  3215.         }
  3216.         ##########################################
  3217.  
  3218.         } #end while text matches the pattern
  3219.     }  #end for each $i strict_titles
  3220.  
  3221.  
  3222.  
  3223.        #****************************************************************************************
  3224.     # Goes through word by word looking for unspotted names
  3225.     # All words have already been marked as potential names (where appropriate) by the previous routines
  3226.     # Looks for last names following titles (Dr. Smith)
  3227.     # Also should pick up "Dr. S"
  3228.     # Looks for the last name prefixes
  3229.    
  3230.        #mark as ambiguous if common words
  3231.     foreach $i (@titles) {     
  3232.       L:
  3233.         while ($text =~ /\b($i\b\.? ?)([A-Za-z]+) *([A-Za-z]+)?\b/ig) {
  3234.    
  3235.         my $tt = $1;
  3236.         my $word = $2;     
  3237.         my $st = length($`) + length($1);
  3238.         my $fi = $st + length($2);
  3239.         my $key = "$st-$fi";
  3240.  
  3241.         my $wordAfter = $3; ##added to catch last names
  3242.         my $stAfter = $fi + 1;
  3243.         my $fiAfter = $stAfter + length ($3);
  3244.             my $keyAfter = "$stAfter-$fiAfter";
  3245.        
  3246.  
  3247.         if (exists $prefixes{uc($word)}) { 
  3248.            
  3249.             addType ($key, "Last Name (Titles)");
  3250.             my $start = $fi;
  3251.             my $nextWord = substr $text, $start;
  3252.             if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
  3253.             my $token = $4;
  3254.             my $lstart = $start+length($1)+length($2)+length($3);
  3255.             my $lend = $lstart+length($4);
  3256.             my $fi += length($1)+length($4);
  3257.             if (exists $prefixes{uc($token)}){
  3258.                 addType ("$start-$fi", "Last Name (Titles)");
  3259.                 my $start = $fi;
  3260.                 my $nextWord = $nextWord;
  3261.                 my $token = $token;
  3262.                 if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
  3263.                 $word = $3;
  3264.                 $key = "$fi-".($fi + length($2) + length($3));
  3265.                 } else {
  3266.                 next L;
  3267.                 }
  3268.             }
  3269.             else {
  3270.                 # Has already identified one prefix, should now check to see if next word is name or is not commonest  
  3271.                 $word = $token;
  3272.                 $key = "$lstart-$lend";
  3273.  
  3274.                 if (isProbablyName($key, $word) && length($word) > 1 ){
  3275.  
  3276.                 addType ($key, "Last Name (Titles)");
  3277.                 }
  3278.             }
  3279.             } else {
  3280.             next L;
  3281.             }
  3282.         } else {
  3283.             if ($word =~ /\'([A-Za-z]+)/) {
  3284.             $word = $1;
  3285.             $st--;
  3286.             $key = $st."-".$fi;
  3287.             }
  3288.             if ($word =~ /([A-Za-z]+)\'/) {
  3289.             $word = $1;
  3290.             $key =  $st."-".($fi-1);
  3291.             }
  3292.         }
  3293.        
  3294.             if (length ($wordAfter) > 1) {
  3295.  
  3296.             my $tmp = isCommon($wordAfter);
  3297.             if (!isCommonest($wordAfter)  ||  (isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)"))  ||
  3298.                 (isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ) {
  3299.            
  3300.             addType($keyAfter, "Last Name (Titles)");
  3301.             addType($key, "First Name (Titles)");
  3302.             }  
  3303.         }
  3304.  
  3305.         elsif (exists $phi{$key}) {
  3306.             if ((isType($key, "Name", 1))) {
  3307.             addType($key, "Last Name (Titles)");
  3308.             }
  3309.         } else {
  3310.             if ( length($word)  > 1 && !(isCommon($word)) ) {
  3311.             addType ($key, "Last Name (Titles)");
  3312.             }
  3313.             else {
  3314.             if (($word =~ /\b[A-Z][a-z]+\b/) || ($tt =~ /$i\. /)) {
  3315.              
  3316.                 addType ($key, "Last Name (Titles  ambig)");
  3317.                
  3318.             } else {
  3319.                 addType ($key, "Last Name (Titles ambig)");
  3320.             }
  3321.             }
  3322.         }
  3323.         }
  3324.     }
  3325.  
  3326.    
  3327.  
  3328.    
  3329.        #****************************************************************************************
  3330.     # Implements simple rules for finding names that aren't in the list or are ambiguous...
  3331.     # first name + last name -> first name + last name (ambig), first name +
  3332.         #not-on-any-safe-word-list, else save the second word and see whether it appears
  3333.         #in the patient text not associated with the first name or any other name indicator
  3334.     # Also first + initial + last name
  3335.     # Finds each prefix, labels the next not uncommonest word    
  3336.     # Finds all first names (unambig), look at following word -> make last name unambigs
  3337.    
  3338.     foreach $k (keys %phi) {
  3339.        if (((isType($k, "Male First Name", 1)) || (isType($k, "Female First Name", 1))) && ((isType($k, "(un)", 1)) || (isType($k, "pop", 1)))) {      
  3340.    
  3341.         my ($start, $end) = split '-', $k;
  3342.         my $following = substr $text, $end;
  3343.        
  3344.         # No middle initial
  3345.            
  3346.         if ($following =~ /\A( +)([A-Za-z\']{2,})\b/g) { #added to catch firstname s.a. O'Connell
  3347.             my $fword = $2;
  3348.             my $st = $end + (length $1);
  3349.             my $nextKey = "$st-".($st + length($2));
  3350.  
  3351.             if (exists $phi{$nextKey}) {
  3352.  
  3353.             if ((isType($nextKey, "Name", 1) == 1) && isProbablyName($nextKey, $fword)) {    
  3354.                 addType($nextKey, "Last Name (NamePattern1)");
  3355.                 addType($k,"First Name4 (NamePattern1)"); # make it unambig
  3356.             }
  3357.             }
  3358.             else {
  3359.             if (isProbablyName($nextKey, $fword)){
  3360.              
  3361.                 addType ($nextKey, "Last Name (NamePattern1)");
  3362.                 addType($k,"First Name5 (NamePattern1)"); }}}# make it unambig
  3363.                
  3364.                 # Looks for that middle initial
  3365.                 if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z\-][A-Za-z\-]+)\b/g) {
  3366.                     my $initial = $2;
  3367.                     my $lastN = $4;
  3368.                     my $st = $end + (length $1);
  3369.                     my $iniKey = "$st-".($st+1);
  3370.                     my $stn = $st + (length $2) + (length $3);
  3371.                     my $nextKey = "$stn-".($stn + (length $4));
  3372.                     if (exists $phi{$nextKey}) {
  3373.                     if ((isType($nextKey, "Last Name", 0) == 0)) {
  3374.                         addType($nextKey, "Last Name (NamePattern1)");
  3375.                         addType($iniKey, "Initial (NamePattern1)");
  3376.                         addType($k,"First Name11 (Name Pattern1)");
  3377.                     }
  3378.                     }
  3379.                     else {         
  3380.                     if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z][A-Za-z]+)\b\s*\Z/g){
  3381.                         addType ($nextKey, "Last Name (NamePattern1)");
  3382.                         addType($iniKey, "Initial (NamePattern1)");
  3383.                         addType($k,"First Name6 (NamePattern1)");
  3384.                     }
  3385.                     elsif (!(isCommonest($lastN))) {
  3386.                         addType ($nextKey, "Last Name (NamePattern1)");
  3387.                         addType($iniKey, "Initial (NamePattern1)");
  3388.                         addType($k,"First Name7 (NamePattern1)");
  3389.                     }
  3390.                     }
  3391.                 }
  3392.         }
  3393.     }
  3394.    
  3395.     # Finds all last names (unambig), looks at proceeding word -> make first names unambigs
  3396.     foreach $k (keys %phi) {
  3397.         if (isType($k, "Last Name", 1) && (isType($k, "(un)", 1))) {
  3398.        
  3399.         my ($start, $end) = split '-', $k;
  3400.         my $preceding = substr $text, 0, $start;
  3401.        
  3402.         if ($preceding =~ /\b([A-Za-z]+)( *)\Z/g) {  
  3403.             my $pword = $1;
  3404.             my $st = length($`);
  3405.             my $prevKey = "$st-".($st + (length $1));
  3406.             if (exists $phi{$prevKey}) {
  3407.             #my $result = isNameIndicator($pword);
  3408.             #print "pword is $pword, isNameIndicator returns $result";
  3409.             #if ((isType($prevKey, "First Name", 1)) && (!isType($prevKey, "Name Indicator", 0))) {
  3410.             if ((isType($prevKey, "First Name", 1)) && (!isNameIndicator($pword)) ) {
  3411.                 addType($prevKey, "First Name8 (NamePattern2)");
  3412.             } # Else it's been positively identified as something that is not a name so leave it
  3413.             }
  3414.             else {
  3415.             # Sees whether it appears in the common words...
  3416.             if (!(isCommon($pword))) {
  3417.                
  3418.                 addType ($prevKey, "First Name9 (NamePattern2)");
  3419.             }
  3420.             }
  3421.         }
  3422.         }
  3423.     }  
  3424.         #****************************************************************************************
  3425.     # Looks for compound last names -> last name + last name (ambig), last name + not-on-any-safe-word-list, last name "-" another word
  3426.     # Last name with an ambiguous name preceding it has already labeled the preceding thing a first name; no huge loss if it's just a weird first part of a compound last name
  3427.    
  3428.     foreach $k (keys %phi) {
  3429.         if (isType($k, "Last Name", 0)) {
  3430.        
  3431.         my ($start, $end) = split '-', $k;
  3432.         my $following = substr $text, $end;
  3433.        
  3434.         if ($following =~ /\A-([A-Za-z]+)\b/g) { #hypen-ated last name
  3435.             my $newend = $end+length($1)+1;
  3436.             my $nextKey = "$end-$newend";
  3437.             addType ($nextKey, "Last Name (NamePattern3)");
  3438.         }
  3439.         if ($following =~ /\A( *)([A-Za-z]+)\b/g) {
  3440.             my $fword = $2;
  3441.         my $st = $end + (length $1);
  3442.             my $nextKey = "$st-".($st + length($2));
  3443.             if (exists $phi{$nextKey}) {
  3444.             if (!(isType($nextKey, "ambig", 1))) {
  3445.             if (isType($nextKey, "Last Name", 0) == 0) {
  3446.                 addType($nextKey, "Last Name (NamePattern3)");
  3447.             }
  3448.             } # Else it's been positively identified as something that is not a name so leaves it
  3449.         }
  3450.             else {
  3451.             # Sees whether it appears in the common words
  3452.             if (!(isCommon($fword))) {
  3453.                 addType ($nextKey, "Last Name (NamePattern3)");
  3454.             }
  3455.             }
  3456.         }
  3457.         }
  3458.     }
  3459.        #****************************************************************************************
  3460.     # Looks for initials
  3461.     # Many last names get classified as first names and other PHI -> looks for initial before all unambig names and locations
  3462.    
  3463.       INI:
  3464.     foreach $k (keys %phi) {
  3465.         if (  ((!(isType($k, "ambig", 1))) ||    isType($k, "(un)",1))   && (isType($k, "Name", 1))) {
  3466.         #if (isType($k, "Name", 1)) {  
  3467.         my ($start, $end) = split '-', $k;
  3468.         my $preceding = substr $text, 0, $start;
  3469.        
  3470.         # Checks for two initials
  3471.    
  3472.         if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {         
  3473.             my $key = (length ($`))."-".(length($`) + (length $1));
  3474.             addType ($key, "Initials (NamePattern4)");
  3475.             if (!(isType($k, "Last Name", 0))) {
  3476.             }
  3477.         }
  3478.        
  3479.         # Checks if preceding word is an initial
  3480.         elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
  3481.        
  3482.             my $tmp = substr $text, $start, $end - $start +1;
  3483.        
  3484.             my $init = $1;
  3485.             my $key = (length ($`))."-".(length($`) + (length $1));
  3486.             if (lc($init) eq "s") {
  3487.             if ((substr $preceding, (length($`) - 1), 1) eq "'") {  #for 's
  3488. #           next INI;
  3489.             }
  3490.             }
  3491.             if ((lc($init) eq "a") || (lc($init) eq "i")) {
  3492.             if (isCommon(substr $text, $start, ($end - $start))) {
  3493. #           next INI;
  3494.             }
  3495.             }
  3496.        
  3497.             if (length($init)==2 || length($init)==1) {
  3498.             addType ($key, "Initials (NamePattern4)");
  3499.             }
  3500.             if (!(isType($k, "Last Name", 0))) {
  3501.             addType ($k, "Last Name (NamePattern4)");
  3502.             }
  3503.         }      
  3504.         }
  3505.     }
  3506.        #****************************************************************************************
  3507.        # Looks for initials; similar to previous code block
  3508.    
  3509.     foreach $k (keys %phi) {
  3510.         if (isType($k, "Last Name", 1) && (!isType ($k, "ambig", 1))) {
  3511.        
  3512.         my ($start, $end) = split '-', $k;
  3513.         my $preceding = substr $text, 0, $start;
  3514.         #two initials (why would they write that?  Why not?) {
  3515.         if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {
  3516.             my $key = (length ($`))."-".(length($`) + (length $1));
  3517.             addType ($key, "Initials (NamePattern5)");
  3518.             if (!(isType($k, "Last Name", 0))) {
  3519.             addType ($k, "Last Name (NamePattern5)");
  3520.             }
  3521.         }
  3522.        
  3523.         elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
  3524.             my $init = $1;
  3525.             my $key = (length ($`))."-".(length($`) + (length $1));
  3526.             if (lc($init) eq "s") {
  3527.             if ((substr $preceding, (length($`) - 1), 1) eq "'") {  #for 's
  3528.                 #next INI;
  3529.             }
  3530.             }
  3531.             if ((lc($init) eq "a") || (lc($init) eq "i")) {
  3532.             if (isCommon(substr $text, $start, ($end - $start))) {
  3533. #           next INI;
  3534.             }
  3535.             }
  3536.    
  3537.         }      
  3538.         }
  3539.     }
  3540.         #****************************************************************************************
  3541.  
  3542.     # Searches for patterns "name and/or," comma list names
  3543.     foreach $k (keys %phi) {
  3544.        
  3545.         if ((isType($k, "Last Name", 0)) || (isType($k, "Male First Name", 0)) || (isType($k, "Female First Name", 0))) {
  3546.         my ($start, $end) = split '-', $k;
  3547.         my $following = substr $text, $end;
  3548.        
  3549.         if ((length $following) == 0) { next; }
  3550.        
  3551.         # First just looks for "and"/"or"
  3552.         if ($following =~ /\A and ([A-Za-z]+)\p{IsPunct}/ig) {
  3553.             my $word = $text1;
  3554.             my $key = ($end + 5)."-".($end + 5 + length($1));
  3555.             if ((isType($key, "Name", 1))  || (!(isCommon($word)))) {
  3556.             addType ($key, "Last Name (NamePattern6)");
  3557.             }
  3558.         }
  3559.         elsif ($following =~ /\A and ([A-Za-z]+)\b/ig) {
  3560.             my $word = $1;
  3561.             my $key = ($end + 5)."-".($end + 5 + length($1));
  3562.             if (!(isCommon($word))) {
  3563.             addType ($key, "Last Name (NamePattern6)");
  3564.             }
  3565.         }
  3566.         elsif ($following =~ /\A or ([A-Za-z]+)\b/ig) {
  3567.             my $word = $1;
  3568.             my $key = ($end + 4)."-".($end + 4 + length($1));
  3569.             if (!(isCommon($word))) {
  3570.             addType ($key, "Last Name (NamePattern6)");
  3571.             }
  3572.         }
  3573.         elsif ($following =~ /\A( ?[\&\+] ?)([A-Za-z]+)\b/ig) {
  3574.             my $word = $2;
  3575.             my $st = $end + (length $1);
  3576.             my $key = "$st-".($st + length($2));
  3577.             if (!(isCommon($word))) {
  3578.             addType ($key, "Last Name (NamePattern6)");
  3579.             }
  3580.         }
  3581.         elsif ($following =~ /\A, ([A-Za-z]+)(,? and )([A-Za-z]+)\b/ig) {
  3582.             # Searches up to 3 names in a list
  3583.             my $name1 = $1;
  3584.             my $name2 = $3;
  3585.             my $st2 = $end + 2 + (length $name1) + length($2);
  3586.             my $key1 = ($end + 2)."-".($end + 2 + (length $name1));
  3587.             my $key2 = "$st2-".($st2 + length($name2));
  3588.             if (!(isCommon($name1))) {
  3589.             addType ($key1, "Last Name (NamePattern6)");
  3590.             }
  3591.             if (!(isCommon($name2))) {
  3592.             addType ($key2, "Last Name (NamePattern6)");
  3593.             }
  3594.         }
  3595.         }
  3596.     }
  3597. }
  3598. }
  3599. # End of function name3()
  3600.  
  3601.  
  3602.  
  3603. #***********************************************************************************************************
  3604. #***********************************************************************************************************
  3605. #***********************************************************************************************************
  3606. # Function: commonHoliday()
  3607. # Searches for some common holiday names that can identify the date
  3608. # Extension: Add new holiday names to this regex
  3609.  
  3610. sub commonHoliday() {
  3611.     $text = $_[0];
  3612.     if ($datefilter =~ /y/) {
  3613.    
  3614.     while ($text =~ /\b(christmas|thanksgiving|easter|hannukah|rosh hashanah|ramadan)\b/ig) {
  3615.         $holidayname = $1;
  3616.         $start = length($`);
  3617.         $end = $start + length($holidayname);
  3618.         $key = $start."-".$end;
  3619.         addType ($key, "Holiday");
  3620.     }
  3621.     }
  3622. }
  3623. # End of function commonHoliday()
  3624. #***********************************************************************************************************
  3625. #***********************************************************************************************************
  3626. #***********************************************************************************************************    
  3627. # Function: knownPatientName()
  3628. # Searches for PID-specific patient names known a priori, i.e. the patient first and last names for this particular PID
  3629. # Indiscriminately removes these PHI from anywhere in the text
  3630. # Extension: To include new PID-patient name mappings, extend the file $patient_file
  3631.  
  3632. sub knownPatientName {
  3633.     $text = $_[0];
  3634.     if ($namefilter =~ /y/) {
  3635.  
  3636.     foreach $i (@known_first_name) {
  3637.         while ($text =~ /\b($i)\b/ig) {
  3638.         my $start = length($`);
  3639.         my $end = $start + length($1);
  3640.         my $key = "$start-$end";
  3641.         addType ($key, "Known patient firstname");
  3642.         }
  3643.     }
  3644.    
  3645.     foreach $j (@known_last_name) {
  3646.         while ($text =~ /\b($j)\b/ig) {
  3647.         my $start = length($`);
  3648.         my $end = $start + length($1);
  3649.         my $key = "$start-$end";
  3650.         addType ($key, "Known patient lastname");
  3651.         }
  3652.     }      
  3653.     }
  3654. }
  3655. # End of function knownPatientName()
  3656.  
  3657.  
  3658.  
  3659.  
  3660. #***********************************************************************************************************
  3661. #***********************************************************************************************************
  3662. #***********************************************************************************************************
  3663. #Returns true if the number passed in matches a defined us area code
  3664. sub isCommonAreaCode  {
  3665.     $areacode = $_[0];
  3666.     return ($us_area_code{$areacode});
  3667.  
  3668. }
  3669.  
  3670.  
  3671.  
  3672. #***********************************************************************************************************
  3673. #***********************************************************************************************************
  3674. #***********************************************************************************************************
  3675. # Function: telephone()
  3676. # Searches for telephone numbers, with or without area codes, with or without extensions
  3677. # Extension: To add new formats, add a new rule
  3678.  
  3679. sub telephone() {
  3680.     $text = $_[0];
  3681.     if ($telfilter =~ /y/) {
  3682.         while ($text =~ /\b\(?\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\)?\b/g) { #added back \b to avoid high fp
  3683.     #while ($text =~ /\b\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\b/g) { #added back \b to avoid high fp
  3684.         my $start = length($`);
  3685.         my $end = $start + length($&);
  3686.  
  3687.         my $nextSeg = substr $text, $end, 20;
  3688.  
  3689.         #catch extensions
  3690.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {       
  3691.         $end += length($1);
  3692.         } elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3693.         $end += length($1);
  3694.         } elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
  3695.         $end += length($1);
  3696.         }
  3697.  
  3698.        
  3699.         my $key = "$start-$end";
  3700.  
  3701.         #now checks the context
  3702.         $context_len = 20;
  3703.         my $start_pos = $start - $context_len;
  3704.         if ($start_pos < 0) {
  3705.         $start_pos = 0;
  3706.         }
  3707.         my $len = $context_len;
  3708.         if  (length ($text) < ($end + $context_len)){
  3709.         $len = length($text) - $end;
  3710.         }
  3711.         my $textBefore = substr $text, $start_pos, ($start - $start_pos);
  3712.         my $textAfter = substr $text, $end, $len;
  3713.         if (isProbablyPhone($textBefore)){
  3714.         addType ($key, "Telephone/Fax (1)");
  3715.         }
  3716.     }
  3717.    
  3718.    
  3719.        #pattern such as ###-###-####
  3720.        #let's not worry about patterns such as ###-HART and ###-LUNG for now
  3721.     #while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{3}\s?[ \-\.\/\=]*\s?\d{4}\b/g) {
  3722.         while ($text =~ /\d{3}\s*[\)\.\/\-\, ]*\s*\d{3}\s*[ \-\.\/]*\s*\d{4}/g) {
  3723.  
  3724.        
  3725.     #if (isCommonAreaCode($1)){
  3726.         my $st = length($`);
  3727.         my $end = $st + length($&);
  3728.  
  3729.         my $nextSeg = substr $text, $end, 20;
  3730.        
  3731.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3732.        
  3733.         $end += length($1);
  3734.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3735.        
  3736.         $end += length($1);
  3737.         }  elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3738.        
  3739.         $end += length($1);
  3740.         } elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
  3741.        
  3742.         $end += length($1);
  3743.         }
  3744.  
  3745.         my $key = "$st-$end";      
  3746.         addType ($key, "Telephone/Fax (2)");
  3747.  
  3748.     }
  3749.  
  3750.     #allow arbitrary line break (almost) anywhere in the phone numbers (except first 3 digit to reduce fp)
  3751.     #only scrubbs the pattern, if it's a known area code
  3752.     while ($text =~ /(\d\d\d)\s*[\)\.\/\-\, ]*\s*\d\s*\d\s*\d\s*[ \-\.\/]*\s*\d\s*\d\s*\d\s*\d/g) {
  3753.        
  3754.         if (isCommonAreaCode($1)){
  3755.         my $st = length($`);
  3756.         my $end = $st + length($&);
  3757.  
  3758.        
  3759.         my $nextSeg = substr $text, $end, 20;
  3760.        
  3761.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3762.        
  3763.             $end += length($1);
  3764.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3765.        
  3766.             $end += length($1);
  3767.         }  elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3768.        
  3769.             $end += length($1);
  3770.         } elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
  3771.        
  3772.             $end += length($1);
  3773.         }
  3774.  
  3775.         my $key = "$st-$end";      
  3776.         addType ($key, "Telephone/Fax (2)");
  3777.         }
  3778.     }
  3779.  
  3780.     #check phone pattern that has 1 extra or 1 less digit at end
  3781.     #in case pattern such as xxx-xxx-xxx?, check if the first 3 digits match with
  3782.         #common area code
  3783.  
  3784.     while (($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{3}\b/g)){
  3785.        
  3786.         #match it with common local area code
  3787.         if (isCommonAreaCode($1)){
  3788.         my $st = length($`);
  3789.         my $end = $st + length($&);    
  3790.        
  3791.         my $nextSeg = substr $text, $end, 20;
  3792.        
  3793.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3794.        
  3795.             $end += length($1);
  3796.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3797.        
  3798.             $end += length($1);
  3799.         }  elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3800.        
  3801.             $end += length($1);
  3802.         } elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
  3803.        
  3804.             $end += length($1);
  3805.         }
  3806.        
  3807.         my $key = "$st-$end";      
  3808.         addType ($key, "Telephone/Fax (3)");
  3809.         } #end if the first 3 digits are area codes
  3810.     }  #end while
  3811.  
  3812.     #check phone pattern that has 1 extra  digit at end
  3813.     #in case pattern such as xxx-xxx-xxxxx, check if the first 3 digits match with
  3814.         #common area code
  3815.     while (
  3816.            ($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{5}\b/g)) {
  3817.  
  3818.        
  3819.         #match it with common local area code
  3820.         if (isCommonAreaCode($1)){
  3821.         my $st = length($`);
  3822.         my $end = $st + length($&);    
  3823.        
  3824.         my $nextSeg = substr $text, $end, 20;
  3825.        
  3826.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3827.        
  3828.             $end += length($1);
  3829.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3830.        
  3831.             $end += length($1);
  3832.         }  elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3833.            
  3834.             $end += length($1);
  3835.         } elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
  3836.            
  3837.             $end += length($1);
  3838.         }
  3839.    
  3840.         my $key = "$st-$end";      
  3841.         addType ($key, "Telephone/Fax (4)");
  3842.         } #end if the first 3 digits are area codes
  3843.     }  #end while
  3844.  
  3845.  
  3846.      #in case typed in pattern such as ###-####-###
  3847.        while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{4}\s?[ \-\.\/\=]*\s?\d{3}\b/g) {
  3848.         my $st = length($`);
  3849.         my $end = $st + length($&);
  3850.         my $nextSeg = substr $text, $end, 20;
  3851.  
  3852.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3853.        
  3854.         $end += length($1);
  3855.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3856.        
  3857.         $end += length($1);
  3858.         }
  3859.         elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3860.        
  3861.         $end += length($1);
  3862.         } elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
  3863.        
  3864.         $end += length($1);
  3865.         }
  3866.  
  3867.  
  3868.         my $key = "$st-$end";      
  3869.        
  3870.         addType ($key, "Telephone/Fax (5)");
  3871.     }
  3872.     }    
  3873. }
  3874. # End of function telephone()
  3875.  
  3876. #***********************************************************************************************************
  3877. #***********************************************************************************************************
  3878. #***********************************************************************************************************
  3879. # Add new function here >>>>>>>>>>
  3880. # Follow format shown here if necessary
  3881. # sub functionName {
  3882. #    $text = $_[0];
  3883. #    while ($text =~ /(<search pattern>)/) {
  3884. #       $startIndex = length($');
  3885. #       $endIndex = $startIndex + length($1);
  3886. #       $phiKey = $startIndex."-".$endIndex;
  3887. #       addType ($phiKey, "Name of PHI Category");
  3888. #   }
  3889. # }
  3890.  
  3891. #***********************************************************************************************************
  3892. #***********************************************************************************************************
  3893. #***********************************************************************************************************
  3894. # Function: pruneKeys()
  3895. # Arguments: hash %keys, string $text
  3896. # Returns: array @keylst
  3897. # Called by: findPHI()
  3898. # Description: Extracts PHI locations from hash %keys, compares each loc with previous loc to prevent overlaps
  3899. # Returns an array of pruned PHI keys
  3900.  
  3901. sub pruneKeys {
  3902.     my ($keyh,$text) = @_;
  3903.     my $lk = "";
  3904.     my ($ls,$le)=(0,0);
  3905.     my ($cs,$ce)=(0,0);
  3906.     my @keylst = ();
  3907.     foreach $k (sort numerically keys %{$keyh}) {
  3908.     #print "prunkey, key = $k  values = \n";
  3909.     $ls = $cs;
  3910.     $le = $ce;
  3911.     ($cs,$ce)= split ('-',$k);
  3912.     if ($cs > $le){push (@keylst,$lk);} # proper relation
  3913.     elsif($cs > $ls){
  3914.         if($ce > $le) {
  3915.         my $stgl = substr($text,$ls,$le-$ls);          
  3916.         my $stgc = substr($text,$cs,$ce-$cs);  
  3917.         $cs = $ls; $k = "$ls-$ce"; $$keyh{$k} = $$keyh{$lk}} # include both transfer types
  3918.         else{$cs = $ls; $ce = $le; $k = $lk;}} # use previous (current in previous)
  3919.     elsif($le > $ce){$cs = $ls; $ce = $le; $k = $lk;} # use previous (current in previous)
  3920.     $lk = $k;}
  3921.     #print "pushing $lk to keylst\n";
  3922.     push (@keylst,$lk); # last one
  3923.     return (@keylst)
  3924.     }    
  3925. # End of pruneKeys()
  3926. #***********************************************************************************************************
  3927. #***********************************************************************************************************
  3928. #***********************************************************************************************************
  3929. # Function: addType()
  3930. # Arguments: string $key ("start-end" of PHI loc), string $type (PHI type)
  3931. # Returns: None
  3932. # Called by: findPHI()
  3933. # Description: Pushes PHI key and type into the hash %phi
  3934. # Keeps track of all possible PHI types for each PHI key
  3935.  
  3936. sub addType {
  3937.  
  3938.     my ($key,$type) = @_;
  3939.     ($st,$end) = split '-',$key;
  3940.     if ($end > $end{$st}) {
  3941.     $end{$st} = $end;
  3942.     }
  3943.     #print "in addType, key is $key\n";
  3944.     push @{$phi{$key}}, $type;
  3945.     $t = (@{$phiT{$key}});
  3946.     $start = $st - 1 - 64;
  3947.     $ending = $end - 1 - 64;
  3948.     return;
  3949. }
  3950. # End of addType()
  3951. #***********************************************************************************************************
  3952. #***********************************************************************************************************
  3953. #***********************************************************************************************************
  3954. # Function: isType()
  3955. # Arguments: string $key ("start-end" of PHI loc), string $type (PHI type), int $pattern (1 if PHI type can be matched with '=~'; 0 otherwise)
  3956. # Returns: 1 if PHI $key is of PHI type $type; 0 otherwise
  3957. # Called by: findPHI()
  3958. # Description: Given a PHI loc, checks its PHI type in the existing PHI hash. If the type in the hash is equal to the given type, then returns 1.
  3959.  
  3960. sub isType {
  3961.     my ($key, $type, $pattern) = @_;
  3962.     foreach $tt (@{$phi{$key}}){
  3963. #   print "isType, tt is $tt key is $key\n";
  3964.     if ($pattern) {
  3965.         if ($tt =~ /$type/) {
  3966.         return 1;
  3967.         }
  3968.     }
  3969.     elsif ($tt eq $type) {
  3970.         return 1;
  3971.     }
  3972.     }
  3973.     return 0;
  3974. }
  3975.  
  3976. # End of isType()
  3977.  
  3978. #***********************************************************************************************************
  3979. #***********************************************************************************************************
  3980. #***********************************************************************************************************
  3981. # Function: isPHIType()
  3982. # Arguments: string $mytype (PHI type),  array of string @phiTypes
  3983. # Returns: 1 if PHI $mytype appears in @phiTypes, 0 otherwise.
  3984. # Called by: deid()
  3985. # Description: Given a PHI type, checks if it appears in @phiTypes, if so, returns 1. Returns 0 otherwise.
  3986.  
  3987. sub isPHIType {
  3988.   my (  $mytype,  @phiTypes) = @_;
  3989.    
  3990.     #foreach $tt (@{$phi{$key}}){
  3991.     foreach $tt (@phiTypes){
  3992.    
  3993.     if ($tt =~ /$mytype/) {
  3994.         return 1;
  3995.     } #end if
  3996.     } #end foreach
  3997.     return 0;
  3998. }
  3999.  
  4000. # End of isPHIType()
  4001.  
  4002.  
  4003. #***********************************************************************************************************
  4004. #***********************************************************************************************************
  4005. #***********************************************************************************************************
  4006. # Function: isCommon()
  4007. # Arguments: string $word
  4008. # Returns: 1 if given word is a common word; 0 otherwise
  4009. # Called by: findPHI()
  4010. # Description: Compares the given word to the common_words association, compiled from dictionary files for common English words and from SNOMED.
  4011. # Returns 1 if given word is in one of those lists, i.e. is a common word.
  4012.  
  4013. sub isCommon {
  4014.     my $word = $_[0];
  4015.     chomp $word;
  4016.     $word = uc($word);
  4017.     return  ($common_words{$word} || $unambig_common_words{$word});
  4018.    
  4019. }
  4020. # End of isCommon()
  4021.  
  4022. #***********************************************************************************************************
  4023. #***********************************************************************************************************
  4024. #***********************************************************************************************************
  4025. # Function: isUnambigCommon()
  4026. # Arguments: string $word
  4027. # Returns: 1 if given word is a really common word or unambig med terms; 0 otherwise
  4028. sub isUnambigCommon {
  4029.     my $word = $_[0];
  4030.     $word = uc($word);
  4031.     return $unambig_common_words{$word};
  4032. }
  4033. # End of isUnambigCommon()
  4034.  
  4035.  
  4036. #***********************************************************************************************************
  4037. #***********************************************************************************************************
  4038. #***********************************************************************************************************
  4039. # Function: isNameAmbig()
  4040. # Arguments: string $word
  4041. # Returns: 1 if given word is an ambiguous name
  4042. # Called by: findPHI()
  4043. # Description: Searches for given word in lists of ambiguous male, female and last names
  4044. # Returns 1 if word is in any of those lists
  4045. sub isNameAmbig {
  4046.     my $word = $_[0];
  4047.     $word = uc($word);
  4048.     return (($male_ambig{$word}) || ($female_ambig{$word}) || ($last_ambig{$word}));
  4049. }
  4050.  
  4051. #***********************************************************************************************************
  4052. #***********************************************************************************************************
  4053. #***********************************************************************************************************
  4054. #Function: isProbablyName()
  4055. #Arguments: string $key, $potential_name
  4056. #Returns true if
  4057. #name is Not a common word  OR
  4058. #name is Unambiguous name OR
  4059. #name maybe ambiguous BUT starts with Capital_letter followed by small_case letter OR
  4060. #name is popular
  4061. sub isProbablyName{
  4062.     my ($key, $potential_name) = @_;
  4063.  
  4064.     if ( (!isCommon($potential_name)) ||
  4065.      ((isType($key, "Name", 1) && isType($key, "(un)"))  ||
  4066.       (isType($key, "Name", 1) && ($potential_name =~ /\b(([A-Z])([a-z]+))\b/g)) ||
  4067.       (isType($key, "popular",1)) )) {
  4068.      
  4069.     return 1;
  4070.     } else {
  4071.     return 0;
  4072.     }
  4073.  
  4074. }
  4075. #***********************************************************************************************************
  4076. #***********************************************************************************************************
  4077. #***********************************************************************************************************
  4078. #Function: isUSStateAbbre
  4079. #Data structure used for us states
  4080. #Returns true if the word is a US state abbreviation
  4081. sub isUSStateAbbre {
  4082.     my $word = $_[0];
  4083.     $word = uc($word);
  4084.  
  4085.     foreach $loc (@us_states_abbre){
  4086.     if ($word =~/\b$loc\b/gi){
  4087.         return 1;
  4088.     }
  4089.     }
  4090.     foreach $loc (@more_us_states_abbre){
  4091.     if ($word =~/\b$loc\b/gi){
  4092.         return 1;
  4093.     }
  4094.     }
  4095.     return 0;
  4096.  
  4097. }
  4098.  
  4099. #***********************************************************************************************************
  4100. #***********************************************************************************************************
  4101. #***********************************************************************************************************
  4102. #Function: isNameIndicator
  4103. #Returns true if the word is a name indicator
  4104. sub isNameIndicator {
  4105.     my $word = $_[0];
  4106.     $word = uc($word);
  4107.  
  4108.     foreach $nam (@name_indicators){
  4109.     #print "nam in name indicators is $nam";
  4110.     if ($word =~/\b$nam\b/gi){
  4111.        
  4112.         return 1;
  4113.     }
  4114.     }
  4115.  
  4116.     return 0;
  4117. }
  4118.  
  4119. #***********************************************************************************************************
  4120. #***********************************************************************************************************
  4121. #***********************************************************************************************************
  4122. #Function: isUSState
  4123. #Returns true if the word is a US State
  4124. sub isUSState {
  4125.     my $word = $_[0];
  4126.     $word = uc($word);
  4127.     #return (($us_states{$word}));
  4128.     foreach $loc (@us_states){
  4129.     if ($word =~/\b$loc\b/gi){
  4130.         return 1;
  4131.     }
  4132.     }
  4133.     return 0;
  4134.  
  4135.  
  4136. }
  4137.  
  4138. #***********************************************************************************************************
  4139. #***********************************************************************************************************
  4140. #***********************************************************************************************************
  4141. # Function: isFirstName()
  4142. # Arguments: string $word
  4143. # Returns: 1 if given word is an ambiguous or unambiguous firstname
  4144. # Called by: findPHI()
  4145.  
  4146. sub isFirstName {
  4147.     my $word = $_[0];
  4148.     #$word = ($word);
  4149.     $word = uc($word);
  4150.     return (($male_ambig{$word}) || ($female_ambig{$word}) || ($male_unambig{$word}) || ($female_unambig{$word}) || ($male_popular{$word}) || ($female_popular{$word}));
  4151. }
  4152. # End of isFirstName()
  4153.  
  4154. #***********************************************************************************************************
  4155. #***********************************************************************************************************
  4156. #***********************************************************************************************************
  4157. # Function: isCommonest()
  4158. # Arguments: string $word
  4159. # Returns: 1 if given word is a commonest word; 0 otherwise
  4160. # Called by: findPHI()
  4161. # Description: Compares the given word to the commonest_words association, compiled from dictionary file for commonest English words.
  4162. # Returns 1 if given word is in that list, i.e. is a commonest word.
  4163.  
  4164. sub isCommonest {
  4165.     my $word = $_[0];
  4166.     $word = uc($word);
  4167.     return ($very_common_words{$word} || isUnambigCommon($word));
  4168. }
  4169.  
  4170. # End of isCommonest()
  4171. #***********************************************************************************************************
  4172. #***********************************************************************************************************
  4173. #***********************************************************************************************************
  4174. # Function: getNote()
  4175. # Arguments: int $patient (patient number), int $note (note number)
  4176. # Returns: string $noteText
  4177. # Called by: deidStats() (obsolete)
  4178. # Description: Given patient and note numbers, looks up header in $header_file
  4179. # If it finds the header, extracts the body of the note, and returns text until the end pattern
  4180. sub getNote{
  4181.  
  4182.     my ($patient, $note) = @_;    
  4183.     open DF, $data_file or die "Cannot open $data_file";
  4184.     my $noteFound = 0;
  4185.     my $noteText = "";
  4186.    
  4187.   D:    
  4188.     # Parses the data file line-by-line to match the header found in the header file
  4189.     while ($line = <DF>) {
  4190.     chomp $line;
  4191.    
  4192.  
  4193.     # If header is found in the text, then matches the end pattern, and sets the body of the note (excluding the header) as the note text
  4194.  
  4195.     if ($line =~ /\b$patient\|\|\|\|$note\|\|\|\|/) {
  4196.         $noteFound = 1;
  4197.         $noteText = "";
  4198.     }
  4199.     else {     
  4200.         if ($noteFound) {
  4201.         if ($line eq "||||END_OF_RECORD"){
  4202.             #$noteText = $noteText."\n".$1;
  4203.             #$noteText = $noteText.$1."\n";
  4204.             $end = $2;
  4205.             last D;
  4206.         }
  4207.         else {
  4208.             $noteText = $noteText.$line."\n";
  4209.         }
  4210.         }
  4211.     }
  4212.     }
  4213.     close DF;
  4214.    
  4215.     # If the note text has zero length, prints an error message
  4216.     if ((length $noteText) == 0) {
  4217.     print("Warning. No text found for Patient $patient, Note $note ");
  4218.     }
  4219.  
  4220.     # Returns the body of the note (everything excluding the header) as the note text
  4221.     return $noteText;
  4222. }
  4223. # End of getNote()
  4224.  
  4225.  
  4226.  
  4227.  
  4228. #***********************************************************************************************************
  4229. #***********************************************************************************************************
  4230. #***********************************************************************************************************
  4231. # Function: isValidDate()
  4232. # Arguments: int $month, int $day, int $year (If the date being validated doesn't specify year, the "year" argument should be "-1")
  4233. # Returns: 1 if given date is valid based on the calendar
  4234. # Called by: findPHI(), isValidDay()
  4235. # Description: Verifies if the given date is valid or not.
  4236.  
  4237. sub isValidDate{
  4238.     my ($month, $day, $year) = @_;
  4239.     if (($year!=-1) && (length $year) == 2) {
  4240.     #if ($year < 30) {
  4241.     if ($year <= $TWO_DIGIT_YEAR_THRESHOLD) {
  4242.         $year = "20".$year;
  4243.     }
  4244.     else {
  4245.         $year = "19".$year;
  4246.     }
  4247.     }
  4248.    
  4249.     #if (($year != -1) && ($year < 1900 || $year > 2030)){
  4250.     #if (($year != -1) && ($year <= $VALID_YEAR_LOW || $year >= $VALID_YEAR_HIGH)){
  4251.     if (($year != -1) && ($year < $VALID_YEAR_LOW || $year > $VALID_YEAR_HIGH)){
  4252.    
  4253.     return 0;
  4254.     }
  4255.    
  4256.     # Invalid months and days
  4257.     if (($month< 1) || ($month > 12) || ($day < 1) || ($day > 31)) {
  4258.     return 0;
  4259.     }
  4260.  
  4261.     # Checks validity of February days
  4262.     if ($month == 2) {
  4263.     if (($year != -1) && (($year % 4) == 0) && ($year != 2000)) {
  4264.         return ($day <= 29);
  4265.     }
  4266.     return ($day <= 28);
  4267.    
  4268.     # Checks validity of months consisting of 30 days
  4269.     }
  4270.     elsif (($month == 4) || ($month == 6) || ($month== 9) || ($month == 11)) {
  4271.     return ($day <= 30);
  4272.     }
  4273.    
  4274.     # Checks validity of months consisting of 31 days
  4275.     return ($day <= 31);
  4276. }
  4277. # End of isValidDate()
  4278. #***********************************************************************************************************
  4279. #***********************************************************************************************************
  4280. #***********************************************************************************************************
  4281. # Function: isValidDay()
  4282. # Arguments: int $day, string $month, int $year
  4283. # Returns: 1 if given date is valid; 0 otherwise
  4284. # Called by: findPHI()
  4285. # Description: Verifies validity of date when month is printed, by calling isValidDate()
  4286.  
  4287. sub isValidDay {
  4288.     my ($day, $month, $year) = @_;
  4289.     my $mnum = 0;
  4290.  
  4291.     # Converts printed out months to numerical months
  4292.     if ($month =~ /Jan|January|Januar/i) { $mnum = 1; }
  4293.     elsif ($month =~ /Feb|February|Februar/i) { $mnum = 2; }
  4294.     elsif ($month =~ /Mar|March|Maerz/i) { $mnum = 3; }
  4295.     elsif ($month =~ /Apr|April/i) { $mnum = 4; }
  4296.     elsif ($month =~ /May|Mai/i) { $mnum = 5; }
  4297.     elsif ($month =~ /June|Jun|Juni/i) { $mnum = 6; }
  4298.     elsif ($month =~ /July|Jul|Juli/i) { $mnum = 7; }
  4299.     elsif ($month =~ /August|Aug/i) { $mnum = 8; }
  4300.     elsif ($month =~ /September|Sept|Sep/i) { $mnum = 9; }
  4301.     elsif ($month =~ /October|Oct/i) { $mnum = 10; }
  4302.     elsif ($month =~ /November|Nov/i)  { $mnum = 11; }
  4303.     elsif ($month =~ /December|Dec/i) { $mnum = 12; }
  4304.     if ($mnum == 0) { return 0; }
  4305.    
  4306.     return (isValidDate($mnum, $day, $year));
  4307. }
  4308. # End of isValidDay()
  4309.  
  4310.  
  4311. #***********************************************************************************************************
  4312. #***********************************************************************************************************
  4313. #***********************************************************************************************************
  4314. # Function: outputText()
  4315. # Arguments: hash %deids (key=PHI start index, value=PHI end index), hash %phiT (key=PHI start-end indices, value=PHI type)
  4316. # Returns: None
  4317. # Called by: deid()
  4318. # Description: Creates the de-identified version of the text
  4319. # Replaces dates with shifted dates, and other PHI with their PHI types
  4320. sub outputText {
  4321.     my %deids = %{$_[0]};
  4322.     my %phiT = %{$_[1]};
  4323.  
  4324.     # These are the date and PID of the medical record. The year needs to be initialized.
  4325.    # my $checkYear = "2005";
  4326.     my $checkYear;
  4327.     my $checkDate;
  4328.     my $checkID;
  4329.  
  4330.     # Forms associations between printed months and numerical ones
  4331.     # This is because the date-shifting function only accepts numerical months, and months in the text can be non-numerical as well
  4332.     my %months;
  4333.     $months{"jan"} = 1;
  4334.     $months{"feb"} = 2;
  4335.     $months{"mar"} = 3;
  4336.     $months{"apr"} = 4;
  4337.     $months{"may"} = 5;
  4338.     $months{"jun"} = 6;
  4339.     $months{"jul"} = 7;
  4340.     $months{"aug"} = 8;
  4341.     $months{"sep"} = 9;
  4342.     $months{"oct"} = 10;
  4343.     $months{"nov"} = 11;
  4344.     $months{"dec"} = 12;
  4345.  
  4346.     #TF is the .res file (de-identified corpus), and OF is the (.phi file) list of PHIs
  4347.     # open TF, ">$deid_text_file" or die "Cannot open $deid_text_file";
  4348.     open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file";   #now open in append mode
  4349.     my $lastEnd = 0;
  4350.     #open OF, ">>$output_file_old" or die "Cannot open $output_file_old";
  4351.  
  4352.     my $phiType = "";
  4353.  
  4354.     # Prints the PHI locations to the output file (.phi)
  4355.     foreach $k (sort numerically keys %deids) {        
  4356.     my @deidsval = @{$deids{$k}};
  4357.     local $added = 0;
  4358.     my $phiID;
  4359.  
  4360.     # Loops over each PHI recorded in %deids
  4361.     # %deids maps each PHI start index to an array of 3 items
  4362.     # These items are in order: the PHI end index, the PID of the PHI, the record date of the PHI    
  4363.     #if (exists ${@{$deids{$k}}}[0]) {
  4364.     if (exists $deidsval[0]){
  4365.         # Sets some key variables for the PHI
  4366.  
  4367.         #the following no longer works with perl v5.10
  4368.         #$deidsend = ${@{$deids{$k}}}[0]; # End index
  4369.         #$checkID = ${@{$deids{$k}}}[1]; # PID
  4370.         #$checkDate = ${@{$deids{$k}}}[2]; # Record date
  4371.  
  4372.         $deidsend = $deidsval[0]; # End index
  4373.         $checkID = $deidsval[1]; # PID
  4374.         $checkDate = $deidsval[2]; # Record date
  4375.         $checkYear = (substr $checkDate, 6, 4); # Record year
  4376.         if (length($checkYear) ==0) {
  4377.         $checkYear = extractYear($DEFAULT_DATE);
  4378.         print "Warning, in outputText(), cannot extract year from noteDate, setting year to default year $checkYear.";
  4379.         }
  4380.  
  4381.         # Immediately prints the start and end indices of the PHI to the .phi file
  4382.         #print OF "$k\t$deidsend\n";
  4383.  
  4384.         # Sets the $key to the current PHI
  4385.         my $key = $k."-".$deidsend;
  4386.         my $lastlast = $lastEnd;
  4387.         my $phiText;
  4388.        
  4389.         # If this PHI is a date element, shifts the date and replaces it in the text
  4390.         # Output format is YYYY/MM/DD always
  4391.         # For month/date formats, assumes that year = record year; for month/year formats, assumes that day = 1
  4392.  
  4393.         # Needs to go over %phiT for every PHI in %deids
  4394.         # %phiT maps each PHI key to its type, e.g. "Mary" -> "First Name"
  4395.         # This part is necessary because re-identification depends on the PHI's type
  4396.         foreach $ky (sort numerically keys %phiT) {
  4397.  
  4398.         my $ky = $ky;
  4399.         ($startp, $endp) = split "-", $ky;
  4400.         $notAmbig = 0; 
  4401.        
  4402.         # Checks to see if PHI type matches any of the date patterns
  4403.         # Each PHI may have more than one listed type, e.g. First Name AND Last Name
  4404.         # For each PHI type listed for the specific PHI
  4405.         foreach $t (@{$phiT{$ky}}) {
  4406.  
  4407.             $datephi1 = "Year/Month/Day"; # e.g. 1999/2/23
  4408.             $datephi2 = "Year/Day/Month"; # e.g. 1999/23/2, note: pattern currently not filtered in sub date()!
  4409.             $datephi3 = "Month/Day/Year"; # e.g. 2/23/1999
  4410.             $datephi5 = "Day/Month/Year"; # 23/2/1999, note: pattern currently not filtered in sub date()!
  4411.             $datephi4 = "Month/Day"; # e.g. 2/23, using record year as year
  4412.             $datephi6 = "Day/Month"; # e.g. 23/2, using record current year as year, note: pattern currently not filtered in sub date()!
  4413.             $datephi7 = "Month/Year";  # e.g. 2/1999, using 1 as day
  4414.             $datephi8 = "Year/Month";  # e.g. 1999/2, using 1 as day
  4415.             $datephi9 = "Day Month Year"; # e.g. 23 february 1999
  4416.             $datephi10 = "Month Day Year"; # e.g. feb 23 1999 or feb. 23rd 1999
  4417.             $datephi11 = "Month Day"; # e.g. feb 23, using record year as year
  4418.             $datephi12 = "Day Month"; # e.g. 23 february, using record year as year
  4419.             $datephi13 = "Month Year"; # e.g. feb 1999, or february of 1999 or feb. 1999, using 1 as day
  4420.             $datephi14 = "Header Date"; # not important
  4421.             $datephi15 = "4 digits"; # 4-digit year, e.g. 1999
  4422.             $datephi16 = "2 digits"; # 2-digit year, e.g. '99
  4423.             $datephi17 = "Day Month Year 2"; # e.g. 23rd february 1999
  4424.  
  4425.            
  4426.             #if ($ky =~/$key/) {$phiType = $t;}
  4427.             if ($ky eq $key) {$phiType = $t;}
  4428.  
  4429.             # Calls the date-shifting function alterdate() with a date argument appropriate for the date pattern
  4430.             # This is because alterdate() accepts an argument of a fixed date pattern
  4431.             # Prints the resulting shifted date in deid_text_file (.res)
  4432.            
  4433.             # If the current PHI has not been output to the .res file yet, checks if PHI type is date
  4434.             # Shifts the date and writes the shifted date to TF (.res file)
  4435.             if ($added == 0) {
  4436.            
  4437.             #if (($t =~ /$datephi1/)  && ($ky=~/$key/)){
  4438.             if (($t =~ /$datephi1/)  && ($ky eq $key)){        
  4439.                 $date = (substr $allText, $k, ($deidsend-$k));             
  4440.                 $date =~ s/\-/\//g;    
  4441.                 $date =~ s/\./\//g;
  4442.                 $altereddate = &alterdate($date, $checkID);
  4443.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4444.                 $longyear = $1;
  4445.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";                        
  4446.                 $added = 1;
  4447.                 $lastEnd = $deidsend;
  4448.                
  4449.             }
  4450.             elsif (($t =~ /$datephi17/)  && ($ky eq $key)){  
  4451.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4452.                
  4453.                 $date =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b([A-Za-z]+)\.?,?)\s+(\d{2,4}))\b/ig; # 12-Apr, or Second of April
  4454.  
  4455.                 $mon = $6;
  4456.                 $day = $3;
  4457.                 $year = $7;
  4458.                 foreach $m (sort keys %months) {
  4459.                 if ($mon =~ /$m/ig) {
  4460.                     $month = $months{$m};
  4461.                 }
  4462.                 }              
  4463.                 $date =  "$year/$month/$day";
  4464.  
  4465.                 $altereddate = &alterdate($date, $checkID );
  4466.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4467.                
  4468.                 $added = 1;
  4469.                 $lastEnd = $deidsend;
  4470.             }              
  4471.             #elsif (($t =~ /$datephi9/)  && ($ky=~/$key/)){
  4472.             elsif (($t =~ /$datephi9/)  && ($ky eq $key)){  
  4473.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4474.                 $date =~ /(\d+) ([A-Za-z]+)\,? (\d+)/;
  4475.                 $mon = $2;
  4476.                 $day = $1;
  4477.                 $year = $3;
  4478.                 foreach $m (sort keys %months) {
  4479.                 if ($mon =~ /$m/ig) {
  4480.                     $month = $months{$m};
  4481.                 }
  4482.                 }              
  4483.                 $date =  "$year/$month/$day";
  4484.                 $longyear = $5;
  4485.                 $altereddate = &alterdate($date, $checkID );
  4486.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4487.                
  4488.                 #local $added = 1;
  4489.                 $added = 1;
  4490.                 $lastEnd = $deidsend;
  4491.             }          
  4492.             #elsif (($t =~ /$datephi10/)  && ($ky=~/$key/)){  
  4493.             elsif (($t =~ /$datephi10/)  && ($ky eq $key)){
  4494.  
  4495.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4496.                
  4497.                 $date =~ /\b(([A-Za-z]+)\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?(\d{2,4}))\b/ig;
  4498.                 $mon = $2;
  4499.                 $day = $3;
  4500.                 $year = $5;
  4501.                 #$date =~ /([A-Za-z]+) (\d+)\,? (\d+)/;
  4502.                 #$mon = $1;
  4503.                 #$day = $2;
  4504.                 #$year = $3;
  4505.                 #print "DatePHI10: Before date shift: month= $mon, day = $day , year = $year \n";
  4506.  
  4507.                 foreach $m (sort keys %months) {
  4508.                 if ($mon =~ /$m/ig) {
  4509.                     $month = $months{$m};
  4510.                 }
  4511.                 }
  4512.                 $date =  "$year/$month/$day";
  4513.                 $altereddate = &alterdate($date, $checkID);
  4514.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4515.                
  4516.                 #local $added = 1;
  4517.                 $added = 1;
  4518.                 $lastEnd = $deidsend;
  4519.             }                  
  4520.             #elsif (($t =~ /$datephi2/)  && ($ky=~/$key/)){  
  4521.             elsif (($t =~ /$datephi2/)  && ($ky eq $key)){  
  4522.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4523.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4524.                 $date =  "$1/$5/$3";
  4525.                 $longyear = $1;
  4526.                 $altereddate = &alterdate($date, $checkID);
  4527.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4528.                 #local $added = 1;
  4529.                 $added = 1;
  4530.                 $lastEnd = $deidsend;
  4531.             }      
  4532.             #elsif (($t =~ /$datephi3/)  && ($ky=~/$key/)){  
  4533.             elsif (($t =~ /$datephi3/)  && ($ky eq $key)){  
  4534.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4535.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4536.                 $date =  "$5/$1/$3";
  4537.                 $altereddate = &alterdate($date,  $checkID);
  4538.                 $longyear = $5;
  4539.                
  4540.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4541.                 $added = 1;
  4542.                 $lastEnd = $deidsend;              
  4543.             }
  4544.             #elsif (($t =~ /$datephi5/)  && ($ky=~/$key/)){    
  4545.             elsif (($t =~ /$datephi5/)  && ($ky eq $key)){                 
  4546.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4547.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4548.                 $date =  "$5/$3/$1";
  4549.                 $altereddate = &alterdate($date, $checkID );
  4550.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4551.                 $longyear = $5;
  4552.                 $added = 1;
  4553.                 $lastEnd = $deidsend;              
  4554.             }  
  4555.             #elsif (($t =~ /$datephi4/)  && ($ky=~/$key/)){  
  4556.             elsif (($t =~ /$datephi4/)  && ($ky eq $key)){                 
  4557.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4558.                 $date =~ s/\-/\//g;    
  4559.                 $date = "$checkYear/$date";            
  4560.                 $altereddate = &alterdate($date, $checkID);
  4561.                 $altereddate = (substr $altereddate, 5, (length($altereddate)-5));
  4562.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4563.                 $added = 1;
  4564.                 $lastEnd = $deidsend;              
  4565.             }  
  4566.             #elsif (($t =~ /$datephi11/)  && ($ky=~/$key/)){  
  4567.             elsif (($t =~ /$datephi11/)  && ($ky eq $key)){                
  4568.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4569.                 #$date =~ /([A-Za-z]+) (\d+)/;
  4570.                 #$mon = $1;
  4571.                 #$day = $2;
  4572.                 $date =~ /\b(([A-Za-z]+)\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig;  # Apr. 12
  4573.                 $mon = $2;
  4574.                 $day = $3;
  4575.                
  4576.                 foreach $m (sort keys %months) {
  4577.                 if ($mon =~ /$m/ig) {
  4578.                     $month = $months{$m};
  4579.                 }
  4580.                 }
  4581.                 $date =  "$checkYear/$month/$day";
  4582.                 $altereddate = &alterdate($date,  $checkID);
  4583.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));              
  4584.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4585.                 $added = 1;
  4586.                 $lastEnd = $deidsend;
  4587.             }
  4588.                
  4589.             #elsif (($t =~ /$datephi6/)  && ($ky=~/$key/)){  
  4590.             elsif (($t =~ /$datephi6/)  && ($ky eq $key)){                 
  4591.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4592.                 $date =~ /(\d+)(.)(\d+)/;
  4593.                 #print "DatePHI6: date is $date, 3 is $3, 1 is $1\n";
  4594.                 $date =  "$3/$1";
  4595.                 $date = "$checkYear/$date";
  4596.                 $altereddate = &alterdate($date,  $checkID);
  4597.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));
  4598.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4599.                 $added = 1;
  4600.                 $lastEnd = $deidsend;              
  4601.             }
  4602.             #elsif (($t =~ /$datephi12/)  && ($ky=~/$key/)){    
  4603.             elsif (($t =~ /$datephi12/)  && ($ky eq $key)){                
  4604.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4605.                 #$date =~ /(\d+) ([A-Za-z]+)/;
  4606.                 #$mon = $2;
  4607.                 #$day = $1;
  4608.                 $date =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b([A-Za-z]+))\b/ig;
  4609.                
  4610.                 $mon = $5;
  4611.                 $day = $2;
  4612.                 #print "month is $month, day is $day\n";
  4613.                 foreach $m (sort keys %months) {
  4614.                 if ($mon =~ /$m/ig) {
  4615.                     $month = $months{$m};
  4616.                 }
  4617.                 }
  4618.                 $date =  "$checkYear/$month/$day";
  4619.                 $altereddate = &alterdate($date,  $checkID);
  4620.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));              
  4621.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4622.                 $added = 1;
  4623.                 $lastEnd = $deidsend;
  4624.             }
  4625.             #elsif (($t =~ /$datephi13/)  && ($ky=~/$key/)){  
  4626.             elsif (($t =~ /$datephi13/)  && ($ky eq $key)){  
  4627.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4628.                 $date =~ /([A-Za-z]+)(\.)?(\s+)(of\s+)?(\d+)/ig;
  4629.                 $mon = $1;
  4630.                 $year = $5;
  4631.                 foreach $m (sort keys %months) {
  4632.                 if ($mon =~ /$m/ig) {              
  4633.                     $month = $months{$m};
  4634.                 }
  4635.                 }
  4636.  
  4637.                 $date = "$year/$month/1";
  4638.                 $longyear = $3;
  4639.                 $altereddate = &alterdate($date, $checkID );
  4640.                
  4641.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4642.                 $added = 1;
  4643.                 $lastEnd = $deidsend;
  4644.             }
  4645.             #elsif (($t =~ /$datephi7/)  && ($ky=~/$key/)){    
  4646.             elsif (($t =~ /$datephi7/)  && ($ky eq $key)){                 
  4647.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4648.                 $date =~ /(\d+)(.)(\d+)/;      
  4649.                 $date =  $3.'/'.$1.'/1';
  4650.                 $altereddate = &alterdate($date,  $checkID);
  4651.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4652.                 $longyear = $3;
  4653.                 $added = 1;
  4654.                 $lastEnd = $deidsend;
  4655.             }
  4656.             #elsif (($t =~ /$datephi8/)  && ($ky=~/$key/)){  
  4657.             elsif (($t =~ /$datephi8/)  && ($ky eq $key)){                 
  4658.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4659.                 $date =~ /(\d+)(.)(\d+)/;      
  4660.                 $date =  $1.'/'.$3.'/1';
  4661.                 $altereddate = &alterdate($date,  $checkID);
  4662.                 $longyear = $1;
  4663.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4664.                 $added = 1;
  4665.                 $lastEnd = $deidsend;
  4666.             }
  4667.             #elsif (($t =~ /$datephi14/)  && ($ky=~/$key/)) {
  4668.             elsif (($t =~ /$datephi14/)  && ($ky eq $key)) {
  4669.                 $date = (substr $allText, $k, ($deidsend-$k));
  4670.                 $date =~ /(\d+)(\-)(\d+)(\-)(\d+)/;
  4671.                 $date = $1.'/'.$3.'/'.$5;
  4672.                 $altereddate = &alterdate($date,  $checkID);
  4673.                 $longyear = $1;
  4674.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."$date";
  4675.                 $added = 1;
  4676.                 $lastEnd = $deidsend;
  4677.             }
  4678.             #elsif (($t =~ /$datephi15/)  && ($ky=~/$key/)) {
  4679.             elsif (($t =~ /$datephi15/)  && ($ky eq $key)) {
  4680.                 $year  = (substr $allText, $k, ($deidsend-$k));
  4681.                 $date = $year.'/1/1';
  4682.                 $altereddate = &alterdate($date, $checkID);
  4683.                 $alteredyear = (substr $altereddate, 0, 4);
  4684.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
  4685.                 $added = 1;
  4686.                 $lastEnd = $deidsend;
  4687.             }  
  4688.             #elsif (($t =~ /$datephi16/)  && ($ky=~/$key/)) {
  4689.             elsif (($t =~ /$datephi16/)  && ($ky eq $key)) {
  4690.                 $year  = (substr $allText, $k, ($deidsend-$k));
  4691.  
  4692.                 #if ($year > 20) {
  4693.                 #   $year = '19'.$year;
  4694.                 #} else {
  4695.                 #   #$year = '20.'.$year;  
  4696.                 #   $year = '20'.$year;
  4697.                 #    }
  4698.              
  4699.                 $date = $year.'/1/1';
  4700.  
  4701.                 $altereddate = &alterdate($date, $checkID);
  4702.                 $alteredyear = (substr $altereddate, 2, 2);
  4703.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
  4704.                 $added = 1;
  4705.                 $lastEnd = $deidsend;
  4706.             }
  4707.             # If the current PHI is not a date, then indicates that it has not been output yet
  4708.             else { my $added = 0;}         
  4709.             }
  4710.         }
  4711.         }    
  4712.        
  4713.         # If the PHI is not a date, replaces it in deid_text_file (.res) by its PHI type tag
  4714.         if ($added==0) {
  4715.        
  4716.         if ($k > $lastEnd || ($k==0)) {
  4717.             $phiText = (substr $allText, $k, ($deidsend-$k));
  4718.        
  4719.  
  4720.             # Parentheses are eliminated so that they do not trip up the run
  4721.             $phiText =~ s/\(//g;
  4722.             $phiText =~ s/\)//g;
  4723.             $phiText =~ s/\+//g;       
  4724.            
  4725.             $found = 0;
  4726.  
  4727.             # Assigns a unique ID to each PHI, e.g. all instances of "Mary" may be assigned "1", but "John" may be assigned "2"
  4728.             # %ID maps each PHI (e.g. "Mary") to its ID (e.g. "1")
  4729.             foreach $phik (keys %ID) {
  4730.             if ($phik =~/$phiText/ig) {
  4731.                 $found = 1;
  4732.             }
  4733.             }
  4734.  
  4735.             # If the current PHI to be added to .res file is already recorded in %ID, then retrieves its unique ID
  4736.             if ($found==1) {
  4737.             $phiID = $ID{$phiText};
  4738.             }
  4739.  
  4740.             # If the current PHI is not recorded in %ID, records it in %ID and assigns the PHI a unique ID
  4741.             else {
  4742.             $ID{$phiText} = keys(%ID) + 1;
  4743.             $phiID = $ID{$phiText};
  4744.             }
  4745.                        
  4746.             # Prints the PHI type and PHI ID in place of the original PHI in the .res file
  4747.             print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$phiType $phiID**]";
  4748.             $lastEnd = $deidsend;
  4749.         }
  4750.  
  4751.         else {
  4752.             if ($lastEnd < $deidsend) {
  4753.             $lastEnd = $deidsend;
  4754.             }
  4755.         }
  4756.        
  4757.         if ($lastEnd == 0) {
  4758.             $lastEnd = $lastlast;
  4759.         }
  4760.         }      
  4761.     }
  4762.     }
  4763.     #close OF;
  4764.     #print "Finished outputing to the .phi file.";
  4765.    
  4766.     # Prints the remaining non-PHI text to the .res file    
  4767.     print TF (substr $allText, $lastEnd);  
  4768.     #print "finished outputing to the .res file";
  4769.  
  4770.     close TF;  
  4771. }
  4772. # End of outputText()
  4773.  
  4774.  
  4775. #***********************************************************************************************************
  4776. #***********************************************************************************************************
  4777. #***********************************************************************************************************
  4778. # Function: alterdate()
  4779. # Arguments: string $date (yyyy/mm/dd), int $pid (patient ID)
  4780. # Returns: string $year-$month-$day
  4781. # Called by: outputText()
  4782. # Description: Converts given date to shifted date, depending on date-shift-mode, by calling doalterdate().
  4783. # Returns shifted date
  4784.  
  4785. sub alterdate {
  4786.     # Separates date fields by splitting along "/" or "-"
  4787.     # Then calls doalterdate on the resulting array of date elements
  4788.     $d = $_[0];  
  4789.     if (substr($d,0,6) =~ /(\/)/) {
  4790.     @d = split '/', $d;
  4791.     }
  4792.     elsif (substr($d,0,6) =~ /(\-)/) {
  4793.     @d = split '-', $d;
  4794.     }
  4795.  
  4796.     ($entryyear, $entrymonth)=@d[0..1];
  4797.     return join "-", &doalterdate(@d, $_[1]);
  4798. }
  4799. # End of alterdate()
  4800. #***********************************************************************************************************
  4801. #***********************************************************************************************************
  4802. #***********************************************************************************************************
  4803. # Function: doalterdate()
  4804. # Arguments: int $year, int $month, int $day, int $pid (patient ID)
  4805. # Returns: array @($year, $month, $day)
  4806. # Called by: alterdate()
  4807. # Description: Converts given date, depending on date-shift-mode, to shifted date.
  4808. # Returns shifted date in an array format to calling function alterdate(). alterdate() converts the shifted date to a string.
  4809.  
  4810. sub doalterdate {
  4811.     my $year=$_[0];
  4812.     if (length $year == 2) {
  4813.  
  4814.             # Limits 2-digit years between 1900 and 2020
  4815.         # Converts them to 4-digit years
  4816.         #if ($year<=10) {$year = 2000+$year;}  
  4817.         #if ($year<=20) {$year = 2000+$year;}
  4818.         if ($year<=$TWO_DIGIT_YEAR_THRESHOLD) {$year = 2000+$year;}
  4819.         else {$year = 1900+$year;}
  4820.     }
  4821.     my $month=$_[1];
  4822.     my $day=$_[2];
  4823.     my $pid=$_[3];          
  4824.    
  4825.     if ($pid_dateshift_list =~ /y/) {
  4826.         open SF, $date_shift_file or die "Cannot open $date_shift_file";
  4827.         while ($line = <SF>) {
  4828.         chomp $line;
  4829.         if ($line =~ /\A($pid)\|\|\|\|([0-9\-]+)/) {
  4830.             $offset = $2;
  4831.         }      
  4832.         }
  4833.         close SF;
  4834.     }
  4835.  
  4836.     my $ml=&monthlength($month, $year);
  4837.    
  4838.     # $offset = days of offset (positive or negative shift)
  4839.     # Sets the shifted year
  4840.     $offset_local = $offset;
  4841.     if ($offset_local>0) {
  4842.         $year += 4*int($offset_local/1461);
  4843.         $offset_local -=1461*int($offset_local/1461);
  4844.     }
  4845.     if ($offset_local<0){
  4846.         $year -= 4*int(-$offset_local/1461);
  4847.         $offset_local +=1461*int(-$offset_local/1461);
  4848.     }
  4849.    
  4850.     # Shifts number of days
  4851.     $day +=$offset_local;
  4852.    
  4853.     $ml=&monthlength($month, $year);
  4854.  
  4855.     # Changes $day, $month, $year based on the remaining offset after shifting $year
  4856.     while ($day>$ml) {
  4857.         $ml=&monthlength($month, $year);
  4858.         $day=$day-$ml;
  4859.         $month++;
  4860.         if ($month>12) {
  4861.         $month -=12;
  4862.         $year++;
  4863.         }
  4864.         $ml = &monthlength($month, $year);
  4865.     }
  4866.     while ($day<1) {
  4867.         $ml=&monthlength($month-1, $year);
  4868.         $day=$day + $ml;
  4869.         $month--;
  4870.         if ($month<1) {
  4871.         $month +=12;
  4872.         $year--;
  4873.         }
  4874.     }
  4875.    
  4876.     # Formats the output of single-digit month and day: "2" becomes "02"
  4877.     if (length($month)<2) {
  4878.         $month="0".$month;
  4879.     }
  4880.     if (length($day)<2) {
  4881.         $day="0".$day;
  4882.     }
  4883.    
  4884.     # Returns the shifted date as an array
  4885.     return ($year, $month, $day);    
  4886.     }
  4887. # End of doalterdate()
  4888. #***********************************************************************************************************
  4889. #***********************************************************************************************************
  4890. #***********************************************************************************************************
  4891. # Function: altermonthdate()
  4892. # Arguments: int $month, int $day, int $pid (patient ID)
  4893. # Returns: call doalterdate(), returns array ($year, $month, $day)
  4894. # Called by: None
  4895. # Description: Calls doalterdate() which performs dateshift
  4896. # This function is not used
  4897.  
  4898. sub altermonthdate {
  4899.     my ($month, $day, $pid)=@_;
  4900.     my $year;
  4901.    
  4902.     if (($month-$entrymonth)%12<6) {
  4903.     if ($month<$entrymonth) {
  4904.         $year=$entryyear+1;
  4905.     }
  4906.     else {
  4907.         $year=$entryyear;
  4908.     }
  4909.     }
  4910.     else {
  4911.     if ($month>$entrymonth) {
  4912.         $year=$entryyear-1;
  4913.     }
  4914.     else {
  4915.         $year=$entryear;
  4916.     }
  4917.     }
  4918.     return (&doalterdate($year, $month, $day, $pid))[1,2];    
  4919. }
  4920. # End of altermonthdate()
  4921. #***********************************************************************************************************
  4922. #***********************************************************************************************************
  4923. #***********************************************************************************************************
  4924. # Function: getcentury()
  4925. # Arguments: int 2-digit $year
  4926. # Returns: int
  4927. # Called by: None
  4928. # Description: Converts a 2-digit year into a 4-digit year based on entryyear
  4929. # This function is not used
  4930.  
  4931. sub getcentury {
  4932.     my $year=$_[0];
  4933.  
  4934.     if (($year-$entryyear)%100<10) {
  4935.     $year=$entryyear+(($year-$entryyear)%100);
  4936.     }
  4937.     else {
  4938.     $year=$entryyear-(($entryyear-$year)%100);
  4939.     }
  4940. }
  4941. # End of getcentury()
  4942. #***********************************************************************************************************
  4943. #***********************************************************************************************************
  4944. #***********************************************************************************************************
  4945. # Function: monthlength()
  4946. # Arguments: int $m, int $y
  4947. # Returns: int
  4948. # Called by: doalterdate()
  4949. # Description: Returns the number of days in given month-year
  4950.  
  4951. sub monthlength {
  4952.     my($m, $y)=@_;
  4953.  
  4954.     while ($m<=0) {
  4955.     $m += 12;
  4956.     $y --;
  4957.     }
  4958.     while ($m>=13) {
  4959.     $m -= 12;
  4960.     $y ++;
  4961.     }    
  4962.     # Checks for February
  4963.     if ($m==2) {
  4964.     if ($y % 4 ==0) {
  4965.         if($y % 100 ==0) {
  4966.         if ($y % 400 ==0){
  4967.             return 29;
  4968.         }
  4969.         else {
  4970.             return 28;
  4971.         }
  4972.         }
  4973.         else {     
  4974.         return 29;
  4975.         }
  4976.     }
  4977.     else {     
  4978.         return 28;
  4979.     }
  4980.     }
  4981.     # Checks for months consisting of 30 days    
  4982.     elsif (($m==4) || ($m==6) || ($m==9) || ($m==11)) {
  4983.     return 30;
  4984.     }
  4985.     # Checks for months consisting of 31 days    
  4986.     else {     
  4987.     return 31;
  4988.     }
  4989. }
  4990. # End of monthlength()
  4991.  
  4992.  
  4993.  
  4994.  
  4995.  
  4996.  
  4997.  
  4998.  
  4999.  
  5000. #***********************************************************************************************************
  5001. #***********************************************************************************************************
  5002. #***********************************************************************************************************
  5003. # Application-specific code follows. Code may contain patterns specific to our medical notes.
  5004. # Customize by replacing with your application-specific filters.
  5005. #***********************************************************************************************************
  5006. #***********************************************************************************************************
  5007. #***********************************************************************************************************
  5008.  
  5009.  
  5010. #***********************************************************************************************************
  5011. #***********************************************************************************************************
  5012. #***********************************************************************************************************
  5013. #Function: isProbablyPhone
  5014. #Argument: context string
  5015. #Returns: Always returns 1 (true) unless the context is one of the words defined in
  5016. #phone_pre_disqualifier. For future extensions, can add qualifier words
  5017. #such as "phone", "pager", etc.
  5018.  
  5019. sub isProbablyPhone {
  5020.     @phone_pre_disqualifier = ("HR","Heart", "BP", "SVR", "STV", "VT", "Tidal Volumes", "Tidal Volume", "TV", "CKS");
  5021.     $context = $_[0];
  5022.     foreach $i (@phone_pre_disqualifier) {
  5023.     if ($context =~ /\b$i\b/i){
  5024.         return 0;
  5025.     }
  5026.     }      
  5027.     return 1;
  5028. }
  5029. #end of isProbablyPhone()
  5030.  
  5031. #***********************************************************************************************************
  5032. #***********************************************************************************************************
  5033. #***********************************************************************************************************
  5034. # Function: wardname()
  5035. # Argument: string
  5036. # Searches for ward names specific to our hospital
  5037. sub wardname() {
  5038.     $text = $_[0];
  5039.    
  5040.     #Added to catch gs specific wardnames
  5041.     if ($gs_specific_filter =~ /y/){
  5042.        
  5043.     @ward_indicators = ("Quartermain");
  5044.     foreach $ward_ind (@ward_indicators){
  5045.         while ($text =~ /\b(($ward_ind) ?(\d))\b/ig){
  5046.         $wardname = $1;
  5047.         $start = length($`);
  5048.         $end = $start + length($wardname);
  5049.         $key = $start."-".$end;
  5050.         addType ($key, "Wardname");
  5051.         }
  5052.     }
  5053.     }
  5054.  
  5055. }
  5056.  
  5057. # End of function wardname()
  5058.  
  5059.  
  5060.  
  5061. #***********************************************************************************************************
  5062. #***********************************************************************************************************
  5063. #***********************************************************************************************************
  5064. #Function: isProbablyMeasurement
  5065. #Argument: context string
  5066. #Returns: Returns true if the text passed in contains (more specifically, ends with) any of the measurement indicators.
  5067. sub isProbablyMeasurement {
  5068.  @measurement_indicators_pre = ("increased to","decreased from","rose to","fell from", "down to",
  5069.                "increased from", "dropped to", "dec to", "changed to","remains on", "change to");
  5070.     $context = $_[0];
  5071.  
  5072.     foreach $i (@measurement_indicators_pre) {
  5073.  
  5074.     if ($context =~ /\b$i\b/i){  #only match if it ends with the phrase
  5075.         return 1;
  5076.     }
  5077.     }      
  5078.  
  5079.     return 0;
  5080. }
  5081.  
  5082. #end isProbablyMeasurement()
  5083.  
  5084. #***********************************************************************************************************
  5085. #***********************************************************************************************************
  5086. #***********************************************************************************************************
  5087. #Function: isProbablyDate
  5088. #Argument: textBefore string and textAfter string
  5089. #Returns: returns 1 if the context check determines that textBefore or textAfter
  5090. #are most likely strings preceding or following a date PHI; returns 0 otherwise.
  5091. #Description: Subroutine performs a context check on the textBefore and textAfter. If it contains one of the
  5092. #keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
  5093. #Called by dateWithContextCheck on partial dates MM-DD
  5094. sub isProbablyDate{
  5095.     my ($textBefore, $textAfter) = @_;
  5096.    
  5097.     if ((!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|dose|doses|range|ranged|pad|rate|bipap|pap|unload|ventilation|scale|cultures|blood|at|up|with|in|of|RR|ICP|CVP|strength|PSV|SVP|PCWP|PCW|BILAT|SRR|VENT|PEEP\/PS|flowby|drinks|stage) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|beers|per|esophagus|tabs|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns|units|amp|qd|chest pain|intensity)\b/i)) {
  5098.      return 1;
  5099.  }
  5100.  
  5101.  return 0;
  5102.  
  5103. }
  5104. #end isProbablyDate()
  5105.  
  5106. #***********************************************************************************************************
  5107. #***********************************************************************************************************
  5108. #***********************************************************************************************************
  5109. #Function: isProbablyDate2
  5110. #Argument: textBefore string and textAfter string
  5111. #Returns: returns 1 if the context check determines that textBefore or textAfter
  5112. #are most likely strings preceding or following a date PHI; returns 0 otherwise.
  5113. #Subroutine performs a context check on the textBefore and textAfter.  If it contains one of the
  5114. #keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
  5115. #Called by dateWithContextCheck on partial dates MM/DD.  
  5116. sub isProbablyDate2{
  5117.  
  5118. if ( (!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|range|ranged|pad|rate|rating|bipap|pap|unload|ventilation|scale|blood|at|up|with|RR|ICP|CVP|strength|PSV|SVP|PCWP|BILAT|SRR|VENT|PEEP\/PS|flowby) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|drinks|beers|per|esophagus|tabs|tab|tablet|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|drops|breaths|wbcs|beat|beats|ns|units|amp)\b/i)) {
  5119.      return 1;
  5120.  }
  5121.  return 0;
  5122.  
  5123.  
  5124. }
  5125. #end isProbablyDate2()
  5126.  
  5127. #***********************************************************************************************************
  5128. #***********************************************************************************************************
  5129. #Function: extractYear
  5130. #Argument: a date string in the format MM/DD/YYYY
  5131. #Returns: the 4-digit year if the date is in the correct format
  5132. #returns 0000 otherwise.
  5133. sub extractYear{
  5134.  
  5135.   $date = $_[0];
  5136.  
  5137.   if ($date =~ /\b(\d\d)\/(\d\d)\/(\d\d\d\d)\b/){
  5138.       $year = $3;
  5139.   } else{
  5140.       $year = 0000;
  5141.   }
  5142.   return $year;
  5143. }
  5144.  
  5145. #***********************************************************************************************************
  5146. #***********************************************************************************************************
  5147. #***********************************************************************************************************
  5148. #Function: dateWithContextCheck
  5149. #Argument: text string
  5150. #Returns: none
  5151. #Description: Find date patterns.  Performs context check on text before and after the date patterns
  5152. #to determine if its an actual date.
  5153. sub dateWithContextCheck{
  5154.     #$text = $_[0];    
  5155.     my ($text, $date) = @_;
  5156.     my $year = extractYear($date);
  5157.  
  5158.     #my $year = substr $date, 0, 4;  
  5159.     #**********************************************************************************************
  5160.     # Searches for numerical date formats
  5161.     # Checks if dates should be filtered
  5162.     if ($datefilter =~ /y/) {
  5163.    
  5164.     # Searches for mm/dd or mm/yy
  5165.     while ($text =~ /\b([A-Za-z0-9%\/]+ +)?(\d\d?)([\/\-])(\d\d?)\/?\/?( +[A-Za-z]+)?\b/g) {               
  5166.    
  5167.         $pre = $1;
  5168.         $post = $5;
  5169.         my $first_num = $2;
  5170.         my $divider = $3;
  5171.         my $second_num = $4;
  5172.         my $postdate = $5;
  5173.            
  5174.         my $startI = length($`) + length($pre);
  5175.         my $endI = $startI + length($first_num)+length($divider)+length($second_num);
  5176.         my $key = $startI."-".$endI;
  5177.        
  5178.         my $beginr = substr $text, ($startI - 2), 2;
  5179.         my $endr = substr $text, $endI, 2;
  5180.        
  5181.         #Excludes nn/nn formats when preceded by any in array @prev
  5182.         #@prev = ("cvp", "noc", "peep/ps","%");
  5183.         @prev = ("cvp", "noc", "%", "RR", "PCW");
  5184.         my $dateorno = 1;
  5185.         foreach $j (@prev) {
  5186.         if ((!($pre =~ /\b$j/ig)) && (!($post =~ /\bpersantine\b/ig))) {
  5187.         }
  5188.         else {
  5189.             $dateorno = 0;
  5190.         }
  5191.         }
  5192.  
  5193.         my $context_len = 12; #number of characters we extract before the date
  5194.  
  5195.         if ($dateorno == 1) {
  5196.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%]/) && ($endr !~ /\S\d/)) {
  5197.            
  5198.             # Checks if date identified is valid as mm/dd; then adds the date as PHI
  5199.             if (isValidDate ($first_num, $second_num, -1)) {
  5200.            
  5201.             if ($second_num == 5) {
  5202.                
  5203.                 my $start_pos = $startI - $context_len;
  5204.                 if ($start_pos < 0) {
  5205.                 $start_pos = 0;
  5206.                 }
  5207.                 my $len = $context_len;
  5208.                 if  (length($text) < ($endI + $context_len)){
  5209.                 $len = length($text) - $endI;
  5210.                 }
  5211.                    
  5212.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5213.                 my $textAfter = substr $text, $endI, $len;
  5214.  
  5215.                
  5216.                 if (  (!isProbablyMeasurement($textBefore)) &&   ($textBefore !~ /\bPSV? \Z/i) && ($textBefore !~ /\b(CPAP|PS|range|bipap|pap|pad|rate|unload|ventilation|scale|strength|drop|up|cc|rr|cvp|at|up|in|with|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?(packs|psv|puffs|pts|patients|range|scale|mls|liters|litres|drinks|beers|per|esophagus|tabs|pts|tablets|systolic|sem|strength|times|bottles|drop|drops|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|L|hr|hrs|hour|hours|dose|doses|cultures|blood|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
  5217.  
  5218.                 addType ($key, "Month/Day (1)");
  5219.                 }
  5220.            
  5221.             } elsif ($second_num == 2) {
  5222.            
  5223.                 my $start_pos = $startI - $context_len;
  5224.                 if ($start_pos < 0) {
  5225.                 $start_pos = 0;
  5226.                 }
  5227.                 my $len = $context_len;
  5228.                 if  (length ($text) < ($endI + $context_len)){
  5229.                 $len = length($text) - $endI;
  5230.                 }
  5231.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5232.                 my $textAfter = substr $text, $endI, $len;
  5233.  
  5234.                
  5235.                 if (   (!isProbablyMeasurement($textBefore)) &&   ($textAfter !~ /\A ?hour\b/i) && ($textBefore !~ /\b(with|drop|bipap|pap|range|pad|rate|unload|ventilation|scale|strength|up|cc|rr|cvp|at|up|with|in|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?hr\b/i) && ($textAfter !~ /\A ?(packs|L|psv|puffs|pts|patients|range|scale|dose|doses|cultures|blood|mls|liters|litres|pts|drinks|beers|per|esophagus|tabs|tablets|systolic|sem|strength|bottles|times|drop|cc|up|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
  5236.                
  5237.                
  5238.                 addType ($key, "Month/Day (2)");
  5239.                 }
  5240.             #} elsif (($divider eq "-") && ($startI > 4)) {
  5241.             } elsif (($divider eq "-")) {
  5242.                 my $start_pos = $startI - $context_len;
  5243.                 if ($start_pos < 0) {
  5244.                 $start_pos = 0;
  5245.                 }
  5246.                 my $len = $context_len;
  5247.                 if  (length ($text) < ($endI + $context_len)){
  5248.                 $len = length($text) - $endI;
  5249.                 }
  5250.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5251.                 my $textAfter = substr $text, $endI, $len;
  5252.                                
  5253.                 if (isProbablyDate($textBefore, $textAfter)){
  5254.                
  5255.                 addType ($key, "Month/Day (3)");
  5256.                 }
  5257.             }
  5258.             else {
  5259.  
  5260.                 my $start_pos = $startI - $context_len;
  5261.                 if ($start_pos < 0) {
  5262.                 $start_pos = 0;
  5263.                 }
  5264.                 my $len = $context_len;
  5265.                 if  (length ($text) < ($endI + $context_len)){
  5266.                 $len = length($text) - $endI;
  5267.                 }
  5268.                 my $textAfter = substr $text, $endI, $len;  
  5269.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5270.                            
  5271.                
  5272.                 if (isProbablyDate($textBefore, $textAfter)){  
  5273.                 addType ($key, "Month/Day (4)");
  5274.                 }
  5275.             }
  5276.             }
  5277.            
  5278.             # Checks if date identified is valid as mm/yy; then adds the date as PHI
  5279.                     # Checks for years of length 2, restricted to 1950-2030
  5280.             if (($first_num <= 12) && ($first_num > 0) && ((length $second_num) == 2)
  5281.             && (($second_num>=50) || ($second_num<=30))) {     
  5282.  
  5283.                 #my $textAfter = substr $text, $endI, 9;
  5284.                 my $start_pos = $startI - $context_len;
  5285.                 if ($start_pos < 0) {
  5286.                 $start_pos = 0;
  5287.                 }
  5288.                 my $len = $context_len;
  5289.                 if  (length ($text) < ($endI + $context_len)){
  5290.                 $len = length($text) - $endI;
  5291.                 }
  5292.                 my $textAfter = substr $text, $endI, $len;  
  5293.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5294.                #  print "checking mm/yy text before is $textBefore, text after is $textAfter\n";
  5295.                 if (isProbablyDate($textBefore, $textAfter)){
  5296.                 addType ($key, "Month/Year (2)");  
  5297.                 }
  5298.                } #end if the first num and second num are month/year
  5299.  
  5300.         }
  5301.         }  #end if dateno ==1          
  5302.  
  5303.     } #end while the pattern match
  5304.     } # end if datefilter is on
  5305. }
  5306.  
  5307. #end dateWithContextCheck()
  5308.  
  5309. #***********************************************************************************************************
  5310. #***********************************************************************************************************
  5311. #***********************************************************************************************************
  5312. #Function: yearWithContextCheck
  5313. #Argument: text string
  5314. #Returns: none
  5315. #Description: Find date patterns.  Performs context check on text before and after the year patterns
  5316. #to determine if it is a year.
  5317. sub yearWithContextCheck {
  5318.     #$text = $_[0];    
  5319.     my ($text, $date) = @_;
  5320.     #my $year = substr $date, 0, 4;  
  5321.     my $year = extractYear($date);
  5322.  
  5323.  
  5324.     # Checks for 2-digit year written as '01, &c, only when preceded by the following medical terms
  5325.     while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTC|PTCA|stent|since|surgery|year) + *(\')?)(\d\d)\b/ig) {
  5326.         my $num = $1;
  5327.         #print "2-digit date, 1 is $1, 2 is $2, 3 is $3, 4 is $4, 5 is $5\n";
  5328.         #my $key = (1 + (length($`))+length($1))."-".(pos $text);
  5329.         my $startd =  (length($`))+length($1);
  5330.         my $endd = $startd + length($4);
  5331.         my $key =  $startd."-".$endd;
  5332.  
  5333.         #check if the match is part of number followed by decimal point and a number
  5334.         my $textAfter = substr $text, $endd, 2;
  5335.        
  5336.         if ($textAfter !~ /\.\d/){
  5337.         addType($key, "Year (2 digits)");
  5338.        
  5339.         }
  5340.      }
  5341.        
  5342.  
  5343.      while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|surgery|year)(\'+))(\d\d)\b/ig) {
  5344.         my $num = $1;
  5345.  
  5346.         my $startd =  (length($`))+length($1);
  5347.         my $endd = $startd + length($4);
  5348.         my $key =  $startd."-".$endd;
  5349.  
  5350.         #check if the match is part of number followed by decimal point and a number
  5351.         #or if it is a time (HH:MM)
  5352.         my $textAfter = substr $text, $endd, 2;
  5353.        
  5354.         if ($textAfter !~ /(\.|\:)\d/){
  5355.         addType($key, "Year (2 digits)");
  5356.         }
  5357.      }
  5358.  
  5359.      # Checks for 4-digit year written as 2001, &c, only when preceded by the following medical terms
  5360.      while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)(\d{4})((\,? )\d{4})?\b/ig) {
  5361.         my $num1 = $1;
  5362.         $s1 = length($`) + length($1);
  5363.         $e1 = $s1+length($3);
  5364.         $s2 = $e1+length($5);
  5365.         $e2 = $e1+length($4);
  5366.         $k1 = "$s1-$e1";
  5367.         $k2 = "$s2-$e2";
  5368.  
  5369.        
  5370.         #for 4-digit year, check if the matched number is in the range of [$VALID_YEAR_LOW,$VALID_YEAR_HIGH]
  5371.         #if ($3 <= 2030 && $3 >= 1950){
  5372.         if ($3 <= $VALID_YEAR_HIGH && $3 >= $VALID_YEAR_LOW){
  5373.             addType($k1, "Year (4 digits)");
  5374.             addType($k2, "Year (4 digits)");
  5375.             }
  5376.        }
  5377.        
  5378.       # Looks for year only (esp Patient Medical History): looks for year numbers within the 30 years before
  5379.       # and 2 years after the date passed in as an argument.
  5380.        
  5381.       # for $n (($year - 30) .. $year) {
  5382.       for $n (($year - 30) .. ($year+2)) {
  5383.         my $short = substr $n, 2, 2;
  5384.         if ($n =~ /\d\d\d\d/) {
  5385.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)$n\b/ig) {
  5386.             my $key = (length($`)+length($1))."-".(pos $text);
  5387.             addType ($key, "Year (4 digits)");
  5388.         }}
  5389.         if ($short =~ /\d\d/) {
  5390.         #while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year) + *(\'?))$short\b/ig) {
  5391.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year) + *(\'?))$short\b/ig) {
  5392.             my $key = (length($`)+length($1))."-".(pos $text);
  5393.             addType ($key, "Year (2 digits)");
  5394.         }}
  5395.         if ($short =~ /\d\d/) {
  5396.     #   while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year)(\'+))$short\b/ig) {
  5397.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year)(\'+))$short\b/ig) {
  5398.             my $key = (length($`)+length($1))."-".(pos $text);
  5399.             addType ($key, "Year (2 digits)");
  5400.         }}
  5401.      }
  5402.  
  5403. }
  5404.  
  5405. #end yearWithContextCheck()
  5406.  
  5407.  
  5408.  
  5409.  
  5410. #***********************************************************************************************************
  5411. #***********************************************************************************************************
  5412. #***********************************************************************************************************
  5413. # End of De-Identification Algorithm
  5414. #***********************************************************************************************************
  5415. #***********************************************************************************************************
  5416. #***********************************************************************************************************
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement