Advertisement
Guest User

Untitled

a guest
Jan 9th, 2020
174
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 187.35 KB | None | 0 0
  1.  #!/usr/bin/perl
  2.  
  3. #**************************************************************************************************************
  4. #file: deid.pl   Original author: M. Douglass 2004 
  5. #Last revised:    Apr 2009  DeID 1.1
  6. #                
  7. #
  8. #_______________________________________________________________________________
  9. #
  10. #deid.pl: De-identification algorithm -- scrubs PHI from free-text medical records
  11. #(e.g. Discharge Summaries and Nursing Notes)
  12. #
  13. #Copyright (C) 2004-2007 Margaret Douglas and  Ishna Neamatullah
  14. #
  15. #This code is free software; you can redistribute it and/or modify it under
  16. #the terms of the GNU Library General Public License as published by the Free
  17. #Software Foundation; either version 2 of the License, or (at your option) any
  18. #later version.
  19. #
  20. #This library is distributed in the hope that it will be useful, but WITHOUT ANY
  21. #WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  22. #PARTICULAR PURPOSE.  See the GNU Library General Public License for more
  23. #details.
  24. #
  25. #You should have received a copy of the GNU Library General Public License along
  26. #with this code; if not, write to the Free Software Foundation, Inc., 59
  27. #Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  28. #
  29. #You may contact the author by e-mail or postal mail
  30. #(MIT Room E25-505, Cambridge, MA 02139 USA).  For updates to this software,
  31. #please visit PhysioNet (http://www.physionet.org/).
  32. #_______________________________________________________________________________
  33. #
  34. # De-identification Algorithm: Scrubs PHI from free-text medical records
  35. #(e.g. Discharge Summaries and Nursing Notes)
  36. # Original version written by:
  37. #   Margaret Douglass (douglass AT alum DOT mit DOT edu)
  38. #   William Long (wjl AT mit DOT edu)
  39. # Modified by:
  40. #   Ishna Neamatullah (ishna AT alum DOT mit DOT edu) in Sept 5, 2006
  41. # Last modified by:
  42. #   Li-wei Lehman (lilehman AT alum DOT mit DOT edu) in April, 2009
  43. #
  44. # Command to run the software:
  45. # perl deid.pl <filename> <config_filename>
  46. #
  47. # Input arguments:
  48. # filename (without extension, where extension must be .text): file to be de-identified
  49. # config_filename: configuration file
  50. #
  51. # Required library file: stat.pm
  52. #
  53. # Output files:
  54. # filename.res: de-identified file
  55. # filename.phi: file containing all PHI locations, used in calculating performance statistics
  56. # filename.info: file containing information useful for debugging
  57. # code performance statistics printed on screen if Gold Standard available for nursing notes (details in README)
  58. #**************************************************************************************************************
  59.  
  60. #use Stat;
  61.  
  62. # Declaring some variables for algorithm run configuration
  63.  
  64. # Variables to switch on/off filter functions
  65. $allfilters = "";
  66. $ssnfilter = "";
  67. $urlfilter = "";
  68. $emailfilter = "";
  69. $telfilter = "";
  70. $unitfilter = "";
  71. $agefilter = "";
  72. $locfilter = "";
  73. $datefilter = "";
  74. $namefilter = "";
  75. $us_state_filter = "";
  76. $ds_specific_filter = ""; #filter for discharge summmary specific patterns
  77. $gs_specific_filter = ""; #filter for gold std specific patterns
  78.  
  79. my $offset; # positive date shift in number of days
  80. my $comparison = ""; # 1=comparison with gold standard, 0=no comparison with gold standard
  81.  
  82. # Variables to switch on/off dictionaries/lists
  83. $alllists = "";
  84. $pid_patientname_list = "";
  85. $pid_dateshift_list = "";
  86. $country_list = "";
  87. $company_list = "";
  88. $ethnicity_list = "";
  89. $hospital_list = "";
  90. $doctor_list = "";
  91. $location_list = "";
  92. $local_list = "";
  93. $state_list = "";
  94.  
  95. use Time::Local;
  96. my ($mday,$mon,$year) = (localtime(time))[3,4,5];
  97. my $shortyear = substr($year,1,4);
  98. my $longyear = '20'.$shortyear;
  99.  
  100. # Nursing note date as retrieved from the header
  101. my $nn_year;
  102.  
  103. # Sets whether a de-identified version should be output: 0 = no new version of text output, 1 = fully de-identified version of text output
  104. # Note: Generally keep output_deid_text = 1
  105. $output_deid_text = 1;
  106.  
  107. #Default date used to de-identify the Gold Std if no default date is specified in the
  108. #config file.  You can change the default date by setting the "Default date" variable to
  109. #some other dates (in MM/DD/YYYY format).
  110. $DEFAULT_DATE = "01/01/2000";
  111.  
  112. #The "Two Digit Year Threshold" is used to determine whether
  113. #to interpret the year as a year in the 1900's or 2000's.
  114. #Must be a 1- or 2-digit number.
  115. #Two digit years > Threshold are  interepreted as in the 1900's
  116. #Two digit years <=  Threshold are interpreted as in the 2000's
  117. $TWO_DIGIT_YEAR_THRESHOLD = 30;#change this default by setting "Two Digit Year Threshold" in config file.
  118.  
  119. # "Valid Year Low" and "Valid Year High" (must be 4-digit numbers) are
  120. # used in certain date pattern checking routines to determine if a
  121. # four digit number that appear in a potential
  122. # date pattern is a year or not -- it is a valid year if it is
  123. # in the range of [Valid Year Low, Valid Year High].  
  124. $VALID_YEAR_LOW = 1900;
  125. $VALID_YEAR_HIGH = 2030;
  126.  
  127.  
  128.  
  129. my @known_phi;
  130. my @known_first_name;
  131. my @known_last_name;
  132.  
  133. # Hash that stores PHI information from de-identification
  134. # KEY = (patient number), VALUE = array with each element = (%HASH with KEY = start-end, VALUE = array of types of PHI for the note number/index of the array)
  135. %all_phi;
  136. my %ID;
  137.  
  138. # Declares some global variables
  139. %lhash = ();
  140. @typename = ();
  141. $ntype = 1;
  142. @extend = ('phrases');
  143.  
  144.  
  145. ##########################################################################################
  146. # Sets paths to lists and dictionaries in working directory that will be used in this algorithm
  147. $date_shift_file = "shift.txt"; # contains mapping of PID to date offset
  148. $countries_file = "lists/countries_unambig.txt";
  149. $ethnicities_unambig_file = "lists/ethnicities_unambig.txt";
  150. $companies_file = "lists/company_names_unambig.txt";
  151. $companies_ambig_file = "lists/company_names_ambig.txt";
  152. $common_words_file = "dict/common_words.txt";
  153. $medical_words_file = "dict/sno_edited.txt";
  154. $very_common_words_file = "dict/commonest_words.txt";
  155. $female_unambig_file = "lists/female_names_unambig.txt";
  156. $female_ambig_file = "lists/female_names_ambig.txt";
  157. $female_popular_file = "lists/female_names_popular.txt";
  158. $male_unambig_file = "lists/male_names_unambig.txt";
  159. $male_ambig_file = "lists/male_names_ambig.txt";
  160. $male_popular_file = "lists/male_names_popular.txt";
  161. $last_unambig_file = "lists/last_names_unambig.txt";
  162. $last_ambig_file = "lists/last_names_ambig.txt";
  163. $last_popular_file = "lists/last_names_popular.txt";
  164. #$last_name_prefixes_file = "lists/last_name_prefixes.txt";
  165. $doctor_first_unambig_file = "lists/doctor_first_names.txt";
  166. $doctor_last_unambig_file = "lists/doctor_last_names.txt";
  167. $prefixes_unambig_file = "lists/prefixes_unambig.txt";
  168. $locations_unambig_file = "lists/locations_unambig.txt";
  169. $locations_ambig_file = "lists/locations_ambig.txt";
  170. $local_places_ambig_file = "lists/local_places_ambig.txt";
  171. $local_places_unambig_file = "lists/local_places_unambig.txt";
  172. $hospital_file = "lists/stripped_hospitals.txt";
  173. $last_name_prefix_file = "lists/last_name_prefixes.txt";
  174. $patient_file = "lists/pid_patientname.txt"; # contains mapping of PID to patient name
  175. $us_states_file = "lists/us_states.txt";
  176. $us_states_abbre_file = "lists/us_states_abbre.txt";
  177. $more_us_states_abbre_file = "lists/more_us_state_abbreviations.txt";
  178. $us_area_code_file = "lists/us_area_code.txt";
  179. $medical_phrases_file = "dict/medical_phrases.txt";
  180. $unambig_common_words_file = "dict/notes_common.txt";
  181.  
  182. ############################################################################################################
  183. # Declares some arrays of context words that can be used to identify PHI
  184. # Days of the month
  185. @days = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday");
  186.  
  187. # Titles that precede last names (ignore .'s)
  188. @titles = ("MISTER", "DOCTOR", "DOCTORS", "MISS", "PROF", "PROFESSOR", "REV", "RABBI", "NURSE", "MD", "PRINCESS", "PRINCE", "DEACON", "DEACONESS", "CAREGIVER", "PRACTITIONER", "MR", "MS");
  189.  
  190. @strict_titles = ("Dr", "DRS", "Mrs");  #treat words after these strict_titles as PHI
  191.  
  192. %titles = ();
  193. foreach $title (@titles){
  194.     $titles{$title} = 1;
  195. }
  196.  
  197. # Name indicators that precede or follow names
  198. @name_indicators = ("problem","problem:", "proxy", "daughter","daughters", "dtr", "son", "brother","sister", "mother", "mom", "father", "dad", "wife", "husband", "neice", "nephew", "spouse", "partner", "cousin", "aunt", "uncle", "granddaughter", "grandson", "grandmother", "grandmom", "grandfather", "granddad", "relative", "friend", "neighbor", "visitor", "family member", "lawyer", "priest", "rabbi", "coworker", "co-worker", "boyfriend", "girlfriend", "name is", "named", "rrt", "significant other", "jr", "caregiver", "proxys", "friends", "sons", "brothers", "sisters", "sister-in-law", "brother-in-law", "mother-in-law", "father-in-law", "son-in-law", "daughter-in-law", "dtr-in-law", "surname will be", "name will be", "name at discharge will be", "name at discharge is");
  199.  
  200.  
  201. # Phrases that precede locations
  202. @location_indicators = ("lives in", "resident of", "lived in", "lives at", "comes from", "called from", "visited from", "arrived from", "returned to");
  203.  
  204. @employment_indicators_pre = ("employee of", "employed by", "employed at", "CEO of", "manager at", "manager for", "manager of", "works at", "business");
  205.  
  206.  
  207. # Hospital indicators that follow hospital names
  208. @hospital_indicators = ("Hospital", "General Hospital", "Gen Hospital", "gen hosp", "hosp", "Medical Center", "Med Center", "Med Ctr", "Rehab", "Clinic", "Rehabilitation", "Campus", "health center", "cancer center", "development center", "community health center", "health and rehabilitation", "Medical", "transferred to", "transferred from", "transfered to", "transfered from");
  209.  
  210. # Location indicators that follow locations
  211. @loc_indicators_suff = ("city", "town", "beach", "valley","county", "harbor", "ville", "creek", "springs", "mountain", "island", "lake", "lakes", "shore", "garden", "haven", "village", "grove", "hills", "hill", "shire", "cove", "coast", "alley", "street", "terrace", "boulevard", "parkway", "highway", "university", "college", "tower");
  212.  
  213. # Location indicators that are most likely preceded by locations
  214. @loc_ind_suff_c = ("town", "ville", "harbor", "tower");
  215.  
  216. # Location indicators that precede locations
  217. #@loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "great","east","west","north","south");
  218. @loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "east","west","north","south");
  219.  
  220.  
  221. @apt_indicators = ("apt", "suite"); #only check these after the street address is found
  222. @street_add_suff = ("park", "drive", "street", "road", "lane", "boulevard", "blvd", "avenue", "highway", "circle","ave", "place", "rd", "st");
  223.  
  224. #Strict street address suffix: case-sensitive match on the following,
  225. #and will be marked as PHI regardless of ambiguity (common words)
  226. @strict_street_add_suff = ("Park", "Drive", "Street", "Road", "Lane", "Boulevard", "Blvd", "Avenue", "Highway","Ave",,"Rd", "PARK", "DRIVE", "STREET", "ROAD", "LANE", "BOULEVARD", "BLVD", "AVENUE", "HIGHWAY","AVE", "RD");
  227.  
  228. # Age indicators that follow ages
  229. @age_indicators_suff = ("year old", "y\. o\.", "y\.o\.", "yo", "y", "years old", "year-old", "-year-old", "years-old", "-years-old", "years of age", "yrs of age");
  230.  
  231. # Age indicators that precede ages
  232. @age_indicators_pre = ("age", "he is", "she is", "patient is");
  233.  
  234. # Digits, used in identifying ages
  235. @digits = ("one","two","three","four","five","six","seven","eight","nine", "");
  236.  
  237. # Different variations of the 12 months
  238. @months = ("January", "Jan", "February", "Feb", "March", "Mar", "April", "Apr", "May", "June", "Jun", "July", "Jul", "August", "Aug", "September", "Sept", "Sep", "October", "Oct", "November", "Nov", "December", "Dec");
  239.  
  240. ######################################################################################
  241. # If the correct number of input argument is provided, sets the input and output filenames.
  242. if ($#ARGV == 1) {
  243.     $data_file = "train_data/" + "$ARGV[0].text";      # data_file: input file
  244.     $output_file = "train_data/" +"$ARGV[0].phi";     # output_file: file containing PHI locations
  245.     $debug_file = "train_data/" +"$ARGV[0].info";     # debug_file: file used for debugging, contains PHI and non-PHI locations
  246.     $deid_text_file = "train_data/" +"$ARGV[0].res";  # deid_text_file: de-identified text file
  247.     $gs_file = "train_data/" +"$ARGV[0].deid";        # gs_file: Gold Standard of the input file
  248.  
  249.    
  250.     print "\n*******************************************************************************************************************\n";
  251.     print "De-Identification Algorithm: Identifies Protected Health Information (PHI) in Discharge Summaries and Nursing Notes";
  252.     print "\n*******************************************************************************************************************\n";
  253.  
  254.  
  255.    
  256.     $config_file = $ARGV[1];
  257.     open CF, $config_file or die "Cannot open $config_file";
  258.     while ($cfline = <CF>) {
  259.     chomp $cfline;
  260.     if ($cfline =~ /\A[\#]+/){
  261.        next;
  262.     }
  263.     if ($cfline =~ /\bGold\s+standard\s+comparison\s*\=\s*([0-9])/ig) {
  264.         $comparison = ($1);
  265.     }
  266.     #Date default expects MM/DD/YYYY
  267.     if ($cfline =~ /\bDate\s+default\s*\=\s*(\d\d)\/(\d\d)\/(\d\d\d\d)/ig) {
  268.        
  269.         my $mm = $1; $dd = $2; $yyyy = $3;
  270.         $DEFAULT_DATE = "$mm/$dd/$yyyy";
  271.         #print "Default date is $DEFAULT_DATE\n";
  272.     }
  273.  
  274.     #The "Two Digit Year Threshold" is used to determine whether
  275.         #to interpret the year as a year in the 1900's or 2000's
  276.     if ($cfline =~ /\bTwo\s+Digit\s+Year\s+Threshold\s*=\s*(\d{1,2})/ig) {
  277.         $TWO_DIGIT_YEAR_THRESHOLD = "$1";
  278.         #print "Two Digit Year Threshold is $TWO_DIGIT_YEAR_THRESHOLD\n";
  279.     }
  280.  
  281.     if ($cfline =~ /\bDate\s+offset\s*\=\s*([0-9]+)/ig) {
  282.         $offset = ($1);
  283.         #print "Date offset is $1\n";
  284.     }
  285.     if ($cfline =~ /\bSSN\s+filter\s*\=\s*([a-z])/ig) {
  286.         $ssnfilter = ($1);     
  287.     }
  288.     if ($cfline =~ /\bURL\s+filter\s*\=\s*([a-z])/ig) {
  289.         $urlfilter = ($1);
  290.     }
  291.     if ($cfline =~ /\bEmail\s+filter\s*\=\s*([a-z])/ig) {
  292.         $emailfilter = ($1);
  293.     }
  294.     if ($cfline =~ /\bTelephone\s+filter\s*\=\s*([a-z])/ig) {
  295.         $telfilter = ($1);
  296.     }
  297.     if ($cfline =~ /\bUnit\s+number\s+filter\s*\=\s*([a-z])/ig) {
  298.         $unitfilter = ($1);
  299.     }
  300.     if ($cfline =~ /\bAge\s+filter\s*\=\s*([a-z])/ig) {
  301.         $agefilter = ($1);
  302.     }
  303.     if ($cfline =~ /\bLocation\s+filter\s*\=\s*([a-z])/ig) {
  304.         $locfilter = ($1);
  305.     }
  306.     if ($cfline =~ /\bDate\s+filter\s*\=\s*([a-z])/ig) {
  307.         $datefilter = ($1);
  308.     }
  309.     if ($cfline =~ /\bName\s+filter\s*\=\s*([a-z])/ig) {
  310.         $namefilter = ($1);
  311.     }
  312.  
  313.     if ($cfline =~ /\bState\s+filter\s*\=\s*([a-z])/ig) {
  314.         $us_state_filter = ($1);
  315.  
  316.     }
  317.  
  318.     if ($cfline =~ /\bDS\s+filter\s*\=\s*([a-z])/ig) {
  319.         $ds_specific_filter = ($1);
  320.  
  321.     }
  322.  
  323.     if ($cfline =~ /\bGS\s+filter\s*\=\s*([a-z])/ig) {
  324.         $gs_specific_filter = ($1);
  325.  
  326.     }
  327.  
  328.     ######################################################
  329.     #get the config info for dictionaries loading
  330.     if ($cfline =~ /\bPID\s+to\s+patient\s+name\s+mapping\s*\=\s*([a-z])/ig) {
  331.         $pid_patientname_list = ($1);
  332.  
  333.     }  
  334.     if ($cfline =~ /\bPID\s+to\s+date\s+offset\s+mapping\s*\=\s*([a-z])/ig) {
  335.         $pid_dateshift_list = ($1);
  336.     }
  337.     if ($cfline =~ /\bCountry\s+names\s*\=\s*([a-z])/ig) {
  338.         $country_list = ($1);
  339.     }
  340.     if ($cfline =~ /\bCompany\s+names\s*\=\s*([a-z])/ig) {
  341.         $company_list = ($1);
  342.     }
  343.     if ($cfline =~ /\bEthnicities\s*\=\s*([a-z])/ig) {
  344.         $ethnicity_list = ($1);
  345.     }
  346.     if ($cfline =~ /\bHospital\s+names\s*\=\s*([a-z])/ig) {
  347.         $hospital_list = ($1);
  348.     }
  349.     if ($cfline =~ /\bLocation\s+names\s*\=\s*([a-z])/ig) {
  350.         $location_list = ($1);
  351.  
  352.     }  
  353.     if ($cfline =~ /\bDoctor\s+names\s*\=\s*([a-z])/ig) {
  354.         $doctor_list = ($1);
  355.  
  356.     }
  357.  
  358.     if ($cfline =~ /\bLocalPlaces\s+names\s*\=\s*([a-z])/ig) {
  359.         $local_list = ($1);
  360.     }
  361.  
  362.     if ($cfline =~ /\bState\s+names\s*\=\s*([a-z])/ig) {
  363.         $state_list = ($1);
  364.     }
  365.  
  366.     }
  367.  
  368. }
  369.  
  370. # Prints an error message on the screen if number of arguments is incorrect
  371. else {
  372.     print "\n===========================================================================================";
  373.     print "\nError: Wrong number of arguments entered";
  374.     print "\nThe algorithm takes 2 arguments:";
  375.     print "\n  1. filename (the filename of medical notes, without extension, where extension must be .text)";
  376.     print "\n  2. config_filename (the configuration filename)";
  377.     print "\nExample (for Gold Standard Comparison): perl deid.pl id deid.config";
  378.     print "\nExample (for output mode using Gold Standard): perl deid.pl id deid-output.config";
  379.     print "\nFor further documentation, please consult the README.txt file";
  380.     print "\n===========================================================================================\n";
  381.     exit;
  382.  
  383. }
  384.  
  385. # After setting file names and configuring the run, indicates that de-identification has commenced
  386. print "\n\nStarting de-identification (version 1.1) ...\n\n";
  387.  
  388.  
  389. #check if we can open the .phi file
  390. open F, ">$output_file" or die "Cannot open $output_file";
  391. close F;
  392.  
  393. # Calls setup to create some lookup lists in memory
  394. setup();
  395.  
  396.  
  397. if ($comparison==1) {
  398.     print "Running deid in performance comparison mode.\n";    
  399.     print "Using PHI locations in $gs_file as comparison. Output files will be:\n";
  400.     print "$output_file: the PHI locations found by the code.\n";
  401.     print "$debug_file: debug info about the PHI locations.\n";
  402.     #check if the gold std file exists
  403.     open GS, $gs_file or die "Cannot open $gs_file. Make sure that the gold standard file exists!\n";   # GS = Gold Standard file
  404.     close GS;
  405. }
  406. else {
  407.     #check if we can open the .res file
  408.     open F, ">$deid_text_file" or die "Cannot open $deid_text_file";
  409.     close F;
  410.     print "Running deid in output mode. Output files will be: \n";
  411.     print "$output_file: the PHI locations found by the code.\n";
  412.     print "$deid_text_file: the scrubbed text.\n";
  413.     print "$debug_file: debug info about the PHI locations.\n";
  414.      
  415. }
  416.  
  417. deid();
  418.  
  419. # Calls function stat() to calculate code performance statistics, if comparison mode = 1
  420. if ($comparison==1) {
  421.     require "stat.pm";
  422.     &stat($gs_file, $output_file);
  423.  
  424. }
  425.  
  426. # End of top level of code
  427. #***********************************************************************************************************
  428. #***********************************************************************************************************
  429. #***********************************************************************************************************
  430. sub numerically {
  431.               #print "a is $a  b is $b\n";
  432.               $a <=> $b;
  433.  }
  434. #***********************************************************************************************************
  435. #***********************************************************************************************************
  436. #***********************************************************************************************************
  437. # Reads in a file and pushes each line onto an array. Returns the array.
  438.  
  439. sub preload {
  440.     my ($file) = @_;
  441.     my @res = ();
  442.     open FILE, $file or die "Cannot open file $file";
  443.     while ($line = <FILE>) {
  444.     chomp $line;
  445.     push(@res, uc($line));
  446.     }
  447.     close FILE;
  448.     return @res;
  449. }
  450. # End of preload()
  451. #***********************************************************************************************************
  452. #***********************************************************************************************************
  453. #***********************************************************************************************************
  454. # Reads in a file and pushes each line onto an array. Returns the array.
  455.  
  456. sub preload_uc {
  457.     my ($file) = @_;
  458.     my @res = ();
  459.     open FILE, $file or die "Cannot open file $file";
  460.     while ($line = <FILE>) {
  461.     chomp $line;
  462.     push(@res, uc($line));
  463.     }
  464.     close FILE;
  465.     return @res;
  466. }
  467. # End of preload_uc()
  468. #***********************************************************************************************************
  469. #***********************************************************************************************************
  470. #***********************************************************************************************************
  471. # Reads in a file and creates a dictionary that records each line in that file under the given association, by mapping the line to a '1' in the dictionary of the association
  472.  
  473. sub preload_assoc {
  474.     my ($file,$assoc) = @_;
  475.     open FILE, $file or die "Cannot open file $file";
  476.  
  477.     while ($line = <FILE>) {
  478.     chomp $line;
  479.     $$assoc{uc($line)}=1;
  480.     }
  481.     close FILE;
  482. }
  483. # End of preload_assoc()
  484. #***********************************************************************************************************
  485. #***********************************************************************************************************
  486. #***********************************************************************************************************
  487. # Reads in a file and calls setup_hash on each line
  488.  
  489. sub setup_hash {
  490.     my ($file, $tname) = @_;
  491.     my @entry;
  492.     $typename[$ntype]= $tname;
  493.     open FILE, $file or die "Cannot open file $file";
  494.     while ($line = <FILE>) {
  495.     chomp $line;
  496.     &setup_item($ntype,$line);
  497.     }
  498.     $ntype++;
  499.     close FILE;
  500. }
  501. # End of setup_hash()
  502. #***********************************************************************************************************
  503. #***********************************************************************************************************
  504. #***********************************************************************************************************
  505. # Reads in a file and calls setup_hash on each line
  506. sub setup_lst_hash {
  507.     my ($tname, @hlst) = @_;
  508.     $typename[$ntype]= $tname;
  509.     foreach $line (@hlst) {
  510.     &setup_item($ntype,$line);
  511.     }
  512.     $ntype++;
  513. }
  514. # End of setup_lst_hash()
  515. #***********************************************************************************************************
  516. #***********************************************************************************************************
  517. #***********************************************************************************************************
  518. sub setup_item {
  519.     my ($type,$line) = @_;
  520.     my ($head, @lst) = split (/([^a-zA-Z0-9_\']+)/,uc($line));
  521.     my $ix = $type;
  522.  
  523.     if(@lst){
  524.  
  525.     push @extend, [@lst];
  526.     $ix = "$type,$#extend";
  527.     }
  528.     my $entry = $lhash{$head};
  529.     if ($entry){
  530.     $lhash{$head} .= "|" . $ix;
  531.     }
  532.     else{$lhash{$head}=$ix;
  533.      }
  534. }
  535.  
  536. # End of setup_item()
  537. #***********************************************************************************************************
  538. #***********************************************************************************************************
  539. #***********************************************************************************************************
  540. sub typename {
  541.     my ($num) = @_;
  542.     return($typename[$num]);
  543. }
  544. # End of typename()
  545. #***********************************************************************************************************
  546. #***********************************************************************************************************
  547. #***********************************************************************************************************
  548. # Takes in an array of words and compares them with hashes of known PHI. Recognizes and adds PHI using addType().
  549.  
  550. sub lookhash {
  551.     my @txtlst = @_;
  552.     my $pos = 0;
  553.     my $npos = 0;
  554.     my $tl,$ty;
  555.     my $txt = 1; # item is text not separator
  556.     my $item;
  557.  
  558.     while (@txtlst) {
  559.     $item = shift(@txtlst);
  560.  
  561.     $npos = $pos + length($item);
  562.     if ($item =~ /([a-zA-Z\']+)/ig) {
  563.         if($item){
  564.         $item = uc($item);
  565.         $tl = $lhash{$item};
  566.  
  567.         #if the term ends with 's, remove it and see if
  568.                 #there is a match in PHI hash
  569.         #if (!($tl) && $item =~/([a-zA-Z\'\-]+)\'s$/ig) {
  570.         if (!($tl) && $item =~/([a-zA-Z\']+)\'s$/ig) {
  571.             $tl = $lhash {$1};
  572.         }
  573.  
  574.         if($tl){
  575.  
  576.             # Compares with known PHI by calling bestmatch()
  577.             ($xpos, $done, @types)=&bestmatch($tl,@txtlst);
  578.             splice(@txtlst,0,$#txtlst-$done);
  579.             $npos += $xpos;
  580.  
  581.             foreach $typ (@types){
  582.             #print "item $item, adding type $type for position $pos-$npos\n";
  583.             #print "type is $typ , key is $pos - $npos  \n";
  584.             addType("$pos-$npos",$typ);    
  585.             #print "positions are $pos-$npos, typ is $typ\n";
  586.             }
  587.         }
  588.         }  #end if $item
  589.         $txt = 0;
  590.     }  #end if $txt
  591.     else {
  592.         $txt = 1;
  593.     }
  594.     $pos = $npos;
  595.    }
  596. }
  597. # End of lookhash()
  598. #***********************************************************************************************************
  599. #***********************************************************************************************************
  600. #***********************************************************************************************************
  601. sub bestmatch {
  602.    my ($tl,@txtlst) = @_;
  603.    my $pos = 0;
  604.    my $bestpos = 0;
  605.    my $bestrest = $#txtlst;
  606.    my @besttyp = ();
  607.    my ($type, $rest, $xpos, @nlst);
  608.  
  609.    foreach $ck (split '\|',$tl) {
  610.     ($type, $rest) = split ',',$ck;
  611.  
  612.     if($rest) {
  613.         ($xpos , @nlst) = &matchrest($rest,@txtlst);
  614.         if ($xpos) {
  615.         if($xpos > $bestpos) {
  616.  
  617.             $bestrest = $#nlst; $bestpos = $xpos;
  618.             @besttyp = ($typename[$type]);
  619.         }
  620.         elsif($xpos == $bestpos) {
  621.             push @besttyp,$typename[$type];
  622.         }
  623.         }
  624.     }
  625.     elsif ($bestpos == 0) {
  626.         push @besttyp,$typename[$type];
  627.     }
  628.    }
  629.    return ($bestpos, $bestrest, @besttyp);
  630. }
  631. # End of bestmatch()
  632. #***********************************************************************************************************
  633. #***********************************************************************************************************
  634. #***********************************************************************************************************
  635. sub matchrest {
  636.    my ($rest, @txtlst) = @_;
  637.    my @mlst = @{$extend[$rest]};
  638.    my $item;
  639.    my $pos = 0;
  640.    my $pm = 0;
  641.  
  642.    my $tmppos = 0;
  643.  
  644.    if ($mlst[0] eq '|') {
  645.     $pm = 1; shift(@mlst);
  646.    }
  647.    foreach $i (@mlst) {
  648.     if ($i !~ /([a-zA-Z\'\-]+)$/ig){
  649.         next;
  650.     }
  651.  
  652.     $item = shift(@txtlst);
  653.     $pos += length($item);
  654.  
  655.  
  656.     while ( $#txtlst >= 0 && ($item !~ /([a-zA-Z\'\-]+)/ig)){
  657.         $item = shift(@txtlst);
  658.         $pos += length ($item);
  659.         #print "item is $item, len is $#txtlst";
  660.     }
  661.  
  662.     if ($pm ? (uc($item) !~ /$i/) : ($i ne uc($item))) {
  663.         #print "pm is $pm, returning zero\n";
  664.         return 0;
  665.     }
  666.    }
  667.    return ($pos, @txtlst);
  668. }
  669. # End of matchrest()
  670. #***********************************************************************************************************
  671. #***********************************************************************************************************
  672. #***********************************************************************************************************
  673. # Function: setup()
  674. # Arguments: None
  675. # Returns: None
  676. # Called by: Topmost level of code
  677. # Description: Creates some lookup lists to have in memory
  678. sub setup {
  679.    
  680.    # This part is not necessary
  681.    open LP, $last_name_prefix_file or die "Cannot open $last_name_prefix_file";
  682.    while ($line = <LP>) {
  683.     chomp $line;
  684.     $prefixes{uc($line)} = 1;
  685.    }
  686.    close LP;
  687.  
  688.    #Added to reduce false positives
  689.    &setup_hash($medical_phrases_file,"MedicalPhrase");
  690.  
  691.    # Sets up hashes of some PHI lists for direct identification
  692.    if ($namefilter =~ /y/) {
  693.     &setup_hash($female_unambig_file,"Female First Name (un)");
  694.     &setup_hash($female_ambig_file,"Female First Name (ambig)");
  695.     &setup_hash($male_unambig_file,"Male First Name (un)");
  696.     &setup_hash($male_ambig_file,"Male First Name (ambig)");
  697.     &setup_hash($last_unambig_file,"Last Name (un)");
  698.     &setup_hash($last_popular_file,"Last Name (popular/ambig)");
  699.     &setup_hash($last_ambig_file,"Last Name (ambig)");
  700.     &setup_hash($female_popular_file, "Female First Name (popular/ambig)");
  701.     &setup_hash($male_popular_file, "Male First Name (popular/ambig)");
  702.    
  703.     if ($doctor_list =~ /y/) {
  704.         &setup_hash($doctor_first_unambig_file, "Doctor First Name");
  705.         &setup_hash($doctor_last_unambig_file, "Doctor Last Name");
  706.     }
  707.    
  708.    }
  709.  
  710.    if ($locfilter =~ /y/) {
  711.     if ($location_list =~ /y/) {
  712.         &setup_hash($locations_ambig_file,"Location (ambig)");
  713.         &setup_hash($locations_unambig_file,"Location (un)");
  714.      
  715.     } else {
  716.         @loc_unambig = ();
  717.         @more_loc_unambig = ();
  718.         @loc_ambig = ();
  719.     }
  720.  
  721.  
  722.     if ($hospital_list =~ /y/) {
  723.         &setup_hash($hospital_file,"Hospital");
  724.     }
  725.     if ($ethnicity_list =~ /y/) {      
  726.         &setup_hash($ethnicities_unambig_file, "Ethnicity");
  727.     }
  728.     if ($company_list =~ /y/) {
  729.         &setup_hash($companies_file, "Company");
  730.         &setup_hash($companies_ambig_file, "Company (ambig)");
  731.    
  732.     }
  733.     if ($country_list =~ /y/) {
  734.         &setup_hash($countries_file, "Country");
  735.     }
  736.  
  737.     if ($local_list =~ /y/){
  738.         &setup_hash($local_places_unambig_file, "Location (un)");
  739.         &setup_hash($local_places_ambig_file, "Location (ambig)");
  740.    
  741.     }
  742.    }
  743.  
  744.    # Preloads PHI in some lists into corresponding arrays    
  745.    @female_popular = &preload($female_popular_file);
  746.    @male_popular = &preload($male_popular_file);
  747.    #@last_name_prefixes = &preload_uc($last_name_prefixes_file);
  748.    @prefixes_unambig = &preload_uc($prefixes_unambig_file);
  749.    
  750.  
  751.    if ($hospital_list =~ /y/) {
  752.     @hospital = &preload($hospital_file);
  753.    } else {@hospital = ();}
  754.  
  755.    if ($state_list =~ /y/){
  756.     @us_states = &preload($us_states_file);
  757.     @us_states_abbre =  &preload($us_states_abbre_file);
  758.     @more_us_states_abbre =  &preload($more_us_states_abbre_file);
  759.    }
  760.  
  761.  
  762.    # Generates associations between PHI in some lists and PHI categories
  763.    &preload_assoc($common_words_file,"common_words");
  764.    &preload_assoc($medical_words_file,"common_words");
  765.  
  766.    &preload_assoc($very_common_words_file,"very_common_words");
  767.    &preload_assoc($unambig_common_words_file, "unambig_common_words");
  768.    &preload_assoc($male_unambig_file, "male_unambig");
  769.    &preload_assoc($female_unambig_file, "female_unambig");
  770.    &preload_assoc($female_ambig_file, "female_ambig");
  771.    &preload_assoc($male_ambig_file, "male_ambig");
  772.    &preload_assoc($last_ambig_file, "last_ambig");
  773.    &preload_assoc($male_popular_file, "male_popular");
  774.    &preload_assoc($female_popular_file, "female_popular");
  775.    &preload_assoc($us_area_code_file,"us_area_code");
  776.  
  777.     # Opens debug file for debugging
  778.    open D, ">".$debug_file;
  779.    close D;
  780. }
  781. # End of setup()
  782. #***********************************************************************************************************
  783. #***********************************************************************************************************
  784. #***********************************************************************************************************
  785. # Function: deid()
  786. # Arguments: None
  787. # Returns: None
  788. # Called by: Topmost level of code
  789. # Description: One of the 2 major branches of the code
  790. # This function takes over de-identification if no performance statistic is required
  791. # Function first loads the patient name list file.
  792. # Then reads the medical text line by line.
  793. #  If the line indicates START_OF_RECORD, read the Patient ID (PID), Note ID (NID), (and Note Date if any) info
  794. # Scan for PHI a paragraph at a time
  795. # output PHI locations into .phi file
  796. my $currentID;
  797. my @known_phi;
  798. my @known_first_name;
  799. my @known_last_name;
  800. my %pidPtNames; # key = pid, value = [0] first name [1] last name
  801.  
  802. #***********************************************************************************************************
  803. #***********************************************************************************************************
  804. #***********************************************************************************************************
  805.  
  806. sub deid {
  807.    
  808.    $allText = "";
  809.    $allallText = "";
  810.    open DF, $data_file or die "Cannot open $data_file";
  811.  
  812.    my $paraCount = 0;
  813.  
  814.    my $stpos = 0;
  815.  
  816.  
  817.    my %deids; #key = start index, value = array of (end index, ID)
  818.    my %phiTerms; #key = lc(word), value = # of occurrences
  819.    my %phiT;
  820.    my %phiTT;
  821.    my $noteDate;
  822.    my $line;
  823.  
  824.    $currentID = 0; #initialize current ID to a non-existent PID 0
  825.  
  826.    # Code runs through text by paragraph so things that extend over lines aren't missed
  827.     $para = "";
  828.  
  829.     #load the patient name file
  830.     if ($pid_patientname_list =~ /y/) {
  831.     open PF, $patient_file or die "Cannot open $patient_file";
  832.    
  833.     while ($pfline = <PF>) {
  834.         chomp $pfline;
  835.  
  836.         if ($pfline =~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig) {       
  837.         my $pid = $1;
  838.  
  839.         $known_first_names = $3;
  840.         $known_last_names = $5;        
  841.         $pidPtNames{$pid}[0] = $known_first_names;
  842.         $pidPtNames{$pid}[1] = $known_last_names;      
  843.        
  844.         }# end if pfline = ~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig)
  845.     } # end while $pfline = <PF>
  846.     } #end if pid_patient_name_list ~= y
  847.     ###End loading the patient names
  848.  
  849.     while ($line = <DF>) {
  850.    
  851.     #If this is a new record, set PID, Note ID
  852.     #we assume all notes for the same patient go together
  853.     if ( $line =~ /\ASTART_OF_RECORD=(([0-9]+)(\|)+([0-9]+)(\|)+(\d\d\/\d\d\/\d{4})?)/) {
  854.        
  855.        # $currentNote = $2;
  856.        # $thePT = $4;
  857.        # $noteDate =$6; #noteDate is empty in gold std
  858.  
  859.         #print "pt $thePT, currentNote $currentNote\n";
  860.  
  861.         #if it's gold standard, the header contains PID then NID
  862.        
  863.         $currentNote = $4;
  864.         $thePT = $2;
  865.         $noteDate =$6; #noteDate is empty in gold std
  866.  
  867.         my $label = "Patient $thePT\tNote $currentNote";
  868.         open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  869.         print DEST "$label\n";
  870.         close DEST;
  871.        
  872.             #Gold Standard corpus does not specify note date, so assign a default
  873.             #If you would like deid to date shift the notes on a per note basis, make sure you
  874.             #specify the record date in the header!
  875.         if (length($noteDate) ==0){
  876.             #$noteDate="01/01/2020";
  877.         $noteDate = $DEFAULT_DATE;
  878.         }
  879.  
  880.         #if this is a new patient, set the PID, and lookup patient name
  881.         if ($thePT != $currentID) {  #This is a new patient!   
  882.         $currentID = $thePT;
  883.         %phiTerms = ();  #clear the phiTerms on new pt
  884.  
  885.  
  886.         # Find the patient name for the current PID
  887.         #lookup first and last name of this patient
  888.         if ($pid_patientname_list =~ /y/ && exists $pidPtNames{$currentID} ) {
  889.            
  890.             $known_first_names = $pidPtNames{$currentID}[0];
  891.             $known_last_names = $pidPtNames{$currentID}[1];
  892.            
  893.             #print "Found patient names, first = $known_first_names   last = $known_last_names\n";
  894.  
  895.             if ($known_first_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
  896.                 @known_first_name = ($1, $2);}
  897.             else {
  898.             if ($known_first_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
  899.                 @known_first_name = ($3);
  900.             }
  901.                
  902.             elsif ($known_first_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
  903.                 @known_first_name = ($1);
  904.             }
  905.             else {
  906.                 @known_first_name = ($known_first_names);
  907.             }
  908.             }
  909.            
  910.             if ($known_last_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
  911.             @known_last_name = ($1, $2);}
  912.             else {
  913.             if ($known_last_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
  914.                 @known_last_name = ($3);
  915.             }
  916.             elsif ($known_last_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
  917.                 @known_last_name = ($1);
  918.             }
  919.            
  920.             else {
  921.                 @known_last_name = ($known_last_names);
  922.             }
  923.             }
  924.  
  925.  
  926.         } else {  #if no pid/patient name file, just set the first name and last name to null
  927.             @known_first_name = ();
  928.             @known_last_name = ();
  929.         }
  930.         } # end if this is a new patient
  931.  
  932.         #output the header to output file      
  933.             $allText = "";       #reset,
  934.         $allallText = "";    #reset
  935.         $stpos = 0;
  936.         $para = "";
  937.  
  938.         #if output mode, output the header line (with patient and note ID) to .res file
  939.             if ($comparison == 0) {
  940.           open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file";   #now open in append mode
  941.           print TF "\n$line";
  942.           close TF;
  943.             }  
  944.         next; #skip to next line
  945.       }  #end if this is start of a record
  946.     else {  #else this is not the start of a record, just append the line to the end of the current text
  947.         chomp $line;  
  948.         $allText .= $line."\n";
  949.         $allallText .= $line."\n";   
  950.            
  951.             #$myline = $line;
  952.             #chomp $myline;
  953.         #$allText .= $myline."\n";
  954.         #$allallText .= $myline."\n";    
  955.     }
  956.  
  957.     #Look for paragraph separator: if this is a line is entirely non-alphanumeric or
  958.         #if it starts with spaces, or if it is an empty line, or if this line marks the end of record,
  959.         #then call findPHI() for the current paragraph we have so far.
  960.         #If end of record is encoutnered, output the de-id text; else if
  961.         #it's not end of record yet, append the line to the paragraph.
  962. #   if ( (!($line =~ /[a-zA-Z\d]+/)) || $line =~ /^ +/ || $line eq "" || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {  
  963.     if (   (!($line =~ /[a-zA-Z\d]+/)) || ($line eq "") || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {  
  964.         if ($para =~ /\w/ ){  #if para contains alphanumeric
  965.                
  966.                 # Calls findPHI() with current paragraph; resulting PHI locations are stored in %phiT
  967.         #%phiT = findPHI("Para $paraCount", $date, $stpos, $para);
  968.         %phiT = findPHI("Para $paraCount", $noteDate, $stpos, $para);
  969.         $paraCount++;
  970.         # %phiT is copied over to %phiTT
  971.         foreach $x (sort numerically keys %phiT) {
  972.             @{$phiTT{$x}} = @{$phiT{$x}};
  973.         }
  974.        
  975.         #Sorts keys in %phiT; outputs text accordingly
  976.         foreach $k (keys %phiT) {        
  977.             my ($start, $end) = split '-', $k;
  978.             # $deids_end = ${@{$deids{$start}}}[0]; #does not work with perl v5.10
  979.             my @deidsval =  ${@{$deids{$start}}};
  980.             $deids_end = $deidsval[0];
  981.                
  982.             $found = $phiT{$k};
  983.             foreach $t (@{$phiT{$k}}) {
  984.             }
  985.             my $word = lc(substr $allallText, $start, ($end - $start));
  986.  
  987.             #print "Key in PhiT = $k, word = $word\n"; #DEBUG
  988.  
  989.            # if ($end > ${@{$deids{$start}}}[0]) {  
  990.             if ($end > $deidsval[0]) {
  991.             $deids{$start}[0] = $end;
  992.             $deids{$start}[1] = $currentID;
  993.             $deids{$start}[2] = $noteDate;                     
  994.             }
  995.  
  996.  
  997.             #############################################################################
  998.             #Now remember the PHI terms that are important names for checking for repeated occurrences of PHIs
  999.             #PHI Name Tags
  1000.                     #(NI)       Name indicators
  1001.                     #(LF)       Lastname Firstnames
  1002.                     #(PTitle)   plural titles
  1003.                     #(MD)       followed by  "MD" or "M.D"
  1004.                     #(PRE)      checks up to 3 words following "PCP Name" ("PCP", "physician", "provider", "created by", "name");
  1005.                     #(NameIs)   followed by pattern "name is"
  1006.                     #(Prefixes) for @prefixes_unambig)
  1007.                     #(STitle)   @specific_titles = ("MR", "MISTER", "MS");
  1008.                     #(Titles)       @titles
  1009.                     #(NamePattern)  all other name patterns in sub name3
  1010.             #remember all the PHI of type name (strict_titles) and name (indicators)
  1011.             #print "checking for repeated occurences of PHIs: word is $word, phitype is (@{$phiT{$k}})\n";
  1012.  
  1013.             if ( ($word !~ /\d/) && ((length $word) > 3) && !(isCommon($word)) &&
  1014.              ( isPHIType( "(NI)", @{$phiT{$k}}) ||
  1015.                isPHIType( "(PTitle)", (@{$phiT{$k}})) || isPHIType( "(LF)", (@{$phiT{$k}})) ||  
  1016.                isPHIType( "(NamePattern)", (@{$phiT{$k}})) ||
  1017.                isPHIType( "(MD)", (@{$phiT{$k}})) ||                         
  1018.                isPHIType( "(NameIs)", (@{$phiT{$k}})) || isPHIType("(STitle)",  (@{$phiT{$k}})) ||
  1019.                isPHIType("(Titles)", (@{$phiT{$k}}) ))
  1020.             ) {    
  1021.                 #$phiTerms{$word}++;
  1022.                 if (!(exists $phiTerms{$word})) {
  1023.                
  1024.                 $phiTerms{$word} = 1;}
  1025.                 else {
  1026.                
  1027.                 $phiTerms{$word} = $phiTerms{$word}+ 1;
  1028.                 }
  1029.  
  1030.  
  1031.             } #end if
  1032.         }  # end foreach $k (keys %phiT)
  1033.             } #end if ($para =~ /\W/)
  1034.  
  1035.         if ($line =~ /\|\|\|\|END_OF_RECORD/ ) {  
  1036.        
  1037.         open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  1038.         ####################################################
  1039.         #check for repeated occurences of PHIs for this note
  1040.         while ($allallText =~ /\b([A-Za-z]+)\b/g) {
  1041.             my $token = $1;
  1042.             my $start = (length ($`));  #$` is the string preceding what was matched by the last successful match
  1043.             my $end = $start + length($token);
  1044.  
  1045.            if (!(exists    $deids{$start})   ) {
  1046.             #if (!(exists ${@{$deids{$start}}}[0])) {
  1047.               L:
  1048.             foreach $word (keys %phiTerms) {
  1049.        
  1050.                 if ( (uc($token) eq uc($word))  ) {
  1051.                
  1052.                 $deids{$start}[0] = $end;
  1053.                 $deids{$start}[1] = $currentID;
  1054.                 $deids{$start}[2] = $noteDate;
  1055.                
  1056.                 $term = substr $text, $start, ($end - $start +1);
  1057.                 $outstr = "$start \t $end \t $term \t Name (Repeated Occurrence) \n";
  1058.                 print DEST $outstr;
  1059.                
  1060.                 next L;
  1061.                 } #end if
  1062.             } # end foreach
  1063.             } # end if
  1064.         } # end while
  1065.         #end checking for repeated occurences of PHIs
  1066.         close DEST;
  1067.         #####################################################
  1068.  
  1069.         ##output PHI locations to the .phi file
  1070.         open OUTF, ">>$output_file" or die "Cannot open $output_file";
  1071.         print OUTF "\nPatient $currentID\tNote $currentNote";
  1072.         foreach $k (sort numerically keys %deids) {
  1073.             my @deidvals = @{$deids{$k}};
  1074.             $thisend = $deidvals[0];
  1075.             if ($thisend ){
  1076.             print OUTF "\n$k\t$k\t$thisend";
  1077.             }
  1078.         }
  1079.         close OUTF;
  1080.  
  1081.         ###output de-ided text to .res file
  1082.         if ($comparison==0) {
  1083.             outputText(\%deids, \%phiTT);  
  1084.         }
  1085.  
  1086.         #now that we have output text for this record, we reset the
  1087.                 #variables to get ready for the next record
  1088.         $para = "";
  1089.         $paraCount = 0;
  1090.         $stpos = 0;
  1091.        
  1092.         %deids=(); #clear the deid hash
  1093.         %phiTT=(); #clear the phiTT hash
  1094.         %phiT=();
  1095.  
  1096.  
  1097.         $allText = "";       #reset,
  1098.         $allallText = "";    #reset
  1099.  
  1100.         } else {  # this is not end of record yet ...
  1101.         my $tmp = length($para);
  1102.         $stpos += length ($para);
  1103.         $para = $line.' ';
  1104.         }
  1105.  
  1106.                
  1107.     }  #end if line starts with empty spaces || empty line || end of record
  1108.     else { # else this line is still a part of the current paragraph
  1109.         #$para .= ' '.$line;  #just append to end of current paragraph
  1110.         #$para .= $line.' '; #just append to end of current paragraph
  1111.          if ($line eq "") {
  1112.         $para .= "\n";
  1113.         } else {
  1114.         $para .= $line.' '; #just append to end of current paragraph
  1115.         }
  1116.     }
  1117.     } #end while ($line=<DF>)
  1118.    
  1119.     close DF;
  1120.  
  1121. }
  1122. # End of deid()
  1123.  
  1124.  
  1125.  
  1126.  
  1127.  
  1128. #***********************************************************************************************************
  1129. #***********************************************************************************************************
  1130. #***********************************************************************************************************
  1131. # Function: findPHI()
  1132. # Arguments: string $label ("Para $paraCount"), string $date (e.g. "yyyy-mm-dd"), int $startIndex (normally 0), string $text (paragraph of text)
  1133. # Returns: hash %approved (key=start-end of each PHI, value=PHI types)
  1134. # Called by: deid()
  1135. # Description: Dispatched from deid() to perform de-identification
  1136. # Reads in a paragraph of text and runs the de-identification algorithm on it
  1137.  
  1138.  
  1139. sub findPHI {
  1140.     my ($label, $curr_date, $startIndex, $text) = @_;
  1141.  
  1142.  
  1143.     # Initializes the hash %phi which stores PHI locations and their PHI types
  1144.     %phi = (); #key = start-end, value = (type1, type2, ...)
  1145.     local %end = (); # key = start, value = end (dynamic scope so addType can do it)
  1146.     if ($text !~ /\w/) {   
  1147.     return %phi;
  1148.     } #[wjl] skip blank lines
  1149.    
  1150.     # Splits the text into separate items at spaces
  1151.     #my @txtlst = split (/([^a-zA-Z0-9_\'\-]+)/,$text); # used by lookhash and appx match
  1152.     my @txtlst = split (/([^a-zA-Z0-9_\']+)/,$text);
  1153.    
  1154.     # Performs exact matching with the hashes of PHI lists
  1155.     &lookhash(@txtlst);
  1156.  
  1157.     # Initializes hash %phik which stores only start and end indices of each PHI in %phi
  1158.     %phik = ();
  1159.  
  1160.     foreach $k (keys %phi) {
  1161.     ($st,$end) = split ('-',$k);
  1162.     $phik{$st} = $end;
  1163.     }  
  1164.  
  1165.  
  1166.     # Calls each filter module
  1167.  
  1168.     if ($datefilter =~ /y/) {
  1169.     &commonHoliday($text, 0);
  1170.     &date ($text, $curr_date);
  1171.     &dateWithContextCheck($text, $curr_date);
  1172.     &yearWithContextCheck($text, $curr_date);
  1173.     &seasonYear ($text, 0);
  1174.     }
  1175.  
  1176.     if ($telfilter =~ /y/) {
  1177.     &telephone($text, 0);
  1178.     &pager ($text, 0);
  1179.     }
  1180.  
  1181.     if ($locfilter =~ /y/) {
  1182.     &wardname($text, 0);
  1183.     &location1 ($text, 0);
  1184.     &location2 ($text, 0);
  1185.     }
  1186.  
  1187.     if ($emailfilter =~ /y/) {
  1188.     &email ($text, 0);
  1189.     }
  1190.  
  1191.     if ($urlfilter =~ /y/) {
  1192.     &url ($text, 0);
  1193.     }
  1194.  
  1195.     if ($ssnfilter =~ /y/) {   
  1196.     &ssn ($text, 0);
  1197.     }
  1198.  
  1199.     if ($agefilter =~ /y/) {
  1200.     &age ($text, 0);
  1201.     }
  1202.  
  1203.     if ($namefilter =~ /y/) {
  1204.  
  1205.     &name1 ($text, 0);
  1206.     &name2 ($text, 0);
  1207.     &name3 ($text, 0);
  1208.     &knownPatientName($text, 0);
  1209.     &problem ($text, 0);
  1210.     &signatureField ($text, 0);
  1211.     }
  1212.  
  1213.     if ($unitfilter =~ /y/) {
  1214.     &mrn ($text, 0);
  1215.     &unit ($text, 0);
  1216.     &providerNumber ($text, 0);
  1217.     }
  1218.  
  1219.  
  1220.    
  1221.     if ($ds_specific_filter =~ /y/){
  1222.     #discharge summary specific filters
  1223.     #filter not enabled for this version
  1224.     #&dischargeSummarySpecific ($text, 0);
  1225.     }
  1226.  
  1227.  
  1228.     # Call new function here >>>>>>>>>>
  1229.     # Follow format shown here if necessary
  1230.     # &functionName ($text, 0);
  1231.  
  1232.     open DEST, ">>".$debug_file or die "Cannot open $debug_file";
  1233.     #print DEST "$label\n";
  1234.  
  1235.     # Sub-function: finalPHICheck()
  1236.     # After findPHI() has performed most PHI checks, goes through the identified PHI before adding them to the final PHI files
  1237.    
  1238.     my %approved;
  1239.     my ($startp, $endp) = (0, 0);
  1240.     my $notAmbig = 0;
  1241.     my $stg1 = "";
  1242.     my $stg2 = "";
  1243.     my $prevAmbig = 0;
  1244.     my $oldk = "";
  1245.     my $prevKey = "";
  1246.     my ($oldstartp, $oldendp,) = (0, 0);
  1247.     my $oldtext = "";
  1248.    
  1249.     # Prunes keys and checks whether each PHI is ambiguous or is an indicator (e.g. hospital indicator)
  1250.     foreach $k (&pruneKeys("phi",$text)) {
  1251.     ($startp, $endp) = split "-", $k;
  1252.  
  1253.     my $the_word = (substr $text, $startp, ($endp - $startp));
  1254.  
  1255.  
  1256.     $notAmbig = 0;  #so by default, the term is ambiguous
  1257.     foreach $tt (@{$phi{$k}}){
  1258.         #if(($tt !~ /ambig/) && ($tt !~ /Indicator/)    ) {
  1259.         if (($tt !~ /ambig/) && ($tt !~ /Indicator/)  && ($tt !~ /MedicalPhrase/)) {
  1260.         #so IF the term matches ANY type that's non-ambiguous, THEN set it as non-ambiguous
  1261.         $notAmbig = 1; last;}
  1262.     }  #end for each
  1263.  
  1264.        
  1265.     my $notIndicator = 1; #default to be not an indicator
  1266.    
  1267.     foreach $tt (@{$phi{$k}}){
  1268.         if ($tt =~ /Indicator/) {
  1269.         $notIndicator = 0; last;
  1270.         }
  1271.     }
  1272.    
  1273.     $prevText = (substr $text, $oldstartp, ($oldendp - $oldstartp));
  1274.     $newText = (substr $text, $startp, ($endp - $startp));
  1275.        
  1276.  
  1277.     $a = (isType($prevKey, "Male First Name", 1) && ($prevAmbig==1) && (!isCommon($prevText)));
  1278.     $b = (isType($k, "Last Name", 1) && ($notAmbig==0) && (!isCommon($newText)));
  1279.    
  1280.  
  1281.     #if (this is ambig) and (previous is ambig) and ...
  1282.     if ((($notAmbig==0) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1283.     #if (this is not-ambig) and (previous is ambig) and ...
  1284.         (($notAmbig==1) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1285.     #if (this is not-ambig) and (previous is ambig) and ...
  1286.         (($notAmbig==1) && ($prevAmbig==1) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3)) ||
  1287.         #commented out on 1/31/07
  1288.         (($notAmbig==0) && ($prevAmbig==0) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
  1289.  
  1290.     #if (this is ambig) and (previous is not ambig) and ...
  1291.         (($notAmbig==0) && ($prevAmbig==0) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText))  && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3))) {
  1292.  
  1293.         print DEST ($startIndex + $oldstartp)."\t".($startIndex+$oldendp)."\t".(substr $text, $oldstartp, ($oldendp - $oldstartp +1));
  1294.  
  1295.  
  1296.         ###################
  1297.  
  1298.         my $oldtext = $text;
  1299.         my $newKey = ($startIndex + $oldstartp)."-".($startIndex + $oldendp);
  1300.         ###my $text = (substr $text, $oldstartp, ($oldendp - $oldstartp));
  1301.         foreach $tt (@{$phi{$prevKey}}) {
  1302.         print DEST "\t$tt";
  1303.         push @{$approved{$newKey}}, $tt;}
  1304.         print DEST "\n";
  1305.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $oldtext, $startp, ($endp - $startp +1));
  1306.        
  1307.         my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
  1308.         ###my $text = (substr $oldtext, $startp, ($endp - $startp));
  1309.         foreach $tt (@{$phi{$k}}) {
  1310.         print DEST "\t$tt";
  1311.         push @{$approved{$newKey}}, $tt;
  1312.         }
  1313.         print DEST "\n";
  1314.  
  1315.  
  1316.     }
  1317.    
  1318.     # If the PHI is not ambiguous and not an indicator, recognizes it as PHI; add it to PHI file
  1319.     elsif ($notAmbig && $notIndicator) {
  1320.  
  1321.         ###################
  1322.  
  1323.        
  1324.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $text, $startp, ($endp - $startp +1));
  1325.         my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
  1326.         ###my $text = (substr $text, $startp, ($endp - $startp));
  1327.         foreach $tt (@{$phi{$k}}){     
  1328.         print DEST "\t$tt";
  1329.         if (($tt !~ /ambig/) && ($tt !~ /Indicator/)) {
  1330.             push @{$approved{$newKey}}, $tt;
  1331.         }
  1332.         }
  1333.         print DEST "\n";
  1334.  
  1335.     } # Else ck keys discarded
  1336.    
  1337.     else {
  1338.         ###################
  1339.        
  1340.         # Otherwise keeps the remaining PHI as non-PHI
  1341.         print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t# ".(substr $text, $startp, ($endp - $startp +1));
  1342.         foreach $tt (@{$phi{$k}}) {
  1343.         print DEST "\t$tt";
  1344.         }
  1345.         print DEST "\n";}
  1346.  
  1347.     # Sets ambiguous variables for current PHI to be recognized as previous PHI for the next round
  1348.     if ($notAmbig==0) {
  1349.         $prevAmbig = 1;
  1350.         $prevKey = $k;
  1351.         ($oldstartp, $oldendp) = split "-", $prevKey;
  1352.         $oldtext = $text;
  1353.  
  1354.     }
  1355.     else {
  1356.         $prevAmbig = 0;
  1357.     }
  1358.     }    
  1359.     close DEST;
  1360.  
  1361.     # End of sub-function finalPHICheck()
  1362.     #***********************************************************************************************************
  1363.  
  1364.  
  1365.     return %approved;
  1366. }
  1367. # End of findPHI()
  1368.  
  1369.  
  1370.  
  1371. #***********************************************************************************************************
  1372. #***********************************************************************************************************
  1373. #***********************************************************************************************************
  1374. # Function: date ()
  1375. # Searches for date ranges following formats that appear most frequently in text
  1376. sub date {
  1377.     #$text = $_[0];    
  1378.     my ($text, $date) = @_;
  1379.     my $year = substr $date, 0, 4;  
  1380.     # Checks if dates should be filtered
  1381.     if ($datefilter =~ /y/) {
  1382.    
  1383.     # Searches for the pattern mm/dd-mm/dd where the items are valid dates
  1384.     while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?))\b/ig) {
  1385.         if (isValidDate($2,$3,-1) && isValidDate($4,$5,-1)) {
  1386.         $date_range = $1;
  1387.         $start = length($`);
  1388.         $end = $start + length($date_range);
  1389.         $key = "$start-$end";
  1390.         addType($key, "Date range (1)");
  1391.         }
  1392.     }      
  1393.  
  1394.     # Searches for mm/dd/yy-mm/dd/yy or mm/dd/yyyy-mm/dd/yyyy where the items are valid dates
  1395.     while ($text =~ /\b((\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
  1396.         if (isValidDate($2,$3,$4) && isValidDate($5,$6,$7)) {
  1397.         $date_range = $1;
  1398.         $start = length($`);
  1399.         $end = $start + length($date_range);
  1400.         $key = "$start-$end";
  1401.         addType($key, "Date range (2)");
  1402.         }
  1403.     }      
  1404.    
  1405.     # Searches for mm/dd-mm/dd/yy or mm/dd-mm/dd/yyyy where the items are valid dates
  1406.     while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
  1407.         if (isValidDate($6,$2,$3) && isValidDate($6,$4,$5)) {
  1408.         $date_range = $1;
  1409.         $start = length($`);
  1410.         $end = $start + length($date_range);
  1411.         $key = "$start-$end";
  1412.         addType($key, "Date range (3)");
  1413.         }
  1414.     }  
  1415.     } #end if date filter is on
  1416.     # End of sub-function date1()
  1417.  
  1418.     if ($datefilter =~ /y/) {  
  1419.     # Checks for month/day/year
  1420.     while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g) {
  1421.         my $startI = (length($`));
  1422.         my $endI = $startI + length($&);
  1423.         my $key = $startI."-".$endI;
  1424.         my $first_num = $1;
  1425.         my $second_num = $2;
  1426.         my $third_num = $3;
  1427.         my $beginr = substr $text, ($startI - 2), 2;
  1428.         my $endr = substr $text, $endI, 2;
  1429.  
  1430.  
  1431.         if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
  1432.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1433.  
  1434.             #print "checking M/D/Y, $first_num, $second_num, $third_num\n";
  1435.  
  1436.             if (isValidDate ($first_num, $second_num, $third_num)) {
  1437.             addType ($key, "Month/Day/Year");
  1438.             }
  1439.         }
  1440.         }
  1441.     } #end while
  1442.    
  1443.     # Checks for month/day/year
  1444.     while ($text =~ /\b(\d\d?)\.(\d\d?)\.(\d\d|\d{4})\b/g) {
  1445.         my $startI = (length($`));
  1446.         my $endI = $startI + length($&);
  1447.         my $key = $startI."-".$endI;
  1448.         my $first_num = $1;
  1449.         my $second_num = $2;
  1450.         my $third_num = $3;
  1451.         my $beginr = substr $text, ($startI - 2), 2;
  1452.         my $endr = substr $text, $endI, 2;
  1453.        
  1454.         #print "2. checking M/D/Y, $first_num, $second_num, $third_num\n";
  1455.  
  1456.         if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
  1457.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1458.             if (isValidDate ($first_num, $second_num, $third_num)) {
  1459.             addType ($key, "Month/Day/Year");
  1460.             }
  1461.         }
  1462.         }
  1463.     }            
  1464.  
  1465.     # Checks for day/month/year
  1466.     while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g){
  1467.         my $startI = (length($`));
  1468.         my $endI = $startI + length($&);
  1469.         my $key = $startI."-".$endI;
  1470.         my $first_num = $1;
  1471.         my $second_num = $2;
  1472.         my $third_num = $3;
  1473.         my $beginr = substr $text, ($startI - 2), 2;
  1474.         my $endr = substr $text, $endI, 2;
  1475.        
  1476.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1477.         if (isValidDate ($second_num, $first_num, $third_num)) {
  1478.         }
  1479.         }
  1480.     }
  1481.    
  1482.     # Checks for year/month/day
  1483.     while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
  1484.         my $startI = (length($`));
  1485.         my $endI = $startI + length($&);
  1486.         my $key = $startI."-".$endI;
  1487.         my $yr = $1;
  1488.         $nn_year = $yr;
  1489.         my $mo = $2;
  1490.         my $da = $3;
  1491.         my $beginr = substr $text, ($startI - 2), 2;
  1492.         my $endr = substr $text, $endI, 2;
  1493.        
  1494.        
  1495.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1496.         if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1497.             $prevChars = (substr $text, ($startI-4), 4);
  1498.             $nextChars = (substr $text, $endI, 11);
  1499.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1500.             addType ($key, "Header Date");
  1501.             $longyear = $yr;
  1502.             }
  1503.             else {
  1504.             addType ($key, "Year/Month/Day");
  1505.             }
  1506.         }
  1507.         }
  1508.     } #end while
  1509.    
  1510.    
  1511.    
  1512.     # Checks for year/month/day
  1513.     while ($text =~ /\b(\d\d|\d{4})\.(\d\d?)\.(\d\d?)\b/g){
  1514.         my $startI = (length($`));
  1515.         my $endI = $startI + length($&);
  1516.         my $key = $startI."-".$endI;
  1517.         my $yr = $1;
  1518.         $nn_year = $yr;
  1519.         my $mo = $2;
  1520.         my $da = $3;
  1521.         my $beginr = substr $text, ($startI - 2), 2;
  1522.         my $endr = substr $text, $endI, 2;
  1523.        
  1524.        
  1525.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1526.         #if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1527.         if (isValidDate ($mo, $da, $yr)) {
  1528.             $prevChars = (substr $text, ($startI-4), 4);
  1529.             $nextChars = (substr $text, $endI, 11);
  1530.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1531.             addType ($key, "Header Date");
  1532.             $longyear = $yr;
  1533.             }
  1534.             else {
  1535.             addType ($key, "Year/Month/Day");
  1536.             }
  1537.         }
  1538.         }
  1539.     }
  1540.    
  1541.    
  1542.    
  1543.     # Checks for year/day/month
  1544.     while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
  1545.         my $startI = (length($`));
  1546.         my $endI = $startI + length($&);
  1547.         my $key = $startI."-".$endI;
  1548.         my $yr = $1;
  1549.         $nn_year = $yr;
  1550.         my $mo = $3;
  1551.         my $da = $2;
  1552.         my $beginr = substr $text, ($startI - 2), 2;
  1553.         my $endr = substr $text, $endI, 2;
  1554.        
  1555.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
  1556.         #if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
  1557.         if (isValidDate ($mo, $da, $yr)) {
  1558.             $prevChars = (substr $text, ($startI-4), 4);
  1559.             $nextChars = (substr $text, $endI, 11);
  1560.            
  1561.             if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
  1562.             addType ($key, "Header Date");
  1563.             $longyear = $yr;
  1564.             }
  1565.             else {
  1566.             }
  1567.         }
  1568.         }
  1569.     } #end while
  1570.  
  1571.     # Checks for month/4-digit year
  1572.     while ($text =~ /\b((\d\d?)[\-\/](\d{4}))/g) {
  1573.         my $startI = (length($`));
  1574.         my $endI = $startI + length($&);
  1575.         my $beginr = substr $text, ($startI - 2), 2;
  1576.         my $endr = substr $text, $endI, 2;
  1577.         my $first_num = $2;
  1578.         my $second_num = $3;
  1579.         my $st = length($`);
  1580.         my $endb = $st + length ($2) + length($3) + 1;
  1581.         my $key = "$st-$endb";
  1582.         if (($beginr !~ /\|\|/) && ($endr !~ /\|\|/)) {
  1583.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /[\/\.\%]/)) {
  1584.             #if (($first_num <= 12) && ($first_num > 0) && ($second_num>=1900)) {
  1585.             if (($first_num <= 12) && ($first_num > 0) &&
  1586.             ( $second_num >= $VALID_YEAR_LOW &&  $second_num <= $VALID_YEAR_HIGH  )  ) {
  1587.             addType ($key, "Month/Year 1"); }}}
  1588.     } #end while
  1589.    
  1590.         # Checks for 4-digit year/month
  1591.     while ($text =~ /\b((\d{4})[\-\/](\d\d?))\b/g) {
  1592.         my $first_num = $2;
  1593.         my $second_num = $3;
  1594.         my $st = length($`);
  1595.         my $endb = $st + length ($2) + length($3) + 1;
  1596.         my $key = "$st-$endb";
  1597.         if (($begin !~ /\d[\/\.\-]/) && ($end !~ /[\/\.\%]/)) {
  1598.         #if (($second_num <= 12) && ($second_num > 0) && ($first_num>=1900) && ($first_num<2010)) {
  1599.        
  1600.         if (($second_num <= 12) && ($second_num > 0) && ($first_num>=$VALID_YEAR_LOW) && ($first_num <= $VALID_YEAR_HIGH)) {
  1601.             addType ($key, "Year/Month"); }}
  1602.     } #end while
  1603.  
  1604.    
  1605.     # Checks for spelled-out months
  1606.     # Accounts for ambiguity around the dates, e.g. acronyms for measurements, spelled out months and such
  1607.     foreach $m (@months) {
  1608.        
  1609.         while ($text =~ /\b((\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1610.         my $day = $2;
  1611.         my $completeDate = $1;
  1612.         my $st = length($`);
  1613.         my $key = "$st-".($st + length($1));
  1614.         if (($day < 32) && ($day > 0)) {
  1615.             addType ($key, "Day Month Year");
  1616.         }
  1617.         }
  1618.  
  1619.         while ($text =~ /\b((\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1620.         my $day1 = $2;
  1621.         my $day2 = $4;
  1622.         my $completeDate = $1;
  1623.         my $st = length($`);
  1624.         my $key = "$st-".($st + length($1));
  1625.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 >0)) {
  1626.             addType ($key, "Date range (4)");
  1627.         }
  1628.         }
  1629.        
  1630.         while ($text =~ /\b((\d{1,2}) ?\-\> ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
  1631.         my $day1 = $2;
  1632.         my $day2 = $3;
  1633.         my $completeDate = $1;
  1634.         my $st = length($`);
  1635.         my $key = "$st-".($st + length($1));
  1636.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1637.             addType ($key, "Date range (5)");
  1638.         }
  1639.         }
  1640.        
  1641.        
  1642.         while ($text =~ /\b($m\b\.? (\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1643.    
  1644.         my $day = $2;
  1645.         my $completeDate = $1;
  1646.         my $st = length($`);
  1647.         my $key = "$st-".($st + length($1));
  1648.         if (($day < 32) && ($day > 0)) {
  1649.             addType ($key, "Month Day Year");
  1650.         }
  1651.         }
  1652.  
  1653.         while ($text =~ /\b($m\b\.? (\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1654.         my $day1 = $2;
  1655.         my $day2 = $4;
  1656.         my $completeDate = $1;
  1657.         my $st = length($`);
  1658.         my $key = "$st-".($st + length($1));
  1659.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
  1660.             addType ($key, "Date range (6)");
  1661.         }
  1662.         }
  1663.  
  1664.         while ($text =~ /\b($m\b\.? (\d{1,2}) ?\-\> ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
  1665.    
  1666.         my $day1 = $2;
  1667.         my $day2 = $3;
  1668.         my $completeDate = $1;
  1669.         my $st = length($`);
  1670.         my $key = "$st-".($st + length($1));
  1671.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
  1672.             addType ($key, "Date range (7)");
  1673.         }
  1674.         }
  1675.                
  1676.         #while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12 2000
  1677.        while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|) ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12th 2000
  1678.         my $day = $2;
  1679.         my $completeDate = $1;
  1680.         my $st = length($`);
  1681.         my $key = "$st-".($st + length($1));
  1682.         if (($day < 32) && ($day > 0)) {
  1683.             addType ($key, "Month Day Year (2)");
  1684.             #addType ($key, "Month Day Year");
  1685.         }
  1686.         }
  1687.        
  1688.        # while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1689.         while ($text =~ /\b($m\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1690.         my $day = $2;
  1691.         my $completeDate = $1;
  1692.         my $st = length($`);
  1693.         my $key = "$st-".($st + length($1));
  1694.  
  1695.         if (($day < 32) && ($day > 0)) {
  1696.             addType ($key, "Month Day");
  1697.         }
  1698.         }
  1699.        
  1700.         while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
  1701.         my $day1 = $2;
  1702.         my $day2 = $4;
  1703.         my $completeDate = $1;
  1704.         my $st = length($`);
  1705.         my $key = "$st-".($st + length($1));
  1706.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1707.             addType ($key, "Date range (8)");
  1708.         }
  1709.         }
  1710.        
  1711.         while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12th
  1712.         my $day1 = $2;
  1713.         my $day2 = $4;
  1714.         my $completeDate = $1;
  1715.         my $st = length($`);
  1716.         my $key = "$st-".($st + length($1));
  1717.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1718.             addType ($key, "Date range (9)");
  1719.         }
  1720.         }
  1721.        
  1722.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr, or Second of April
  1723.         my $day = $2;
  1724.         my $completeDate = $1;
  1725.         my $st = length($`);
  1726.         my $key = "$st-".($st + length($1));
  1727.         if (($day < 32) && ($day > 0)) {
  1728.             addType ($key, "Day Month");
  1729.         }
  1730.         }
  1731.  
  1732.         ###
  1733.       #  while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?\s+(of)?\s+[\-]\b$m\.?,?)\s*(\d{2,4})\b/ig) { # 12-Apr, or Second of April
  1734. while ($text =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b($m)\.?,?)\s+(\d{2,4}))\b/ig) { # 12-Apr, or Second of April
  1735.         my $day = $3;
  1736.         my $month = $6;
  1737.         my $year = $7;
  1738.  
  1739.         my $completeDate = $1;
  1740.         my $st = length($`);
  1741.         my $key = "$st-".($st + length($1));
  1742.         if (($day < 32) && ($day > 0)) {
  1743.             addType ($key, "Day Month Year 2");
  1744.         }
  1745.         }
  1746.            ###
  1747.        
  1748.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
  1749.         my $day1 = $2;
  1750.         my $day2 = $5;
  1751.         my $completeDate = $1;
  1752.         my $st = length($`);
  1753.         my $key = "$st-".($st + length($1));
  1754.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1755.             addType ($key, "Date range (10)");
  1756.         }
  1757.         }
  1758.        
  1759.         while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
  1760.         my $day1 = $2;
  1761.         my $day2 = $5;
  1762.         my $completeDate = $1;
  1763.         my $st = length($`);
  1764.         my $key = "$st-".($st + length($1));
  1765.         if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
  1766.             addType ($key, "Date range (11)");
  1767.         }
  1768.         }
  1769.        
  1770.        
  1771.         while ($text =~ /\b($m\.?,? ?(of )?\d{2}\d{2}?)\b/ig) { # Apr. 2002
  1772.         my $year = $2;
  1773.         my $completeDate = $1;
  1774.         my $st = length($`);
  1775.         my $key = "$st-".($st + length($1));
  1776.         addType ($key, "Month Year");
  1777.         }
  1778.     }
  1779.     }  
  1780. }
  1781. # End of function date()
  1782.  
  1783. #***********************************************************************************************************
  1784. #***********************************************************************************************************
  1785. #***********************************************************************************************************
  1786. # Function: age()
  1787. # Checks for ages that are >=90 (assuming that no one is over 125 years old, just as a sanity check)
  1788. # When ages are spelled-out, assumes that the number will not be over hundred, i.e. highest spelled-out age="hundred"
  1789.  
  1790. sub age {
  1791.     $text = $_[0];  
  1792.     if ($agefilter =~ /y/) {
  1793.    
  1794.     foreach $i (@age_indicators_suff) {
  1795.         if (($text =~ /\b(ninety)[\s\-]+ *$i\b/ig) || ($text =~ /\b(hundred)[\s\-]+ *$i\b/ig)) {
  1796.         my $age = $1;
  1797.         my $st = length($`);
  1798.         my $key = "$st-".((length $age) + $st);
  1799.         addType ($key, "Age over 90");
  1800.         }
  1801.         while ($text =~ /\b(([A-Za-z]+)([\s \-])([A-Za-z]+)) ? *$i\b/ig) {           
  1802.         foreach $j (@digits) {
  1803.             $first = $2;
  1804.             $second = $4;
  1805.             my $age1 = $1;
  1806.             my $st1 = length($`);
  1807.             my $end1 = (length $age1) + $st1;
  1808.             my $key1 = "$st1-$end1";
  1809.             $st2 = $st1+length($2)+length($3);
  1810.             $end2 = $st2+(length($second));
  1811.             my $key2 = "$st2-$end2";
  1812.             if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig))) {
  1813.             addType ($key1, "Age over 90");    
  1814.             }
  1815.             else {
  1816.             if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
  1817.                 if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
  1818.                 addType ($key2, "Age over 90");
  1819.                 }
  1820.             }
  1821.             }
  1822.         }
  1823.         }
  1824.                
  1825.         while ($text =~ /\b(\d+) *$i/ig) {    
  1826.         my $age = $1;
  1827.         my $st = length($`);
  1828.         my $key = "$st-".((length $age) + $st);
  1829.         if (($age >= 90) && ($age <= 125)) {
  1830.             addType ($key, "Age over 90");
  1831.         }
  1832.         }
  1833.     }
  1834.    
  1835.     foreach $i (@age_indicators_pre) {
  1836.         while ($text =~ /\b($i + *)(([A-Za-z]+)([\s \-])([A-Za-z]+))\b/ig) {         
  1837.         foreach $j (@digits) {
  1838.             $first = $3;
  1839.             $second = $5;
  1840.             my $age1 = $2;
  1841.             my $st1 = length($`)+length($1);
  1842.             my $end1 = (length $age1) + $st1;
  1843.             my $key1 = "$st1-$end1";
  1844.             $st2 = $st1;
  1845.             $end2 = $st2+length($first);
  1846.             my $key2 = "$st2-$end2";
  1847.             if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig) || (length($second)))) {
  1848.             addType ($key1, "Age over 90");    
  1849.             }
  1850.             else {
  1851.             if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
  1852.                 if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
  1853.                 addType ($key2, "Age over 90");
  1854.                 }
  1855.             }
  1856.             }
  1857.         }
  1858.         }
  1859.        
  1860.        
  1861.         while ($text =~ /\b($i + *)(\d+)\b/ig) {    
  1862.         my $age = $2;
  1863.         my $st = length($`)+length($1);
  1864.         my $key = "$st-".((length $age) + $st);
  1865.         if (($age >= 90) && ($age <= 125)) {
  1866.             addType ($key, "Age over 90");
  1867.         }
  1868.         }
  1869.     }
  1870.     }
  1871. }
  1872. # End of function age()
  1873. #***********************************************************************************************************
  1874. #***********************************************************************************************************
  1875. #***********************************************************************************************************
  1876. # Function: pager()
  1877. # Checks for pager numbers
  1878.  
  1879. sub pager {
  1880.     $text = $_[0];
  1881.     if ($telfilter =~ /y/) {
  1882.     foreach $i ("Pager", "Pg", "Pgr", "Page", "Beeper", "Beeper number", "Pager number") {  #removed "P"
  1883.  
  1884.         while ($text =~ /\b$i[\s\#\:\-\=]*([a-zA-Z]\s+)*[a-zA-Z]*\s*(\d\d\d+)\b/gi) {
  1885.         my $num = $2;
  1886.         my $end = length($`)+length($&);
  1887.         my $key = ($end - (length $num))."-$end";
  1888.         addType ($key, "Pager number");
  1889.         }
  1890.         while ($text =~ /\b$i[\s\#\:\-\=]*/gi){
  1891.         my $startp = length($`);
  1892.                 my $endp =  length($`)+length($&);
  1893.         #get the next 30 characters
  1894.         my $the_chunck = (substr $text, $endp, 30);
  1895.                 #now look for a 5-digit number
  1896.         while ($the_chunck =~ /(\D)(\d{5})(\D)/gi){
  1897.             my $pager_startp = $endp + length($`) + length($1);
  1898.             my $pager_endp = $pager_startp + length($2);   
  1899.             my $key = "$pager_startp-$pager_endp";
  1900.             addType ($key, "Pager number");
  1901.            
  1902.         } #end while
  1903.         } #end while
  1904.     }
  1905.     }
  1906. }
  1907. # End of function pager()
  1908. #***********************************************************************************************************
  1909. #***********************************************************************************************************
  1910. #***********************************************************************************************************
  1911. # Function: email()
  1912. # Looks for e-mail addresses
  1913.  
  1914. sub email {
  1915.     $text = $_[0];  
  1916.     if ($emailfilter =~ /y/) {
  1917.     while ($text =~ /\b([\w\.]+\w ?@ ?\w+[\.\w+]\.\w{2,3})\b/g) {
  1918.         my $st = length($`);
  1919.         my $key = "$st-".($st+length($&));
  1920.         addType ($key, "E-mail address");
  1921.     }
  1922.     }
  1923. }
  1924. # End of function email()
  1925. #***********************************************************************************************************
  1926. #***********************************************************************************************************
  1927. #***********************************************************************************************************
  1928. # Function: url()
  1929. # Checks for URLs of different types
  1930.  
  1931. sub url {
  1932.     $text = $_[0];  
  1933.     if ($urlfilter =~ /y/) {
  1934.    
  1935.     while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
  1936.         my $st = length($`);
  1937.         my $key = "$st-".($st+length($&));
  1938.         addType ($key, "URL");
  1939.     }
  1940.     while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
  1941.         my $st = length($`);
  1942.         my $key = "$st-".($st+length($&));
  1943.         addType ($key, "URL");
  1944.     }
  1945.     while ($text =~ /\bwww\.[\w\.]+\w{2,4}\/\S+\b/gi) {
  1946.         my $st = length($`);
  1947.         my $key = "$st-".($st+length($&));
  1948.         addType ($key, "URL");
  1949.     }
  1950.     while ($text =~ /\bwww\.[\w\.]+\w{2,4}\b/gi) {
  1951.         my $st = length($`);
  1952.         my $key = "$st-".($st+length($&));
  1953.         addType ($key, "URL");
  1954.     }
  1955.     while ($text =~ /\bweb\.[\w\.]+\w{2,4}\/\S+\b/gi) {
  1956.         my $st = length($`);
  1957.         my $key = "$st-".($st+length($&));
  1958.         addType ($key, "URL");
  1959.     }
  1960.     while ($text =~ /\bweb\.[\w\.]+\w{2,4}\b/gi) {
  1961.         my $st = length($`);
  1962.         my $key = "$st-".($st+length($&));
  1963.         addType ($key, "URL");
  1964.     }
  1965.  
  1966.     while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\b/g) {
  1967.         my $st = length($`);
  1968.         my $key = "$st-".($st+length($&));
  1969.         addType ($key, "URL");
  1970.     }
  1971.     while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\b/g) {
  1972.         my $st = length($`);
  1973.         my $key = "$st-".($st+length($&));
  1974.         addType ($key, "URL");
  1975.     }
  1976.     }
  1977. }
  1978. # End of function url()
  1979. #***********************************************************************************************************
  1980. #***********************************************************************************************************
  1981. #***********************************************************************************************************
  1982. # Function: ssn()
  1983. # Checks for social security numbers (SSNs)
  1984.  
  1985. sub ssn {
  1986.     $text = $_[0];  
  1987.      
  1988.     if ($ssnfilter =~ /y/) {
  1989.     while ($text =~ /\b\d\d\d([- \/]?)\d\d\1\d\d\d\d\b/g) {
  1990.         my $st = length($`);
  1991.         my $key = "$st-".($st+length($&));
  1992.         addType ($key, "Social Security Number");
  1993.     }
  1994.     }
  1995. }
  1996. # End of function ssn()
  1997. #***********************************************************************************************************
  1998. #***********************************************************************************************************
  1999. #***********************************************************************************************************
  2000. # Function: name1()
  2001. # Uses name prefixes to make last names
  2002.  
  2003. sub name1 {
  2004.     $text = $_[0];  
  2005.     if ($namefilter =~ /y/) {
  2006.  
  2007.     my @keylst = sort numerically keys %phi;
  2008.     my $key;
  2009.     for($i=0;$i<$#keylst;$i++){
  2010.         $key = $keylst[$i];
  2011.         if (isType($key, "Last Prefix", 0)){
  2012.         ($f1,$t1) = split ('-',$key);
  2013.         ($f2,$t2) = split ('-',$keylst[$i+1]);
  2014.         if ($f2 < $t1+3){
  2015.             if (isType($keylst[$i+1], "Last Name", 1)){
  2016.             print "Found Last Prefix match, Adding $keylst[$i+1] type for last name!!";
  2017.             addType("$f1-$t2","Last Name");
  2018.             }
  2019.         }
  2020.         }
  2021.     }
  2022.     #####################################################  
  2023.     # Uses common-sense heuristics to try to find more names
  2024.     foreach $i (@name_indicators){
  2025.      
  2026.         #while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)(\s+)(and )?( *)([A-Za-z]+)\b/ig) {
  2027.         while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)\b/ig) {
  2028.  
  2029.         $potential_name = $6;
  2030.    
  2031.         $start = length($`)+length($1) + length ($2) + length($3) + length($4) + length($5);
  2032.         $end = $start + length($potential_name);
  2033.         $key = "$start-$end";
  2034.         my $tmpstr = substr $text, $start, $end-$start;
  2035.        
  2036.         my $tmp = isType($key, "Name",1);
  2037.          
  2038.  
  2039.         if (isProbablyName($key, $potential_name)){    
  2040.             addType ($key, "Name (NI)");
  2041.         } # end if the first word after the name indicator is a name
  2042.  
  2043.          my $new_start = $end + length($6) + length($7);
  2044.          my $new_end = $new_start + length($8);
  2045.  
  2046.  
  2047.          #########now check the next word
  2048.          my $rest = substr $text, $end+1, 20;
  2049.         if (($rest =~ /\b(and )?( *)([A-Za-z]+)\b/ig)){
  2050.             my $new_start = $end + 1 + length($`) + length($1)+length($2);
  2051.             my $new_end = $new_start + length ($3);
  2052.  
  2053.             my $keyAfter = "$new_start-$new_end";
  2054.             my $wordAfter = substr $rest,   (length ($`)+ length($1) + length($2)) , length ($3);
  2055.          
  2056.  
  2057.             if ( !isNameIndicator($wordAfter) && ( (  !isCommon($wordAfter) ||
  2058.             ((isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)"))  ||
  2059.             (isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ||
  2060.             (!isCommonest($wordAfter) && isType($keyAfter, "Name", 1)) ||
  2061.             (isType($keyAfter, "popular",1)) ) )  ) ) {
  2062.    
  2063.             if ($rest =! /\b[\d]\b/ig) { #disregard if the rest contains numeric patterns
  2064.                 addType ($keyAfter, "Name2 (NI)");
  2065.             }
  2066.             }
  2067.  
  2068.             elsif ($1 =~ /and/ig){
  2069.             if (! (isCommon($wordAfter) || isNameIndicator($wordAfter))) {
  2070.                 addType ($keyAfter, "Name2 (NI)");
  2071.             }
  2072.             }
  2073.         } #end if rest
  2074.          
  2075.        
  2076.         } # end of while
  2077.  
  2078.     } #end for each name indicator
  2079.  
  2080.     #**********************************************************************************************************
  2081.     # Searches for the name pattern LASTNAME, FIRSTNAME
  2082.     # First checks if word2 is a firstname
  2083.     # If it is, then if word1 is not a common or commonest word, identifies word1 as a lastname
  2084.     while ($text =~ /\b([A-Za-z]+)( ?\, ?)([A-Za-z]+)\b/ig) {
  2085.         $input1 = $1;
  2086.         $input2 = $3;
  2087.         my $st1 = length($`);
  2088.         my $end1 = $st1 + length($input1);
  2089.         my $key = $key1;
  2090.         my $key1 = "$st1-$end1";
  2091.         my $st2 = $end1+length($2);
  2092.         my $end2 = $st2 + length($input2);
  2093.         my $key = $key2;
  2094.         my $key2 = "$st2-$end2";
  2095.        
  2096.         if ((isType($key2, "Name", 1)) && (isType($key1, "Name (ambig)", 1)) && (!isNameIndicator($input1)) ) {
  2097.         addType ($key1, "Last Name (LF)");
  2098.         addType ($key2, "First Name1 (LF)");
  2099.         }
  2100.        
  2101.         if ((isType($key1, "Name", 1)) && (isType($key2, "Name (ambig)", 1)) && (!isNameIndicator($input1))  ) {
  2102.         addType ($key2, "Last Name (LF)");
  2103.         addType ($key1, "First Name2 (LF)");
  2104.         }
  2105.        
  2106.         if (isType($key2, "First Name", 1)) {
  2107.         if (   (isType($key1, "Last Name", 1) && (!isCommonest($input1)) &&  (!isNameIndicator($input1))) ||
  2108.                ((!isCommon($input1)) && (!isCommonest($input1)))   ) {
  2109.             addType ($key1, "Last Name (LF)");
  2110.             addType ($key2, "First Name3 (LF)");
  2111.         }
  2112.         }  
  2113.     }
  2114.     }
  2115. }
  2116. # End of function name1()
  2117. #***********************************************************************************************************
  2118. #***********************************************************************************************************
  2119. #***********************************************************************************************************
  2120. # Function: seasonYear()
  2121. # Checks for both season and year from patterns like "<season> of? <year>"
  2122.  
  2123. sub seasonYear {
  2124.     $text = $_[0];  
  2125.    
  2126.     @seasons = ("winter", "spring", "summer", "autumn", "fall");
  2127.    
  2128.     foreach $i (@seasons) {
  2129.         while ($text =~ /\b(($i)(( +)of( +))? ?\,?( ?)\'?(\d{2}|\d{4}))\b/gi) {
  2130.         $f2=$2;
  2131.         $f3=$3;
  2132.         $f6=$6;
  2133.         $f7=$7;
  2134.         if (length($f7)==4) {
  2135.             if (($f7<=$longyear) && ($f7>1900)) {  
  2136.             my $st1 = length($`);
  2137.             my $end1 = (length $f2) + $st1;
  2138.             my $key1 = "$st1-$end1";
  2139.            
  2140.             my $st2 = $end1+(length $f3)+(length $f6);
  2141.             my $key2 = "$st2-".((length $f7) + $st2);
  2142.             addType ($key2, "Year (4 digits)");    
  2143.             }
  2144.         }
  2145.         else {                   
  2146.             my $st1 = length($`);
  2147.             my $end1 = (length $f2) + $st1;
  2148.             my $key1 = "$st1-$end1";
  2149.            
  2150.             my $st2 = $end1+(length $f3)+(length $f6);
  2151.             my $key2 = "$st2-".((length $f7) + $st2);
  2152.             addType ($key2, "Year (4 digits)");                    
  2153.         }
  2154.         }
  2155.     }
  2156.    
  2157. }
  2158. # End of function seasonYear()
  2159. #***********************************************************************************************************
  2160. #***********************************************************************************************************
  2161. #***********************************************************************************************************
  2162. # Function: name2()
  2163. # Checks for more than 1 name following plural drs, drs., doctors, professors,
  2164. # Checks up to 3 names
  2165.  
  2166. sub name2 {
  2167.     $text = $_[0];  
  2168.     if ($namefilter =~ /y/) {
  2169.  
  2170.     @plural_titles = ("doctors", "drs", "drs\.", "professors");
  2171.    
  2172.     foreach $p (@plural_titles) {
  2173.     while ($text =~ /\b((($p +)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+)?\b/gi) {  
  2174.         my %names = ();
  2175.         $st3 = length($`);
  2176.         $st4 = $st3+length($3);
  2177.         $end4 = $st4+length($4);
  2178.         $key4 = "$st4-$end4";
  2179.         $st6 = $st3+length($2);
  2180.         $end6 = $st6+length($6);
  2181.         $key6 = "$st6-$end6";
  2182.         $st8 = $st3+length($1);
  2183.         $end8 = $st8+length($8);
  2184.         $key8= "$st8-$end8";
  2185.         $names{$4} = $key4;
  2186.         $names{$6} = $key6;
  2187.         $names{$8} = $key8;
  2188.         foreach $i (keys %names) {
  2189.         $val = $names{$i};
  2190.         if (length($i)>0) {
  2191.             if (!(isCommonest($i)) || (isType($val, "Name", 1))) {
  2192.             #print "addtype, val is $val\n";
  2193.             addType ($val, "Name5 (PTitle)");
  2194.             }
  2195.         }
  2196.         }
  2197.     }
  2198.     }
  2199.  
  2200.  
  2201.  
  2202.  
  2203. #****************************************************************************************
  2204. # Checks for names followed by "MD" or "M.D"
  2205. # Checks up to 3 previous words
  2206.     while ($text =~ /\b((([A-Za-z\']+) +)?(([A-Za-z])\. +)?([A-Za-z\-\']+)((\, *)|(\s+))(rrt|md|m\.d\.|crt|np|rn|nnp|msw|r\.n\.)(\.|\,)*\b)/ig) {
  2207.    
  2208.     $name = $1;
  2209.     $start = length($`);
  2210.     $end = $start + length($name);
  2211.     $key = "$start-$end";
  2212.    
  2213.     $name1 = $3; #if present, would be first name
  2214.     $name2 = $4;  #if present, would be initial
  2215.     $name3 = $6;  #if present would be last name
  2216.  
  2217.     $st1 = length($`);
  2218.     $end1 = $st1 + (length $name1);
  2219.     $key1 = "$st1-$end1";
  2220.  
  2221.     $st2 = $st1 + length($2);
  2222.     $end2 = $st2 + (length $name2);
  2223.     $key2 = "$st2-$end2";  
  2224.  
  2225.     $st3 = length($`) + (length $2) + (length $4);
  2226.     $end3 = $st3 + (length $name3);
  2227.     $key3 = "$st3-$end3";
  2228.    
  2229.     if ((!($text =~ /(m\.?d\.?\')/)) && (!($text =~ /(m\.?d\.?s)/))) {
  2230.    
  2231.         if (length($name1)>0) {
  2232.             if ((length($name1)==1) || ((length($name1)==2) && ($name1 =~ /\b([A-Za-z])\.+\b/ig))) {  
  2233.             addType($key1, "Name Initial (MD)");
  2234.         }  
  2235.         else {
  2236.             if (isProbablyName($key1, $name1)) {
  2237.             addType($key1,"Name6 (MD)" );
  2238.             }
  2239.         }
  2240.         }
  2241.  
  2242.         if (length($name2)>0 && length($name3)>0) {
  2243.         if ((length($name2)==1) || ((length($name2)==2) && ($name2 =~ /\b([A-Za-z])\.+\b/ig))) {    
  2244.              #addType($key2, "Name Initial (MD)");
  2245.         }  
  2246.         else {
  2247.             #if (!(isCommon($name2) && !(isType($key2, "Name", 1)))) {
  2248.             if (isProbablyName($key2, $name2)) {
  2249.             addType($key2,"Name7 (MD)" );
  2250.             }
  2251.         }
  2252.         }
  2253.  
  2254.         if (length($name3)>0) {
  2255.         if ((length($name3)==1) || ((length($name3)==2) && ($name3 =~ /\b([A-Za-z])\.\b/ig))) {  
  2256.             #addType($key3, "Name Initial (MD)");
  2257.         }  
  2258.         else {
  2259.             if (isProbablyName($key3, $name3)) {   
  2260.            
  2261.             addType($key3,"Name8 (MD)" );
  2262.  
  2263.             }
  2264.         } # end else
  2265.         } #endif
  2266.     } #end if text does not have M.D.' or M.D.s
  2267.     }
  2268.  
  2269.  
  2270. #****************************************************************************************
  2271. # Removes PCP name field, leaving "PCP name:" intact, from discharge summaries
  2272. # Does not check for name patterns, since these should be caught by the other methods
  2273. # Required mainly for unknown names, checks up to 3 words following "PCP Name"
  2274. # Follows the pattern seen in discharge summaries, may not work well in nursing notes
  2275.  
  2276.     #@name_pre = ("PCP", "physician", "provider", "created by", "name");
  2277.     @name_pre = ("PCP", "physician", "provider", "created by");
  2278.    
  2279.     foreach $l (@name_pre) {
  2280.     while ($text =~/\b(($l( +name)?( +is)?\s\s*)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)(((\s*\,*\s*)? *)([A-Za-z\-]+))?)\b/ig) {  
  2281.         my $key1 = $5;
  2282.         my $st1 = length($`)+(length $2);
  2283.         my $end1 = $st1+(length $5);
  2284.         my $keyloc1 = "$st1-$end1";
  2285.         my $key2 = $8;
  2286.         my $st2 = $end1+(length $6);
  2287.         my $end2 = $st2+(length $8);
  2288.         my $keyloc2 = "$st2-$end2";
  2289.         my $key3 = $12;
  2290.         my $st3 = $end2+(length $10);
  2291.         my $end3 = $st3+(length $12);
  2292.         my $keyloc3 = "$st3-$end3";    
  2293.         my %pcp = ();
  2294.         $pcp{$keyloc1} = $key1;
  2295.         $pcp{$keyloc2} = $key2;
  2296.         $pcp{$keyloc3} = $key3;
  2297.        
  2298.         foreach my $keyloc (keys %pcp ) {
  2299.         my $val = $pcp{$keyloc};
  2300.         if (length($val)>0) {
  2301.             if ((length($val)==1) || ($val =~ /\b([A-Za-z])\.\b/ig)) {    
  2302.             addType($keyloc, "Name Initial (PRE)");
  2303.             }  
  2304.             else {
  2305.             #if (!(isCommonest($val) && !(isType($keyloc, "Name", 1)))) {
  2306.             if (isProbablyName($keyloc, $val)){
  2307.            
  2308.                 addType($keyloc,"Name9 (PRE)" );
  2309.             }
  2310.             }
  2311.         }
  2312.         }
  2313.     }
  2314.        
  2315.         #followed by pattern "name is"
  2316.     while ($text =~ /\b(($l( +name)?( +is)? ?([\#\:\-\=\.\,])+ *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)?)\b/ig) {
  2317.         my $key1 = $6;
  2318.         my $st1 = length($`)+(length $2);
  2319.         my $end1 = $st1+(length $6);
  2320.         my $keyloc1 = "$st1-$end1";
  2321.         my $key2 = $9;
  2322.         my $st2 = $end1+(length $7);
  2323.         my $end2 = $st2+(length $9);
  2324.         my $keyloc2 = "$st2-$end2";
  2325.         my $key3 = $12;
  2326.         my $st3 = $end2+(length $10);
  2327.         my $end3 = $st3+(length $12);
  2328.         my $keyloc3 = "$st3-$end3";
  2329.         my %pcp = ();
  2330.         my $firstfound = 0;
  2331.         my $secondfound = 0;
  2332.         $pcp{$keyloc1} = $key1;
  2333.         $pcp{$keyloc2} = $key2;
  2334.         $pcp{$keyloc3} = $key3;
  2335.         $blah = isCommonest($key3);
  2336.         $blah2 = isType($keyloc3, "Name", 1);
  2337.        
  2338.         if (length($key1)>0) {
  2339.         if ((length($key1)==1) || ($key1 =~ /\b([A-Za-z])\.\b/ig)) {
  2340.             addType($keyloc1, "Name Initial (NameIs)");
  2341.             $firstfound = 1;
  2342.         }
  2343.         else {
  2344.             if (isProbablyName($keyloc1, key1)){
  2345.             addType($keyloc1,"Name10 (NameIs)" );
  2346.             $firstfound = 1;
  2347.             }
  2348.         }
  2349.         }
  2350.         if ($firstfound == 1) {
  2351.         if (length($key2)>0) {
  2352.             if ((length($key2)==1) || ($key2 =~ /\b([A-Za-z])\.\b/ig)) {
  2353.             addType($keyloc2, "Name Initial (NameIs)");
  2354.             $secondfound = 1;
  2355.             }
  2356.             else {
  2357.             if (isProbablyName($keyloc2, $key2)){
  2358.                 addType($keyloc2,"Name11 (NameIs)" );
  2359.                 $secondfound = 1;
  2360.             }
  2361.             }
  2362.         }
  2363.         }
  2364.         if ($secondfound == 1) {
  2365.         if (length($key3)>0) {
  2366.             if ((length($key3)==1) || ($key3 =~ /\b([A-Za-z])\.\b/ig)) {
  2367.             addType($keyloc3, "Name Initial (NameIs)");
  2368.             }
  2369.             else {
  2370.             if (isProbablyName ($keyloc3, $key3)){
  2371.                 addType($keyloc3,"Name12 (NameIs)" );
  2372.             }
  2373.             }
  2374.         }
  2375.         }
  2376.     }
  2377.     }
  2378. }
  2379. }
  2380. # End of function name2()
  2381. #***********************************************************************************************************
  2382. #***********************************************************************************************************
  2383. #***********************************************************************************************************
  2384. # Function: providerNumber()
  2385. # Removes "provider number", whole field, from discharge summaries
  2386. # Does not remove the field if there is no number following it
  2387.  
  2388. sub providerNumber {
  2389.     $text = $_[0];      
  2390.     if ($unitfilter =~ /y/) {
  2391.    
  2392.     while ($text =~ /\b(provider(( +)number)?( ?)[\#\:\-\=\s\.]?( ?)(\d+)([\/\-\:](\d+))?)\b/gi) {  
  2393.         my $unit = $1;
  2394.         my $st = length($`);
  2395.         my $key = "$st-".((length $unit) + $st);
  2396.         addType ($key, "Provider Number");
  2397.     }
  2398.     }
  2399. }
  2400. # End of function providerNumber()
  2401. #***********************************************************************************************************
  2402. #***********************************************************************************************************
  2403. #***********************************************************************************************************
  2404. # Function: signatureField()
  2405. # Removes signature fields from discharge summaries
  2406. # signature field taken as 3 or more underscores
  2407. # does not remove doctor names, since these are handled by other name handlers
  2408.    
  2409. sub signatureField {
  2410.     $text = $_[0];  
  2411.  
  2412.     while ($text =~ /\b(\_\_+)\b/gi) {
  2413.    
  2414.     my $sigfield = $1;
  2415.     my $st = length($`);
  2416.     my $key = "$st-".((length $sigfield) + $st);
  2417.     addType ($key, "Signature");
  2418.     }
  2419. }
  2420. # End of function signatureField()
  2421. #***********************************************************************************************************
  2422. #***********************************************************************************************************
  2423. #***********************************************************************************************************
  2424. # Function: location1()
  2425. # Removes complete street addresses only when including one of the terms in @street_add_suff
  2426.  
  2427. sub location1 {
  2428.     $text = $_[0];    
  2429.  
  2430.     if ($locfilter =~ /y/) {
  2431.     #check if ambiguous locations are preceded by any of the location indicators,
  2432.         #if so, add in PHI list
  2433.     foreach $i (@location_indicators){
  2434.      
  2435.         while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
  2436.         #print "Location Match 1 is $1, 2 is $2, 3 is $3\n";
  2437.         my $st = length ($`) +  length ($1) + length ($2);
  2438.                 my $end = $st + length ($3);
  2439.         my $key = "$st-$end";
  2440.         my $word = substr $text, $st, ($end-$st);
  2441.         #print "word is $word\n";
  2442.            
  2443.         if (isType($key, "Location",1)  || (length ($word) > 1 && !(isCommon($word))) ) {  
  2444.            
  2445.             addType ($key, "Location");
  2446.         }
  2447.         }
  2448.     } #end for each i in location indicators
  2449.  
  2450.  
  2451.  
  2452.     #if the company dictionary is loaded, check if any of the ambiguous company names
  2453.     #are preceded by the employment indicators
  2454.     if ($company_list =~/y/){
  2455.         #print "company list is on\n";
  2456.         foreach $i (@employment_indicators_pre){
  2457.         while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
  2458.            
  2459.             $st = length ($`) +  length ($1) + length ($2);
  2460.             $end = $st + length ($3);
  2461.             $key = "$st-$end";
  2462.             my $word = substr $text, $st, ($end-$st);
  2463.            
  2464.             my $tmp = isCommon($word);
  2465.            
  2466.  
  2467.             if (isType($key, "Company",1)|| (length ($word) > 1 && !(isCommon($word)))  ) {
  2468.             #print "adding  $3 key $key as company unambig";
  2469.             addType ($key, "Company");
  2470.             }  
  2471.         }
  2472.         } #end for each i in location indicators
  2473.     }
  2474.  
  2475.  
  2476.  
  2477.     #strict address suffix, case-sensitive match, PHI regardless of ambiguity
  2478.     foreach $i (@strict_street_add_suff) { 
  2479.         #make it a case-sensitive match for street address suffix
  2480.         while ($text =~ /\b(([0-9]+ +)?(([A-Za-z\.\']+) +)?([A-Za-z\.\']+) +\b$i\.?\b)\b/g) {
  2481.  
  2482.         $st = length($`);
  2483.         $end = $st + length($1);
  2484.  
  2485.         #check next segment for apartment, suite, floor #s
  2486.         my $nextSeg = substr $text, $end, 30;
  2487.         #print "check nextSeg for apt and sutie #, seg is  $nextSeg\n";
  2488.         foreach $k (@apt_indicators){
  2489.             if ($nextSeg =~ /\b($k\.?\#? +[\w]+)\b/gi) {
  2490.             $end += length ($`) + length($1);
  2491.             }
  2492.         }
  2493.         $key = "$st-$end";
  2494.         #addType ($key, "Street Address");
  2495.  
  2496.         if (length($3) == 0) {
  2497.             if (!isUnambigCommon($5)) {
  2498.             addType ($key, "Street Address");
  2499.             }
  2500.         }
  2501.         elsif (!((isUnambigCommon($4)) && (isUnambigCommon($5)))) {
  2502.             addType($key, "Street Address");
  2503.         }
  2504.         } # end while
  2505.     } #end foreach
  2506.     }#end if
  2507.  
  2508.     #Non-strict address suffix, case-insensitive match, PHI if no ambiguity
  2509.     if ($locfilter =~ /y/) {
  2510.    
  2511.     foreach $i (@street_add_suff) {
  2512.  
  2513.         while ($text =~ /\b(([0-9]+) +(([A-Za-z]+) +)?([A-Za-z]+) +$i)\b/gi) {
  2514.         $st = length($`);
  2515.         $end = $st + length($1);
  2516.         $key = "$st-$end";
  2517.  
  2518.         if (length($3) == 0) {
  2519.         if (!isUnambigCommon($word)){
  2520.             addType ($key, "Street Address");
  2521.         }
  2522.         }
  2523.         elsif ( ! (isUnambigCommon($4) || isUnambigCommon($5))){
  2524.         addType($key, "Street Address");
  2525.         }
  2526.     }
  2527.     }
  2528. }
  2529. #****************************************************************************************
  2530.     # Removes 2-word location PHI ending with @loc_indicators_suff or preceded by @loc_indicators_pre
  2531.  
  2532.     # Words potentially PHI
  2533.     if ($locfilter =~ /y/) {
  2534.    
  2535.     foreach $i (@loc_indicators_suff) {
  2536.         while ($text =~ /\b(([A-Za-z\-]+)? +)?(([A-Za-z\-]+) + *$i +)\b/ig) {
  2537.         if (!isCommon($4)) {       
  2538.             $st2 = length($`)+length($1);
  2539.             $end2 = $st2 + length($3);
  2540.             $key2 = "$st2-$end2";
  2541.             addType ($key2, "Location");
  2542.            
  2543.             if (length $2>0) {
  2544.             if (!isCommon($2)) {
  2545.                 $st1 = length($`);
  2546.                 $end1 = $st1 + length($2);
  2547.                 $key1 = "$st1-$end1";
  2548.                 addType ($key1, "Location");}
  2549.             }
  2550.            
  2551.         }
  2552.         }  
  2553.     }
  2554.     }
  2555.     if ($locfilter =~ /y/) {
  2556.    
  2557.     # Words most likely PHI
  2558.     foreach $i (@loc_ind_suff_c) {
  2559.        
  2560.         while ($text =~ /\b(([A-Za-z]+ +)?)(([A-Za-z]+)$i+)\b/ig) {
  2561.         if (!isCommon($3)) {       
  2562.             $st2 = length($`)+length($1);
  2563.             $end2 = $st2 + length($3);
  2564.             $key2 = "$st2-$end2";
  2565.             addType ($key2, "Location");
  2566.         }
  2567.         }
  2568.     }
  2569.     }
  2570.    
  2571.     if ($locfilter =~ /y/) {
  2572.    
  2573.     # Words potentially PHI
  2574.     foreach $i (@loc_indicators_pre) {
  2575.         while ($text =~ /\b((($i + *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
  2576.         if (!isCommon($4)) {       
  2577.             $st2 = length($`);
  2578.             $end2 = $st2 + length($3);
  2579.             $key2 = "$st2-$end2";
  2580.             addType ($key2, "Location");
  2581.            
  2582.             if (length $5>0) {
  2583.             if (!isCommon($5)) {
  2584.                 $st1 = length($`)+length($2);
  2585.                 $end1 = $st1 + length($5);
  2586.                 $key1 = "$st1-$end1";
  2587.                 addType ($key1, "Location");
  2588.             }
  2589.             }              
  2590.         }
  2591.         }
  2592.     }
  2593.     } # end if locfilter =~ /y/
  2594.  
  2595.  
  2596.     @universities_pre = ("University", "U", "Univ", "Univ.");
  2597.  
  2598.     #catches "University of", "U of", "Univ of", "Univ. of"
  2599.     if ($locfilter =~ /y/) {
  2600.    
  2601.     # Words potentially PHI
  2602.     foreach $i (@universities_pre) {
  2603.         while ($text =~ /\b((($i +of *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
  2604.         my $tmp = isUSStateAbbre($4);
  2605.  
  2606.         if (isUSStateAbbre($4) || isUSState($4) ||  !isCommon($4) ) {      
  2607.             $st2 = length($`);
  2608.             $end2 = $st2 + length($3);
  2609.             $key2 = "$st2-$end2";
  2610.             addType ($key2, "Location");
  2611.            
  2612.             if (length $5>0) {
  2613.             if (!isCommon($5)) {
  2614.                 $st1 = length($`)+length($2);
  2615.                 $end1 = $st1 + length($5);
  2616.                 $key1 = "$st1-$end1";
  2617.                 addType ($key1, "Location (Universities)");
  2618.             }
  2619.             }              
  2620.         }
  2621.         }
  2622.     }
  2623.     } # end if locfilter =~ /y/
  2624.  
  2625.    
  2626.  
  2627. }
  2628. # End of function location1()
  2629.  
  2630. #***********************************************************************************************************
  2631. #***********************************************************************************************************
  2632. #***********************************************************************************************************
  2633. # Function: location2()
  2634. # Searches for multiple hospital and location terms
  2635.  
  2636. sub location2 {
  2637.     $text = $_[0];
  2638.     if ($locfilter =~ /y/) {
  2639.    
  2640.     foreach $hos (@hospital) {
  2641.        
  2642.         my @hospital_terms = split " ", $hos;
  2643.         $len = 0;
  2644.         foreach $h (@hospital_terms) {
  2645.         if (length($h) != 0) {
  2646.             $len = $len+1;
  2647.             $hos[$len] = $h;
  2648.         }
  2649.         }
  2650.        
  2651.         if ($len == 1) {
  2652.         while ($text =~ /\b($hos[1])\b/ig) {
  2653.             $hospital = $1;
  2654.             $st = length($`);
  2655.             $end = $st + length($hospital);
  2656.             $key = "$st-$end";
  2657.             addType($key, "Hospital1");    
  2658.         }
  2659.         }
  2660.        
  2661.         if ($len == 2) {
  2662.         while ($text =~ /\b($hos[1])( )($hos[2])\b/ig) {
  2663.            
  2664.             $hos1 = $1;
  2665.             $hos2 = $3;
  2666.             $space = $2;
  2667.             $st1 = length($`);
  2668.             $end1 = $st1 + length($hos1);
  2669.             $key1 = "$st1-$end1";
  2670.             addType($key1, "Hospital2");
  2671.             $st2 = $end1 + length($space);
  2672.             $end2 = $st2 + length($hos2);
  2673.             $key2 = "$st2-$end2";
  2674.             addType($key2, "Hospital3");       
  2675.         }
  2676.         }
  2677.        
  2678.         if ($len == 3) {
  2679.         while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])\b/ig) {
  2680.             $hos1 = $1;
  2681.             $hos2 = $3;
  2682.             $hos3 = $5;
  2683.             $st1 = length($`);
  2684.             $end1 = $st1 + length($hos1);
  2685.             $key1 = "$st1-$end1";
  2686.             addType($key1, "Hospital4");
  2687.             $st2 = $end1 + length($2);
  2688.             $end2 = $st2 + length($hos2);
  2689.             $key2 = "$st2-$end2";
  2690.             addType($key2, "Hospital5");
  2691.             $st3 = $end2 + length($4);
  2692.             $end3 = $st3 + length($hos3);
  2693.             $key3 = "$st3-$end3";
  2694.             addType($key3, "Hospital6");
  2695.         }
  2696.         }
  2697.                        
  2698.         if ($len == 4) {
  2699.         while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])( )($hos[4])\b/ig) {
  2700.             $hos1 = $1;
  2701.             $hos2 = $3;
  2702.             $hos3 = $5;    
  2703.             $st1 = length($`);
  2704.             $end1 = $st1 + length($hos1);
  2705.             $key1 = "$st1-$end1";
  2706.             addType($key1, "Hospital");
  2707.             $st2 = $end1 + length($2);
  2708.             $end2 = $st2 + length($hos2);
  2709.             $key2 = "$st2-$end2";
  2710.             addType($key2, "Hospital");
  2711.             $st3 = $end2 + length($4);
  2712.             $end3 = $st3 + length($hos3);
  2713.             $key3 = "$st3-$end3";
  2714.             addType($key3, "Hospital");
  2715.             $st4 = $end3 + length($6);
  2716.             $end4 = $st4 + length($hos4);
  2717.             $key4 = "$st4-$end4";
  2718.             addType($key4, "Hospital");
  2719.         }
  2720.         }          
  2721.     }
  2722.        
  2723.     foreach $loc (@loc_unambig) {
  2724.        
  2725.         my @loc_terms = split " ", $loc;
  2726.         $len = 0;
  2727.         foreach $h (@loc_terms) {
  2728.         if (length($h) != 0) {
  2729.             $len = $len+1;
  2730.             $loc[$len] = $h;
  2731.         }
  2732.         }
  2733.        
  2734.         if ($len == 1) {
  2735.         while ($text =~ /\b($loc[1])\b/ig) {
  2736.             $location = $1;
  2737.             $st = length($`);
  2738.             $end = $st + length($location);
  2739.             $key = "$st-$end";
  2740.             addType($key, "Location");     
  2741.         }
  2742.         }
  2743.        
  2744.         if ($len == 2) {
  2745.         while ($text =~ /\b($loc[1])( )($loc[2])\b/ig) {
  2746.             $loc1 = $1;
  2747.             $loc2 = $3;
  2748.             $st1 = length($`);
  2749.             $end1 = $st1 + length($loc1);
  2750.             $key1 = "$st1-$end1";
  2751.             addType($key1, "Location");
  2752.             $st2 = $end1 + length($2);
  2753.             $end2 = $st2 + length($loc2);
  2754.             $key2 = "$st2-$end2";
  2755.             addType($key2, "Location");    
  2756.         }
  2757.         }
  2758.        
  2759.         if ($len == 3) {
  2760.         while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])\b/ig) {
  2761.             $loc1 = $1;
  2762.             $loc2 = $3;
  2763.             $loc3 = $5;
  2764.             $st1 = length($`);
  2765.             $end1 = $st1 + length($loc1);
  2766.             $key1 = "$st1-$end1";
  2767.             addType($key1, "Location");
  2768.             $st2 = $end1 + length($2);
  2769.             $end2 = $st2 + length($loc2);
  2770.             $key2 = "$st2-$end2";
  2771.             addType($key2, "Location");
  2772.             $st3 = $end2 + length($4);
  2773.             $end3 = $st3 + length($loc3);
  2774.             $key3 = "$st3-$end3";
  2775.             addType($key3, "Location");
  2776.         }
  2777.         }
  2778.                    
  2779.         if ($len == 4) {
  2780.         while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])( )($loc[4])\b/ig) {
  2781.             $loc1 = $1;
  2782.             $loc2 = $3;
  2783.             $loc3 = $5;    
  2784.             $st1 = length($`);
  2785.             $end1 = $st1 + length($loc1);
  2786.             $key1 = "$st1-$end1";
  2787.             addType($key1, "Location");
  2788.             $st2 = $end1 + length($2);
  2789.             $end2 = $st2 + length($loc2);
  2790.             $key2 = "$st2-$end2";
  2791.             addType($key2, "Location");
  2792.             $st3 = $end2 + length($4);
  2793.             $end3 = $st3 + length($loc3);
  2794.             $key3 = "$st3-$end3";
  2795.             addType($key3, "Location");
  2796.             $st4 = $end3 + length($6);
  2797.             $end4 = $st4 + length($loc4);
  2798.             $key4 = "$st4-$end4";
  2799.             addType($key4, "Location");
  2800.         }
  2801.         }
  2802.     }
  2803.  
  2804.         #######
  2805.         #PO Box number
  2806.    
  2807.         while ($text =~ /\b(P[\.]?O[\.]? *Box *[\#]? *[0-9]+)\b/gi) {
  2808.           $location = $1;
  2809.           $st = length($`);
  2810.           $end = $st + length($location);
  2811.           $key = "$st-$end";
  2812.           addType($key, "PO Box");   
  2813.     }
  2814.  
  2815.    
  2816.  
  2817.  
  2818.  
  2819.         ######
  2820.         #Zipcodes
  2821.     foreach $loc (@us_states_abbre) {
  2822.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2823.             $location = $1;
  2824.             $st = length($`);
  2825.             $end = $st + length($location);
  2826.             $key = "$st-$end";
  2827.             addType($key, "State/Zipcode");  
  2828.         }
  2829.  
  2830.     }
  2831.  
  2832.         #Zipcodes with more US states abbreviations
  2833.     foreach $loc (@more_us_states_abbre) {
  2834.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2835.             $location = $1;
  2836.             $st = length($`);
  2837.             $end = $st + length($location);
  2838.             $key = "$st-$end";
  2839.             addType($key, "State/Zipcode");  
  2840.         }
  2841.  
  2842.     }      
  2843.     #Zipcodes with full US state names
  2844.     foreach $loc (@us_states) {
  2845.             while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
  2846.             $location = $1;
  2847.             $st = length($`);
  2848.             $end = $st + length($location);
  2849.             $key = "$st-$end";
  2850.             addType($key, "State/Zipcode");  
  2851.         }
  2852.  
  2853.     }
  2854.         ##########
  2855.         #remove US states if filter flag for State is on
  2856.  
  2857.     if ($us_state_filter =~ /y/) {
  2858.  
  2859.       foreach $loc (@us_states) {
  2860.        
  2861.         my @loc_terms = split " ", $loc;
  2862.         $len = 0;
  2863.         foreach $h (@loc_terms) {
  2864.         if (length($h) != 0) {
  2865.             $len = $len+1;
  2866.             $loc[$len] = $h;
  2867.         }
  2868.         }
  2869.        
  2870.         if ($len == 1) {
  2871.         while ($text =~ /\b($loc[1])\b/ig) {
  2872.             $location = $1;
  2873.             $st = length($`);
  2874.             $end = $st + length($location);
  2875.             $key = "$st-$end";
  2876.             addType($key, "State");    
  2877.         }
  2878.         }
  2879.        
  2880.         if ($len == 2) {
  2881.         while ($text =~ /\b(($loc[1])( )($loc[2]))\b/ig) {
  2882.             $location = $1;
  2883.             $st = length($`);
  2884.             $end = $st + length($location);
  2885.             $key = "$st-$end";
  2886.             addType($key, "State");      
  2887.         }
  2888.         }
  2889.        
  2890.         if ($len == 3) {
  2891.         while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3]))\b/ig) {
  2892.             $location = $1;
  2893.             $st = length($`);
  2894.             $end = $st + length($location);
  2895.             $key = "$st-$end";
  2896.             addType($key, "State");
  2897.         }
  2898.         }
  2899.                    
  2900.         if ($len == 4) {
  2901.         while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3])( )($loc[4]))\b/ig) {
  2902.             $location = $1;
  2903.             $st = length($`);
  2904.             $end = $st + length($location);
  2905.             $key = "$st-$end";
  2906.             addType($key, "State");
  2907.            
  2908.         }
  2909.         }
  2910.     }
  2911.       } #end if us_state_filter is on
  2912.  
  2913.  
  2914.  
  2915.  
  2916.  
  2917.  
  2918.  
  2919.  
  2920.         #######
  2921.     # Sub-function: hospitalIndicators()
  2922.     # Searches for hospital indicators and checks if previous and following words are hospitals
  2923.    
  2924.     foreach $h (@hospital_indicators) {
  2925.  
  2926.         while ($text =~ /((([A-Za-z\-\']+)( + *))?(([A-Za-z\-\']+)( + *))?($h\b)(( + *)([A-Za-z\-\']+))?(( + *)([A-Za-z\-\']+))?\b)/ig) {  
  2927.  
  2928.        
  2929.         my $typeadded = 0;
  2930.         $st1 = length($`);
  2931.         $end1 = $st1 + length($3);
  2932.         $key1 = "$st1-$end1";
  2933.         $st2 = $st1 + length($2);
  2934.         $end2 = $st2 + length($6);
  2935.         $key2 = "$st2-$end2";
  2936.         $st3 = $st1 + length($2) + length ($5) + length($8) + length($10);
  2937.         $end3 = $st3 + length($11);
  2938.         $key3 = "$st3-$end3";
  2939.         $st4 = $end3 + length($13);
  2940.         $end4 = $st4 + length($14);
  2941.         $key4 = "$st4-$end4";
  2942.         $st5 = $end2 + length($7);
  2943.         $end5 = $st5 + length($8);
  2944.         $key5 = "$st5-$end5";
  2945.  
  2946.        
  2947.         if (length($5)==0) {
  2948.        
  2949.             if ((length($3) > 1) &&  (!isUnambigCommon($3)) && (!(isCommon ($3)) || (isType ($key1, "Hospital", 1)))) {
  2950.             addType ($key1, "Hospital");
  2951.             #addType ($key5, "Hospital-Ind");
  2952.             $typeadded = 1;
  2953.             }
  2954.         }
  2955.  
  2956.          elsif ((length($6) > 1) && (!isUnambigCommon($6)) && (!(isCommon ($6)) || isUSState($6) || isUSStateAbbre($6) || (isType ($key2, "Hospital", 1)))) {
  2957.    
  2958.             addType ($key2, "Hospital");
  2959.                     #addType ($key5, "Hospital-Ind");
  2960.             $typeadded = 1;
  2961.            
  2962.             if ((length($3) > 1) &&  (!isUnambigCommon($3)) &&  (!(isCommon ($3)) || isUSState($3) || isUSStateAbbre($3) || (isType ($key1, "Hospital", 1)))) {
  2963.             addType ($key1, "Hospital");
  2964.             #addType ($key5, "Hospital-Ind");
  2965.             $typeadded = 1;
  2966.             }
  2967.         }
  2968.  
  2969.         #   #Generating too many false positives.
  2970.         #   #Need a better common word dictionary to enable this.
  2971.         #if ($typeadded == 0) {
  2972.         #    if ((length($11) > 1) && (!(isCommonest ($11)) || (isType ($key3, "Hospital", 1)))) {
  2973.         #   #addType ($key3, "Hospital");
  2974.                 #       #addType ($key5, "Hospital-Ind");
  2975.         #   #if ((length($14) > 1) && (!(isCommonest ($14)) || (isType ($key4, "Hospital", 1)))) {
  2976.         #   #      # addType ($key4, "Hospital");
  2977.         #   #       #addType ($key5, "Hospital-Ind");          
  2978.         #   #   }
  2979.         #    }
  2980.         #} #end if (typeadded == 0)
  2981.         }
  2982.     }
  2983.     }
  2984. }
  2985. # End of function location2()
  2986.  
  2987.  
  2988. #***********************************************************************************************************
  2989. #***********************************************************************************************************
  2990. #***********************************************************************************************************
  2991. # Function: problem()
  2992. # Checks for names preceded by "problem", pattern found in discharge summaries
  2993.  
  2994. sub problem {
  2995.     $text = $_[0];  
  2996.  
  2997.     $k = "problem";
  2998.     $l = ":";
  2999.     while ($text =~ /\b(([A-Za-z\-]+) + *($k))\b/ig) {
  3000.     if ((!isCommon($2)) || (isNameAmbig($2))) {
  3001.         $st = length($`);
  3002.         $end = $st + length($2);
  3003.         $key = "$st-$end";    
  3004.         addType ($key, "Last Name");
  3005.     }
  3006.     }
  3007. }
  3008. # End of function problem()
  3009. #***********************************************************************************************************
  3010. #***********************************************************************************************************
  3011. #***********************************************************************************************************
  3012. # Function: mrn()
  3013. # Checks for medical record numbers, i.e. numbers preceded by "mrn" or "medical record number"
  3014.  
  3015. sub mrn {
  3016.     $text = $_[0];      
  3017.     if ($unitfilter =~ /y/) {
  3018.    
  3019.     while ($text =~ /\b(mrn( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3020.         my $unit = $1;
  3021.         my $st = length($`);
  3022.         my $key = "$st-".((length $unit) + $st);
  3023.         addType ($key, "Medical Record Number");
  3024.     }
  3025.    
  3026.     @numbers = ("number", "no", "num", "");
  3027.    
  3028.     foreach $i (@numbers) {
  3029.         while ($text =~ /\b(medical record( *)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3030.         my $unit = $1;
  3031.         my $st = length($`);
  3032.         my $key = "$st-".((length $unit) + $st);
  3033.         addType ($key, "Medical Record Number");
  3034.         }
  3035.     }
  3036.     }
  3037. }
  3038. # End of function mrn()
  3039. #***********************************************************************************************************
  3040. #***********************************************************************************************************
  3041. #***********************************************************************************************************
  3042. # Function: unit()
  3043. # Checks for unit numbers in discharge summaries
  3044. # Removes the entire field of "unit <number>" or unit <number>/<number>
  3045. # If "unit" is not followed by a number, does not remove it
  3046.  
  3047. sub unit {
  3048.     $text = $_[0];      
  3049.     if ($unitfilter =~ /y/) {
  3050.    
  3051.     @numbers = ("number", "no", "num", "");
  3052.    
  3053.     foreach $i (@numbers) {
  3054.         while ($text =~ /\b(unit( ?)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
  3055.         my $unit = $1;
  3056.         my $st = length($`);
  3057.         my $key = "$st-".((length $unit) + $st);
  3058.         addType ($key, "Unit Number");
  3059.         }
  3060.     }
  3061.     }
  3062. }
  3063. # End of function unit()
  3064. #***********************************************************************************************************
  3065. #***********************************************************************************************************
  3066. #***********************************************************************************************************
  3067. # Function: name3()
  3068. # Checks every lastnameprefix
  3069. # If the following word is either a name or not commonest, identifies it as lastname
  3070.  
  3071. sub name3 {
  3072.     $text = $_[0];  
  3073.     if ($namefilter =~ /y/) {
  3074.  
  3075.     foreach $line (@prefixes_unambig) {
  3076.     while ($text =~ /\b(($line)([\s\'\-])+ *)([A-Za-z]+)\b/ig) {
  3077.         my $pre = $2;
  3078.         my $prestart = length($`);
  3079.         my $preend = $prestart+(length $pre);
  3080.         my $prekey = "$prestart-$preend";
  3081.         my $lname = $4;
  3082.         my $lstart = $prestart+length($1);
  3083.         my $lend = $lstart+length($4);
  3084.         my $lnamekey = "$lstart-$lend";
  3085.         if ((!(isCommonest ($lname))) || (isType ($lnamekey, "Name", 1))) {
  3086.         addType ($prekey, "Name Prefix (Prefixes)");
  3087.         addType ($lnamekey, "Last Name (Prefixes)");
  3088.         }
  3089.     }
  3090.     }
  3091.        #****************************************************************************************
  3092.    
  3093.     @specific_titles = ("MR", "MISTER", "MS");
  3094.     foreach $i (@specific_titles) {
  3095.        
  3096.         while ($text =~ /\b($i\.( *))([A-Za-z\'\-]+)\b/ig) {
  3097.         $potential_name = $3;
  3098.         $start = length($`)+length($1);
  3099.         $end = $start + length($potential_name);
  3100.         $key = "$start-$end";
  3101.         if (isType($key, "Name", 1)) {
  3102.             addType ($key, "Name13 (STitle)");
  3103.         }
  3104.        
  3105.         elsif (!(isCommon($potential_name))) {
  3106.             addType ($key, "Name14 (STitle)");
  3107.         }
  3108.         }
  3109.     }
  3110.         #****************************************************************************************
  3111.     # Goes through word by word looking for unspotted names
  3112.     # All words have already been marked as potential names (where appropriate) by the previous routines
  3113.     # Looks for last names following titles (Dr. Smith)
  3114.     # Also should pick up "Dr. S"
  3115.     # Looks for the last name prefixes
  3116.    
  3117.     foreach $i (@strict_titles) {      
  3118.       L:
  3119.  
  3120.         while ($text =~ /\b($i\b\.? *)([A-Za-z\'\-]+)\b/ig) { # added ' -
  3121.  
  3122.         my $tt = $1;
  3123.             my $word = $2;
  3124.            
  3125.         my $st = length($`) + length($1);
  3126.         my $fi = $st + length($2);
  3127.         my $key = "$st-$fi";
  3128.         if (exists $prefixes{uc($word)}) {
  3129.    
  3130.             addType ($key, "Last Name (STitle)");
  3131.             my $start = $fi;
  3132.             my $nextWord = substr $text, $start;
  3133.  
  3134.             if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
  3135.            
  3136.             my $token = $4;
  3137.             my $lstart = $start+length($1)+length($2)+length($3);
  3138.             my $lend = $lstart+length($4);
  3139.             my $fi += length($1)+length($4);
  3140.             if (exists $prefixes{uc($token)}){
  3141.                 addType ("$start-$fi", "Last Name (STitle)");
  3142.                 my $start = $fi;
  3143.                 my $nextWord = $nextWord;
  3144.                 my $token = $token;
  3145.                 if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
  3146.                 $word = $3;
  3147.                 $key = "$fi-".($fi + length($2) + length($3));
  3148.                 } else {
  3149.                 next L;
  3150.                 }
  3151.             }
  3152.             else {
  3153.                 # Has already identified one prefix, should now check to see if next word is name or is not commonest  
  3154.                 $word = $token;
  3155.                 $key = "$lstart-$lend";
  3156.            
  3157.                 if (isProbablyName($key, $word)){              
  3158.                 addType ($key, "Last Name (STitle)");
  3159.                 }
  3160.             }
  3161.             } else {
  3162.             next L;
  3163.             }
  3164.         } else { #else $word is not a prefix
  3165.  
  3166.             if ($word =~ /\'([A-Za-z]+)/) {
  3167.             $word = $1;
  3168.             $st--;
  3169.             $key = $st."-".$fi;
  3170.             }
  3171.             if ($word =~ /([A-Za-z]+)\'/) {
  3172.             $word = $1;
  3173.             $key =  $st."-".($fi-1);   
  3174.             }
  3175.         }
  3176.        
  3177.  
  3178.         ###########################################################
  3179.         if (exists $phi{$key}) {
  3180.        
  3181.             addType($key, "Last Name (STitle)");
  3182.             if ((isType($key, "First Name", 1))) {
  3183.        
  3184.                addType($key, "First Name (STitle)");
  3185.             }
  3186.         } else {
  3187.             if (isProbablyName($key, $word)){
  3188.             addType ($key, "Last Name (STitle)");
  3189.        
  3190.             }
  3191.             else {
  3192.             addType ($key, "Last Name (STitle)");
  3193.  
  3194.             }
  3195.         } #end else (!exists $phi{$key})
  3196.  
  3197.  
  3198.  
  3199.         ################################
  3200.                 #added to catch Dr. <firstname> <lastname>
  3201.         #check the word after $word
  3202.         my ($tmpStart,$tmpEnd) = split '-', $key;
  3203.         my $following = substr $text, $tmpEnd;
  3204.  
  3205.         if($following =~/\A(\s+)([A-Za-z\-\']{2,})\b/g){
  3206.             my $fword = $2;
  3207.             my $newst = $tmpEnd + (length ($1));
  3208.             my $nextKey = "$newst-".($newst + length($2));
  3209.    
  3210.             if (isProbablyName($nextKey, $fword)){
  3211.             addType($nextKey, "Name (STitle)");
  3212.             }
  3213.         }
  3214.         ##########################################
  3215.  
  3216.         } #end while text matches the pattern
  3217.     }  #end for each $i strict_titles
  3218.  
  3219.  
  3220.  
  3221.        #****************************************************************************************
  3222.     # Goes through word by word looking for unspotted names
  3223.     # All words have already been marked as potential names (where appropriate) by the previous routines
  3224.     # Looks for last names following titles (Dr. Smith)
  3225.     # Also should pick up "Dr. S"
  3226.     # Looks for the last name prefixes
  3227.    
  3228.        #mark as ambiguous if common words
  3229.     foreach $i (@titles) {     
  3230.       L:
  3231.         while ($text =~ /\b($i\b\.? ?)([A-Za-z]+) *([A-Za-z]+)?\b/ig) {
  3232.    
  3233.         my $tt = $1;
  3234.         my $word = $2;     
  3235.         my $st = length($`) + length($1);
  3236.         my $fi = $st + length($2);
  3237.         my $key = "$st-$fi";
  3238.  
  3239.         my $wordAfter = $3; ##added to catch last names
  3240.         my $stAfter = $fi + 1;
  3241.         my $fiAfter = $stAfter + length ($3);
  3242.             my $keyAfter = "$stAfter-$fiAfter";
  3243.        
  3244.  
  3245.         if (exists $prefixes{uc($word)}) { 
  3246.            
  3247.             addType ($key, "Last Name (Titles)");
  3248.             my $start = $fi;
  3249.             my $nextWord = substr $text, $start;
  3250.             if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
  3251.             my $token = $4;
  3252.             my $lstart = $start+length($1)+length($2)+length($3);
  3253.             my $lend = $lstart+length($4);
  3254.             my $fi += length($1)+length($4);
  3255.             if (exists $prefixes{uc($token)}){
  3256.                 addType ("$start-$fi", "Last Name (Titles)");
  3257.                 my $start = $fi;
  3258.                 my $nextWord = $nextWord;
  3259.                 my $token = $token;
  3260.                 if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
  3261.                 $word = $3;
  3262.                 $key = "$fi-".($fi + length($2) + length($3));
  3263.                 } else {
  3264.                 next L;
  3265.                 }
  3266.             }
  3267.             else {
  3268.                 # Has already identified one prefix, should now check to see if next word is name or is not commonest  
  3269.                 $word = $token;
  3270.                 $key = "$lstart-$lend";
  3271.  
  3272.                 if (isProbablyName($key, $word) && length($word) > 1 ){
  3273.  
  3274.                 addType ($key, "Last Name (Titles)");
  3275.                 }
  3276.             }
  3277.             } else {
  3278.             next L;
  3279.             }
  3280.         } else {
  3281.             if ($word =~ /\'([A-Za-z]+)/) {
  3282.             $word = $1;
  3283.             $st--;
  3284.             $key = $st."-".$fi;
  3285.             }
  3286.             if ($word =~ /([A-Za-z]+)\'/) {
  3287.             $word = $1;
  3288.             $key =  $st."-".($fi-1);
  3289.             }
  3290.         }
  3291.        
  3292.             if (length ($wordAfter) > 1) {
  3293.  
  3294.             my $tmp = isCommon($wordAfter);
  3295.             if (!isCommonest($wordAfter)  ||  (isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)"))  ||
  3296.                 (isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ) {
  3297.            
  3298.             addType($keyAfter, "Last Name (Titles)");
  3299.             addType($key, "First Name (Titles)");
  3300.             }  
  3301.         }
  3302.  
  3303.         elsif (exists $phi{$key}) {
  3304.             if ((isType($key, "Name", 1))) {
  3305.             addType($key, "Last Name (Titles)");
  3306.             }
  3307.         } else {
  3308.             if ( length($word)  > 1 && !(isCommon($word)) ) {
  3309.             addType ($key, "Last Name (Titles)");
  3310.             }
  3311.             else {
  3312.             if (($word =~ /\b[A-Z][a-z]+\b/) || ($tt =~ /$i\. /)) {
  3313.              
  3314.                 addType ($key, "Last Name (Titles  ambig)");
  3315.                
  3316.             } else {
  3317.                 addType ($key, "Last Name (Titles ambig)");
  3318.             }
  3319.             }
  3320.         }
  3321.         }
  3322.     }
  3323.  
  3324.    
  3325.  
  3326.    
  3327.        #****************************************************************************************
  3328.     # Implements simple rules for finding names that aren't in the list or are ambiguous...
  3329.     # first name + last name -> first name + last name (ambig), first name +
  3330.         #not-on-any-safe-word-list, else save the second word and see whether it appears
  3331.         #in the patient text not associated with the first name or any other name indicator
  3332.     # Also first + initial + last name
  3333.     # Finds each prefix, labels the next not uncommonest word    
  3334.     # Finds all first names (unambig), look at following word -> make last name unambigs
  3335.    
  3336.     foreach $k (keys %phi) {
  3337.        if (((isType($k, "Male First Name", 1)) || (isType($k, "Female First Name", 1))) && ((isType($k, "(un)", 1)) || (isType($k, "pop", 1)))) {      
  3338.    
  3339.         my ($start, $end) = split '-', $k;
  3340.         my $following = substr $text, $end;
  3341.        
  3342.         # No middle initial
  3343.            
  3344.         if ($following =~ /\A( +)([A-Za-z\']{2,})\b/g) { #added to catch firstname s.a. O'Connell
  3345.             my $fword = $2;
  3346.             my $st = $end + (length $1);
  3347.             my $nextKey = "$st-".($st + length($2));
  3348.  
  3349.             if (exists $phi{$nextKey}) {
  3350.  
  3351.             if ((isType($nextKey, "Name", 1) == 1) && isProbablyName($nextKey, $fword)) {    
  3352.                 addType($nextKey, "Last Name (NamePattern1)");
  3353.                 addType($k,"First Name4 (NamePattern1)"); # make it unambig
  3354.             }
  3355.             }
  3356.             else {
  3357.             if (isProbablyName($nextKey, $fword)){
  3358.              
  3359.                 addType ($nextKey, "Last Name (NamePattern1)");
  3360.                 addType($k,"First Name5 (NamePattern1)"); }}}# make it unambig
  3361.                
  3362.                 # Looks for that middle initial
  3363.                 if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z\-][A-Za-z\-]+)\b/g) {
  3364.                     my $initial = $2;
  3365.                     my $lastN = $4;
  3366.                     my $st = $end + (length $1);
  3367.                     my $iniKey = "$st-".($st+1);
  3368.                     my $stn = $st + (length $2) + (length $3);
  3369.                     my $nextKey = "$stn-".($stn + (length $4));
  3370.                     if (exists $phi{$nextKey}) {
  3371.                     if ((isType($nextKey, "Last Name", 0) == 0)) {
  3372.                         addType($nextKey, "Last Name (NamePattern1)");
  3373.                         addType($iniKey, "Initial (NamePattern1)");
  3374.                         addType($k,"First Name11 (Name Pattern1)");
  3375.                     }
  3376.                     }
  3377.                     else {         
  3378.                     if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z][A-Za-z]+)\b\s*\Z/g){
  3379.                         addType ($nextKey, "Last Name (NamePattern1)");
  3380.                         addType($iniKey, "Initial (NamePattern1)");
  3381.                         addType($k,"First Name6 (NamePattern1)");
  3382.                     }
  3383.                     elsif (!(isCommonest($lastN))) {
  3384.                         addType ($nextKey, "Last Name (NamePattern1)");
  3385.                         addType($iniKey, "Initial (NamePattern1)");
  3386.                         addType($k,"First Name7 (NamePattern1)");
  3387.                     }
  3388.                     }
  3389.                 }
  3390.         }
  3391.     }
  3392.    
  3393.     # Finds all last names (unambig), looks at proceeding word -> make first names unambigs
  3394.     foreach $k (keys %phi) {
  3395.         if (isType($k, "Last Name", 1) && (isType($k, "(un)", 1))) {
  3396.        
  3397.         my ($start, $end) = split '-', $k;
  3398.         my $preceding = substr $text, 0, $start;
  3399.        
  3400.         if ($preceding =~ /\b([A-Za-z]+)( *)\Z/g) {  
  3401.             my $pword = $1;
  3402.             my $st = length($`);
  3403.             my $prevKey = "$st-".($st + (length $1));
  3404.             if (exists $phi{$prevKey}) {
  3405.             #my $result = isNameIndicator($pword);
  3406.             #print "pword is $pword, isNameIndicator returns $result";
  3407.             #if ((isType($prevKey, "First Name", 1)) && (!isType($prevKey, "Name Indicator", 0))) {
  3408.             if ((isType($prevKey, "First Name", 1)) && (!isNameIndicator($pword)) ) {
  3409.                 addType($prevKey, "First Name8 (NamePattern2)");
  3410.             } # Else it's been positively identified as something that is not a name so leave it
  3411.             }
  3412.             else {
  3413.             # Sees whether it appears in the common words...
  3414.             if (!(isCommon($pword))) {
  3415.                
  3416.                 addType ($prevKey, "First Name9 (NamePattern2)");
  3417.             }
  3418.             }
  3419.         }
  3420.         }
  3421.     }  
  3422.         #****************************************************************************************
  3423.     # Looks for compound last names -> last name + last name (ambig), last name + not-on-any-safe-word-list, last name "-" another word
  3424.     # Last name with an ambiguous name preceding it has already labeled the preceding thing a first name; no huge loss if it's just a weird first part of a compound last name
  3425.    
  3426.     foreach $k (keys %phi) {
  3427.         if (isType($k, "Last Name", 0)) {
  3428.        
  3429.         my ($start, $end) = split '-', $k;
  3430.         my $following = substr $text, $end;
  3431.        
  3432.         if ($following =~ /\A-([A-Za-z]+)\b/g) { #hypen-ated last name
  3433.             my $newend = $end+length($1)+1;
  3434.             my $nextKey = "$end-$newend";
  3435.             addType ($nextKey, "Last Name (NamePattern3)");
  3436.         }
  3437.         if ($following =~ /\A( *)([A-Za-z]+)\b/g) {
  3438.             my $fword = $2;
  3439.         my $st = $end + (length $1);
  3440.             my $nextKey = "$st-".($st + length($2));
  3441.             if (exists $phi{$nextKey}) {
  3442.             if (!(isType($nextKey, "ambig", 1))) {
  3443.             if (isType($nextKey, "Last Name", 0) == 0) {
  3444.                 addType($nextKey, "Last Name (NamePattern3)");
  3445.             }
  3446.             } # Else it's been positively identified as something that is not a name so leaves it
  3447.         }
  3448.             else {
  3449.             # Sees whether it appears in the common words
  3450.             if (!(isCommon($fword))) {
  3451.                 addType ($nextKey, "Last Name (NamePattern3)");
  3452.             }
  3453.             }
  3454.         }
  3455.         }
  3456.     }
  3457.        #****************************************************************************************
  3458.     # Looks for initials
  3459.     # Many last names get classified as first names and other PHI -> looks for initial before all unambig names and locations
  3460.    
  3461.       INI:
  3462.     foreach $k (keys %phi) {
  3463.         if (  ((!(isType($k, "ambig", 1))) ||    isType($k, "(un)",1))   && (isType($k, "Name", 1))) {
  3464.         #if (isType($k, "Name", 1)) {  
  3465.         my ($start, $end) = split '-', $k;
  3466.         my $preceding = substr $text, 0, $start;
  3467.        
  3468.         # Checks for two initials
  3469.    
  3470.         if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {         
  3471.             my $key = (length ($`))."-".(length($`) + (length $1));
  3472.             addType ($key, "Initials (NamePattern4)");
  3473.             if (!(isType($k, "Last Name", 0))) {
  3474.             }
  3475.         }
  3476.        
  3477.         # Checks if preceding word is an initial
  3478.         elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
  3479.        
  3480.             my $tmp = substr $text, $start, $end - $start +1;
  3481.        
  3482.             my $init = $1;
  3483.             my $key = (length ($`))."-".(length($`) + (length $1));
  3484.             if (lc($init) eq "s") {
  3485.             if ((substr $preceding, (length($`) - 1), 1) eq "'") {  #for 's
  3486. #           next INI;
  3487.             }
  3488.             }
  3489.             if ((lc($init) eq "a") || (lc($init) eq "i")) {
  3490.             if (isCommon(substr $text, $start, ($end - $start))) {
  3491. #           next INI;
  3492.             }
  3493.             }
  3494.        
  3495.             if (length($init)==2 || length($init)==1) {
  3496.             addType ($key, "Initials (NamePattern4)");
  3497.             }
  3498.             if (!(isType($k, "Last Name", 0))) {
  3499.             addType ($k, "Last Name (NamePattern4)");
  3500.             }
  3501.         }      
  3502.         }
  3503.     }
  3504.        #****************************************************************************************
  3505.        # Looks for initials; similar to previous code block
  3506.    
  3507.     foreach $k (keys %phi) {
  3508.         if (isType($k, "Last Name", 1) && (!isType ($k, "ambig", 1))) {
  3509.        
  3510.         my ($start, $end) = split '-', $k;
  3511.         my $preceding = substr $text, 0, $start;
  3512.         #two initials (why would they write that?  Why not?) {
  3513.         if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {
  3514.             my $key = (length ($`))."-".(length($`) + (length $1));
  3515.             addType ($key, "Initials (NamePattern5)");
  3516.             if (!(isType($k, "Last Name", 0))) {
  3517.             addType ($k, "Last Name (NamePattern5)");
  3518.             }
  3519.         }
  3520.        
  3521.         elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
  3522.             my $init = $1;
  3523.             my $key = (length ($`))."-".(length($`) + (length $1));
  3524.             if (lc($init) eq "s") {
  3525.             if ((substr $preceding, (length($`) - 1), 1) eq "'") {  #for 's
  3526.                 #next INI;
  3527.             }
  3528.             }
  3529.             if ((lc($init) eq "a") || (lc($init) eq "i")) {
  3530.             if (isCommon(substr $text, $start, ($end - $start))) {
  3531. #           next INI;
  3532.             }
  3533.             }
  3534.    
  3535.         }      
  3536.         }
  3537.     }
  3538.         #****************************************************************************************
  3539.  
  3540.     # Searches for patterns "name and/or," comma list names
  3541.     foreach $k (keys %phi) {
  3542.        
  3543.         if ((isType($k, "Last Name", 0)) || (isType($k, "Male First Name", 0)) || (isType($k, "Female First Name", 0))) {
  3544.         my ($start, $end) = split '-', $k;
  3545.         my $following = substr $text, $end;
  3546.        
  3547.         if ((length $following) == 0) { next; }
  3548.        
  3549.         # First just looks for "and"/"or"
  3550.         if ($following =~ /\A and ([A-Za-z]+)\p{IsPunct}/ig) {
  3551.             my $word = $text1;
  3552.             my $key = ($end + 5)."-".($end + 5 + length($1));
  3553.             if ((isType($key, "Name", 1))  || (!(isCommon($word)))) {
  3554.             addType ($key, "Last Name (NamePattern6)");
  3555.             }
  3556.         }
  3557.         elsif ($following =~ /\A and ([A-Za-z]+)\b/ig) {
  3558.             my $word = $1;
  3559.             my $key = ($end + 5)."-".($end + 5 + length($1));
  3560.             if (!(isCommon($word))) {
  3561.             addType ($key, "Last Name (NamePattern6)");
  3562.             }
  3563.         }
  3564.         elsif ($following =~ /\A or ([A-Za-z]+)\b/ig) {
  3565.             my $word = $1;
  3566.             my $key = ($end + 4)."-".($end + 4 + length($1));
  3567.             if (!(isCommon($word))) {
  3568.             addType ($key, "Last Name (NamePattern6)");
  3569.             }
  3570.         }
  3571.         elsif ($following =~ /\A( ?[\&\+] ?)([A-Za-z]+)\b/ig) {
  3572.             my $word = $2;
  3573.             my $st = $end + (length $1);
  3574.             my $key = "$st-".($st + length($2));
  3575.             if (!(isCommon($word))) {
  3576.             addType ($key, "Last Name (NamePattern6)");
  3577.             }
  3578.         }
  3579.         elsif ($following =~ /\A, ([A-Za-z]+)(,? and )([A-Za-z]+)\b/ig) {
  3580.             # Searches up to 3 names in a list
  3581.             my $name1 = $1;
  3582.             my $name2 = $3;
  3583.             my $st2 = $end + 2 + (length $name1) + length($2);
  3584.             my $key1 = ($end + 2)."-".($end + 2 + (length $name1));
  3585.             my $key2 = "$st2-".($st2 + length($name2));
  3586.             if (!(isCommon($name1))) {
  3587.             addType ($key1, "Last Name (NamePattern6)");
  3588.             }
  3589.             if (!(isCommon($name2))) {
  3590.             addType ($key2, "Last Name (NamePattern6)");
  3591.             }
  3592.         }
  3593.         }
  3594.     }
  3595. }
  3596. }
  3597. # End of function name3()
  3598.  
  3599.  
  3600.  
  3601. #***********************************************************************************************************
  3602. #***********************************************************************************************************
  3603. #***********************************************************************************************************
  3604. # Function: commonHoliday()
  3605. # Searches for some common holiday names that can identify the date
  3606. # Extension: Add new holiday names to this regex
  3607.  
  3608. sub commonHoliday() {
  3609.     $text = $_[0];
  3610.     if ($datefilter =~ /y/) {
  3611.    
  3612.     while ($text =~ /\b(christmas|thanksgiving|easter|hannukah|rosh hashanah|ramadan)\b/ig) {
  3613.         $holidayname = $1;
  3614.         $start = length($`);
  3615.         $end = $start + length($holidayname);
  3616.         $key = $start."-".$end;
  3617.         addType ($key, "Holiday");
  3618.     }
  3619.     }
  3620. }
  3621. # End of function commonHoliday()
  3622. #***********************************************************************************************************
  3623. #***********************************************************************************************************
  3624. #***********************************************************************************************************    
  3625. # Function: knownPatientName()
  3626. # Searches for PID-specific patient names known a priori, i.e. the patient first and last names for this particular PID
  3627. # Indiscriminately removes these PHI from anywhere in the text
  3628. # Extension: To include new PID-patient name mappings, extend the file $patient_file
  3629.  
  3630. sub knownPatientName {
  3631.     $text = $_[0];
  3632.     if ($namefilter =~ /y/) {
  3633.  
  3634.     foreach $i (@known_first_name) {
  3635.         while ($text =~ /\b($i)\b/ig) {
  3636.         my $start = length($`);
  3637.         my $end = $start + length($1);
  3638.         my $key = "$start-$end";
  3639.         addType ($key, "Known patient firstname");
  3640.         }
  3641.     }
  3642.    
  3643.     foreach $j (@known_last_name) {
  3644.         while ($text =~ /\b($j)\b/ig) {
  3645.         my $start = length($`);
  3646.         my $end = $start + length($1);
  3647.         my $key = "$start-$end";
  3648.         addType ($key, "Known patient lastname");
  3649.         }
  3650.     }      
  3651.     }
  3652. }
  3653. # End of function knownPatientName()
  3654.  
  3655.  
  3656.  
  3657.  
  3658. #***********************************************************************************************************
  3659. #***********************************************************************************************************
  3660. #***********************************************************************************************************
  3661. #Returns true if the number passed in matches a defined us area code
  3662. sub isCommonAreaCode  {
  3663.     $areacode = $_[0];
  3664.     return ($us_area_code{$areacode});
  3665.  
  3666. }
  3667.  
  3668.  
  3669.  
  3670. #***********************************************************************************************************
  3671. #***********************************************************************************************************
  3672. #***********************************************************************************************************
  3673. # Function: telephone()
  3674. # Searches for telephone numbers, with or without area codes, with or without extensions
  3675. # Extension: To add new formats, add a new rule
  3676.  
  3677. sub telephone() {
  3678.     $text = $_[0];
  3679.     if ($telfilter =~ /y/) {
  3680.         while ($text =~ /\b\(?\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\)?\b/g) { #added back \b to avoid high fp
  3681.     #while ($text =~ /\b\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\b/g) { #added back \b to avoid high fp
  3682.         my $start = length($`);
  3683.         my $end = $start + length($&);
  3684.  
  3685.         my $nextSeg = substr $text, $end, 20;
  3686.  
  3687.         #catch extensions
  3688.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {       
  3689.         $end += length($1);
  3690.         } elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3691.         $end += length($1);
  3692.         } elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
  3693.         $end += length($1);
  3694.         }
  3695.  
  3696.        
  3697.         my $key = "$start-$end";
  3698.  
  3699.         #now checks the context
  3700.         $context_len = 20;
  3701.         my $start_pos = $start - $context_len;
  3702.         if ($start_pos < 0) {
  3703.         $start_pos = 0;
  3704.         }
  3705.         my $len = $context_len;
  3706.         if  (length ($text) < ($end + $context_len)){
  3707.         $len = length($text) - $end;
  3708.         }
  3709.         my $textBefore = substr $text, $start_pos, ($start - $start_pos);
  3710.         my $textAfter = substr $text, $end, $len;
  3711.         if (isProbablyPhone($textBefore)){
  3712.         addType ($key, "Telephone/Fax (1)");
  3713.         }
  3714.     }
  3715.    
  3716.    
  3717.        #pattern such as ###-###-####
  3718.        #let's not worry about patterns such as ###-HART and ###-LUNG for now
  3719.     #while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{3}\s?[ \-\.\/\=]*\s?\d{4}\b/g) {
  3720.         while ($text =~ /\d{3}\s*[\)\.\/\-\, ]*\s*\d{3}\s*[ \-\.\/]*\s*\d{4}/g) {
  3721.  
  3722.        
  3723.     #if (isCommonAreaCode($1)){
  3724.         my $st = length($`);
  3725.         my $end = $st + length($&);
  3726.  
  3727.         my $nextSeg = substr $text, $end, 20;
  3728.        
  3729.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3730.        
  3731.         $end += length($1);
  3732.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3733.        
  3734.         $end += length($1);
  3735.         }  elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3736.        
  3737.         $end += length($1);
  3738.         } elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
  3739.        
  3740.         $end += length($1);
  3741.         }
  3742.  
  3743.         my $key = "$st-$end";      
  3744.         addType ($key, "Telephone/Fax (2)");
  3745.  
  3746.     }
  3747.  
  3748.     #allow arbitrary line break (almost) anywhere in the phone numbers (except first 3 digit to reduce fp)
  3749.     #only scrubbs the pattern, if it's a known area code
  3750.     while ($text =~ /(\d\d\d)\s*[\)\.\/\-\, ]*\s*\d\s*\d\s*\d\s*[ \-\.\/]*\s*\d\s*\d\s*\d\s*\d/g) {
  3751.        
  3752.         if (isCommonAreaCode($1)){
  3753.         my $st = length($`);
  3754.         my $end = $st + length($&);
  3755.  
  3756.        
  3757.         my $nextSeg = substr $text, $end, 20;
  3758.        
  3759.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3760.        
  3761.             $end += length($1);
  3762.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3763.        
  3764.             $end += length($1);
  3765.         }  elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3766.        
  3767.             $end += length($1);
  3768.         } elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
  3769.        
  3770.             $end += length($1);
  3771.         }
  3772.  
  3773.         my $key = "$st-$end";      
  3774.         addType ($key, "Telephone/Fax (2)");
  3775.         }
  3776.     }
  3777.  
  3778.     #check phone pattern that has 1 extra or 1 less digit at end
  3779.     #in case pattern such as xxx-xxx-xxx?, check if the first 3 digits match with
  3780.         #common area code
  3781.  
  3782.     while (($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{3}\b/g)){
  3783.        
  3784.         #match it with common local area code
  3785.         if (isCommonAreaCode($1)){
  3786.         my $st = length($`);
  3787.         my $end = $st + length($&);    
  3788.        
  3789.         my $nextSeg = substr $text, $end, 20;
  3790.        
  3791.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3792.        
  3793.             $end += length($1);
  3794.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3795.        
  3796.             $end += length($1);
  3797.         }  elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3798.        
  3799.             $end += length($1);
  3800.         } elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
  3801.        
  3802.             $end += length($1);
  3803.         }
  3804.        
  3805.         my $key = "$st-$end";      
  3806.         addType ($key, "Telephone/Fax (3)");
  3807.         } #end if the first 3 digits are area codes
  3808.     }  #end while
  3809.  
  3810.     #check phone pattern that has 1 extra  digit at end
  3811.     #in case pattern such as xxx-xxx-xxxxx, check if the first 3 digits match with
  3812.         #common area code
  3813.     while (
  3814.            ($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{5}\b/g)) {
  3815.  
  3816.        
  3817.         #match it with common local area code
  3818.         if (isCommonAreaCode($1)){
  3819.         my $st = length($`);
  3820.         my $end = $st + length($&);    
  3821.        
  3822.         my $nextSeg = substr $text, $end, 20;
  3823.        
  3824.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3825.        
  3826.             $end += length($1);
  3827.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3828.        
  3829.             $end += length($1);
  3830.         }  elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3831.            
  3832.             $end += length($1);
  3833.         } elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
  3834.            
  3835.             $end += length($1);
  3836.         }
  3837.    
  3838.         my $key = "$st-$end";      
  3839.         addType ($key, "Telephone/Fax (4)");
  3840.         } #end if the first 3 digits are area codes
  3841.     }  #end while
  3842.  
  3843.  
  3844.      #in case typed in pattern such as ###-####-###
  3845.        while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{4}\s?[ \-\.\/\=]*\s?\d{3}\b/g) {
  3846.         my $st = length($`);
  3847.         my $end = $st + length($&);
  3848.         my $nextSeg = substr $text, $end, 20;
  3849.  
  3850.         if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3851.        
  3852.         $end += length($1);
  3853.         }  elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
  3854.        
  3855.         $end += length($1);
  3856.         }
  3857.         elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
  3858.        
  3859.         $end += length($1);
  3860.         } elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
  3861.        
  3862.         $end += length($1);
  3863.         }
  3864.  
  3865.  
  3866.         my $key = "$st-$end";      
  3867.        
  3868.         addType ($key, "Telephone/Fax (5)");
  3869.     }
  3870.     }    
  3871. }
  3872. # End of function telephone()
  3873.  
  3874. #***********************************************************************************************************
  3875. #***********************************************************************************************************
  3876. #***********************************************************************************************************
  3877. # Add new function here >>>>>>>>>>
  3878. # Follow format shown here if necessary
  3879. # sub functionName {
  3880. #    $text = $_[0];
  3881. #    while ($text =~ /(<search pattern>)/) {
  3882. #       $startIndex = length($');
  3883. #       $endIndex = $startIndex + length($1);
  3884. #       $phiKey = $startIndex."-".$endIndex;
  3885. #       addType ($phiKey, "Name of PHI Category");
  3886. #   }
  3887. # }
  3888.  
  3889. #***********************************************************************************************************
  3890. #***********************************************************************************************************
  3891. #***********************************************************************************************************
  3892. # Function: pruneKeys()
  3893. # Arguments: hash %keys, string $text
  3894. # Returns: array @keylst
  3895. # Called by: findPHI()
  3896. # Description: Extracts PHI locations from hash %keys, compares each loc with previous loc to prevent overlaps
  3897. # Returns an array of pruned PHI keys
  3898.  
  3899. sub pruneKeys {
  3900.     my ($keyh,$text) = @_;
  3901.     my $lk = "";
  3902.     my ($ls,$le)=(0,0);
  3903.     my ($cs,$ce)=(0,0);
  3904.     my @keylst = ();
  3905.     foreach $k (sort numerically keys %{$keyh}) {
  3906.     #print "prunkey, key = $k  values = \n";
  3907.     $ls = $cs;
  3908.     $le = $ce;
  3909.     ($cs,$ce)= split ('-',$k);
  3910.     if ($cs > $le){push (@keylst,$lk);} # proper relation
  3911.     elsif($cs > $ls){
  3912.         if($ce > $le) {
  3913.         my $stgl = substr($text,$ls,$le-$ls);          
  3914.         my $stgc = substr($text,$cs,$ce-$cs);  
  3915.         $cs = $ls; $k = "$ls-$ce"; $$keyh{$k} = $$keyh{$lk}} # include both transfer types
  3916.         else{$cs = $ls; $ce = $le; $k = $lk;}} # use previous (current in previous)
  3917.     elsif($le > $ce){$cs = $ls; $ce = $le; $k = $lk;} # use previous (current in previous)
  3918.     $lk = $k;}
  3919.     #print "pushing $lk to keylst\n";
  3920.     push (@keylst,$lk); # last one
  3921.     return (@keylst)
  3922.     }    
  3923. # End of pruneKeys()
  3924. #***********************************************************************************************************
  3925. #***********************************************************************************************************
  3926. #***********************************************************************************************************
  3927. # Function: addType()
  3928. # Arguments: string $key ("start-end" of PHI loc), string $type (PHI type)
  3929. # Returns: None
  3930. # Called by: findPHI()
  3931. # Description: Pushes PHI key and type into the hash %phi
  3932. # Keeps track of all possible PHI types for each PHI key
  3933.  
  3934. sub addType {
  3935.  
  3936.     my ($key,$type) = @_;
  3937.     ($st,$end) = split '-',$key;
  3938.     if ($end > $end{$st}) {
  3939.     $end{$st} = $end;
  3940.     }
  3941.     #print "in addType, key is $key\n";
  3942.     push @{$phi{$key}}, $type;
  3943.     $t = (@{$phiT{$key}});
  3944.     $start = $st - 1 - 64;
  3945.     $ending = $end - 1 - 64;
  3946.     return;
  3947. }
  3948. # End of addType()
  3949. #***********************************************************************************************************
  3950. #***********************************************************************************************************
  3951. #***********************************************************************************************************
  3952. # Function: isType()
  3953. # Arguments: string $key ("start-end" of PHI loc), string $type (PHI type), int $pattern (1 if PHI type can be matched with '=~'; 0 otherwise)
  3954. # Returns: 1 if PHI $key is of PHI type $type; 0 otherwise
  3955. # Called by: findPHI()
  3956. # Description: Given a PHI loc, checks its PHI type in the existing PHI hash. If the type in the hash is equal to the given type, then returns 1.
  3957.  
  3958. sub isType {
  3959.     my ($key, $type, $pattern) = @_;
  3960.     foreach $tt (@{$phi{$key}}){
  3961. #   print "isType, tt is $tt key is $key\n";
  3962.     if ($pattern) {
  3963.         if ($tt =~ /$type/) {
  3964.         return 1;
  3965.         }
  3966.     }
  3967.     elsif ($tt eq $type) {
  3968.         return 1;
  3969.     }
  3970.     }
  3971.     return 0;
  3972. }
  3973.  
  3974. # End of isType()
  3975.  
  3976. #***********************************************************************************************************
  3977. #***********************************************************************************************************
  3978. #***********************************************************************************************************
  3979. # Function: isPHIType()
  3980. # Arguments: string $mytype (PHI type),  array of string @phiTypes
  3981. # Returns: 1 if PHI $mytype appears in @phiTypes, 0 otherwise.
  3982. # Called by: deid()
  3983. # Description: Given a PHI type, checks if it appears in @phiTypes, if so, returns 1. Returns 0 otherwise.
  3984.  
  3985. sub isPHIType {
  3986.   my (  $mytype,  @phiTypes) = @_;
  3987.    
  3988.     #foreach $tt (@{$phi{$key}}){
  3989.     foreach $tt (@phiTypes){
  3990.    
  3991.     if ($tt =~ /$mytype/) {
  3992.         return 1;
  3993.     } #end if
  3994.     } #end foreach
  3995.     return 0;
  3996. }
  3997.  
  3998. # End of isPHIType()
  3999.  
  4000.  
  4001. #***********************************************************************************************************
  4002. #***********************************************************************************************************
  4003. #***********************************************************************************************************
  4004. # Function: isCommon()
  4005. # Arguments: string $word
  4006. # Returns: 1 if given word is a common word; 0 otherwise
  4007. # Called by: findPHI()
  4008. # Description: Compares the given word to the common_words association, compiled from dictionary files for common English words and from SNOMED.
  4009. # Returns 1 if given word is in one of those lists, i.e. is a common word.
  4010.  
  4011. sub isCommon {
  4012.     my $word = $_[0];
  4013.     chomp $word;
  4014.     $word = uc($word);
  4015.     return  ($common_words{$word} || $unambig_common_words{$word});
  4016.    
  4017. }
  4018. # End of isCommon()
  4019.  
  4020. #***********************************************************************************************************
  4021. #***********************************************************************************************************
  4022. #***********************************************************************************************************
  4023. # Function: isUnambigCommon()
  4024. # Arguments: string $word
  4025. # Returns: 1 if given word is a really common word or unambig med terms; 0 otherwise
  4026. sub isUnambigCommon {
  4027.     my $word = $_[0];
  4028.     $word = uc($word);
  4029.     return $unambig_common_words{$word};
  4030. }
  4031. # End of isUnambigCommon()
  4032.  
  4033.  
  4034. #***********************************************************************************************************
  4035. #***********************************************************************************************************
  4036. #***********************************************************************************************************
  4037. # Function: isNameAmbig()
  4038. # Arguments: string $word
  4039. # Returns: 1 if given word is an ambiguous name
  4040. # Called by: findPHI()
  4041. # Description: Searches for given word in lists of ambiguous male, female and last names
  4042. # Returns 1 if word is in any of those lists
  4043. sub isNameAmbig {
  4044.     my $word = $_[0];
  4045.     $word = uc($word);
  4046.     return (($male_ambig{$word}) || ($female_ambig{$word}) || ($last_ambig{$word}));
  4047. }
  4048.  
  4049. #***********************************************************************************************************
  4050. #***********************************************************************************************************
  4051. #***********************************************************************************************************
  4052. #Function: isProbablyName()
  4053. #Arguments: string $key, $potential_name
  4054. #Returns true if
  4055. #name is Not a common word  OR
  4056. #name is Unambiguous name OR
  4057. #name maybe ambiguous BUT starts with Capital_letter followed by small_case letter OR
  4058. #name is popular
  4059. sub isProbablyName{
  4060.     my ($key, $potential_name) = @_;
  4061.  
  4062.     if ( (!isCommon($potential_name)) ||
  4063.      ((isType($key, "Name", 1) && isType($key, "(un)"))  ||
  4064.       (isType($key, "Name", 1) && ($potential_name =~ /\b(([A-Z])([a-z]+))\b/g)) ||
  4065.       (isType($key, "popular",1)) )) {
  4066.      
  4067.     return 1;
  4068.     } else {
  4069.     return 0;
  4070.     }
  4071.  
  4072. }
  4073. #***********************************************************************************************************
  4074. #***********************************************************************************************************
  4075. #***********************************************************************************************************
  4076. #Function: isUSStateAbbre
  4077. #Data structure used for us states
  4078. #Returns true if the word is a US state abbreviation
  4079. sub isUSStateAbbre {
  4080.     my $word = $_[0];
  4081.     $word = uc($word);
  4082.  
  4083.     foreach $loc (@us_states_abbre){
  4084.     if ($word =~/\b$loc\b/gi){
  4085.         return 1;
  4086.     }
  4087.     }
  4088.     foreach $loc (@more_us_states_abbre){
  4089.     if ($word =~/\b$loc\b/gi){
  4090.         return 1;
  4091.     }
  4092.     }
  4093.     return 0;
  4094.  
  4095. }
  4096.  
  4097. #***********************************************************************************************************
  4098. #***********************************************************************************************************
  4099. #***********************************************************************************************************
  4100. #Function: isNameIndicator
  4101. #Returns true if the word is a name indicator
  4102. sub isNameIndicator {
  4103.     my $word = $_[0];
  4104.     $word = uc($word);
  4105.  
  4106.     foreach $nam (@name_indicators){
  4107.     #print "nam in name indicators is $nam";
  4108.     if ($word =~/\b$nam\b/gi){
  4109.        
  4110.         return 1;
  4111.     }
  4112.     }
  4113.  
  4114.     return 0;
  4115. }
  4116.  
  4117. #***********************************************************************************************************
  4118. #***********************************************************************************************************
  4119. #***********************************************************************************************************
  4120. #Function: isUSState
  4121. #Returns true if the word is a US State
  4122. sub isUSState {
  4123.     my $word = $_[0];
  4124.     $word = uc($word);
  4125.     #return (($us_states{$word}));
  4126.     foreach $loc (@us_states){
  4127.     if ($word =~/\b$loc\b/gi){
  4128.         return 1;
  4129.     }
  4130.     }
  4131.     return 0;
  4132.  
  4133.  
  4134. }
  4135.  
  4136. #***********************************************************************************************************
  4137. #***********************************************************************************************************
  4138. #***********************************************************************************************************
  4139. # Function: isFirstName()
  4140. # Arguments: string $word
  4141. # Returns: 1 if given word is an ambiguous or unambiguous firstname
  4142. # Called by: findPHI()
  4143.  
  4144. sub isFirstName {
  4145.     my $word = $_[0];
  4146.     #$word = ($word);
  4147.     $word = uc($word);
  4148.     return (($male_ambig{$word}) || ($female_ambig{$word}) || ($male_unambig{$word}) || ($female_unambig{$word}) || ($male_popular{$word}) || ($female_popular{$word}));
  4149. }
  4150. # End of isFirstName()
  4151.  
  4152. #***********************************************************************************************************
  4153. #***********************************************************************************************************
  4154. #***********************************************************************************************************
  4155. # Function: isCommonest()
  4156. # Arguments: string $word
  4157. # Returns: 1 if given word is a commonest word; 0 otherwise
  4158. # Called by: findPHI()
  4159. # Description: Compares the given word to the commonest_words association, compiled from dictionary file for commonest English words.
  4160. # Returns 1 if given word is in that list, i.e. is a commonest word.
  4161.  
  4162. sub isCommonest {
  4163.     my $word = $_[0];
  4164.     $word = uc($word);
  4165.     return ($very_common_words{$word} || isUnambigCommon($word));
  4166. }
  4167.  
  4168. # End of isCommonest()
  4169. #***********************************************************************************************************
  4170. #***********************************************************************************************************
  4171. #***********************************************************************************************************
  4172. # Function: getNote()
  4173. # Arguments: int $patient (patient number), int $note (note number)
  4174. # Returns: string $noteText
  4175. # Called by: deidStats() (obsolete)
  4176. # Description: Given patient and note numbers, looks up header in $header_file
  4177. # If it finds the header, extracts the body of the note, and returns text until the end pattern
  4178. sub getNote{
  4179.  
  4180.     my ($patient, $note) = @_;    
  4181.     open DF, $data_file or die "Cannot open $data_file";
  4182.     my $noteFound = 0;
  4183.     my $noteText = "";
  4184.    
  4185.   D:    
  4186.     # Parses the data file line-by-line to match the header found in the header file
  4187.     while ($line = <DF>) {
  4188.     chomp $line;
  4189.    
  4190.  
  4191.     # If header is found in the text, then matches the end pattern, and sets the body of the note (excluding the header) as the note text
  4192.  
  4193.     if ($line =~ /\b$patient\|\|\|\|$note\|\|\|\|/) {
  4194.         $noteFound = 1;
  4195.         $noteText = "";
  4196.     }
  4197.     else {     
  4198.         if ($noteFound) {
  4199.         if ($line eq "||||END_OF_RECORD"){
  4200.             #$noteText = $noteText."\n".$1;
  4201.             #$noteText = $noteText.$1."\n";
  4202.             $end = $2;
  4203.             last D;
  4204.         }
  4205.         else {
  4206.             $noteText = $noteText.$line."\n";
  4207.         }
  4208.         }
  4209.     }
  4210.     }
  4211.     close DF;
  4212.    
  4213.     # If the note text has zero length, prints an error message
  4214.     if ((length $noteText) == 0) {
  4215.     print("Warning. No text found for Patient $patient, Note $note ");
  4216.     }
  4217.  
  4218.     # Returns the body of the note (everything excluding the header) as the note text
  4219.     return $noteText;
  4220. }
  4221. # End of getNote()
  4222.  
  4223.  
  4224.  
  4225.  
  4226. #***********************************************************************************************************
  4227. #***********************************************************************************************************
  4228. #***********************************************************************************************************
  4229. # Function: isValidDate()
  4230. # Arguments: int $month, int $day, int $year (If the date being validated doesn't specify year, the "year" argument should be "-1")
  4231. # Returns: 1 if given date is valid based on the calendar
  4232. # Called by: findPHI(), isValidDay()
  4233. # Description: Verifies if the given date is valid or not.
  4234.  
  4235. sub isValidDate{
  4236.     my ($month, $day, $year) = @_;
  4237.     if (($year!=-1) && (length $year) == 2) {
  4238.     #if ($year < 30) {
  4239.     if ($year <= $TWO_DIGIT_YEAR_THRESHOLD) {
  4240.         $year = "20".$year;
  4241.     }
  4242.     else {
  4243.         $year = "19".$year;
  4244.     }
  4245.     }
  4246.    
  4247.     #if (($year != -1) && ($year < 1900 || $year > 2030)){
  4248.     #if (($year != -1) && ($year <= $VALID_YEAR_LOW || $year >= $VALID_YEAR_HIGH)){
  4249.     if (($year != -1) && ($year < $VALID_YEAR_LOW || $year > $VALID_YEAR_HIGH)){
  4250.    
  4251.     return 0;
  4252.     }
  4253.    
  4254.     # Invalid months and days
  4255.     if (($month< 1) || ($month > 12) || ($day < 1) || ($day > 31)) {
  4256.     return 0;
  4257.     }
  4258.  
  4259.     # Checks validity of February days
  4260.     if ($month == 2) {
  4261.     if (($year != -1) && (($year % 4) == 0) && ($year != 2000)) {
  4262.         return ($day <= 29);
  4263.     }
  4264.     return ($day <= 28);
  4265.    
  4266.     # Checks validity of months consisting of 30 days
  4267.     }
  4268.     elsif (($month == 4) || ($month == 6) || ($month== 9) || ($month == 11)) {
  4269.     return ($day <= 30);
  4270.     }
  4271.    
  4272.     # Checks validity of months consisting of 31 days
  4273.     return ($day <= 31);
  4274. }
  4275. # End of isValidDate()
  4276. #***********************************************************************************************************
  4277. #***********************************************************************************************************
  4278. #***********************************************************************************************************
  4279. # Function: isValidDay()
  4280. # Arguments: int $day, string $month, int $year
  4281. # Returns: 1 if given date is valid; 0 otherwise
  4282. # Called by: findPHI()
  4283. # Description: Verifies validity of date when month is printed, by calling isValidDate()
  4284.  
  4285. sub isValidDay {
  4286.     my ($day, $month, $year) = @_;
  4287.     my $mnum = 0;
  4288.  
  4289.     # Converts printed out months to numerical months
  4290.     if ($month =~ /Jan|January|Januar/i) { $mnum = 1; }
  4291.     elsif ($month =~ /Feb|February|Februar/i) { $mnum = 2; }
  4292.     elsif ($month =~ /Mar|March|Maerz/i) { $mnum = 3; }
  4293.     elsif ($month =~ /Apr|April/i) { $mnum = 4; }
  4294.     elsif ($month =~ /May|Mai/i) { $mnum = 5; }
  4295.     elsif ($month =~ /June|Jun|Juni/i) { $mnum = 6; }
  4296.     elsif ($month =~ /July|Jul|Juli/i) { $mnum = 7; }
  4297.     elsif ($month =~ /August|Aug/i) { $mnum = 8; }
  4298.     elsif ($month =~ /September|Sept|Sep/i) { $mnum = 9; }
  4299.     elsif ($month =~ /October|Oct/i) { $mnum = 10; }
  4300.     elsif ($month =~ /November|Nov/i)  { $mnum = 11; }
  4301.     elsif ($month =~ /December|Dec/i) { $mnum = 12; }
  4302.     if ($mnum == 0) { return 0; }
  4303.    
  4304.     return (isValidDate($mnum, $day, $year));
  4305. }
  4306. # End of isValidDay()
  4307.  
  4308.  
  4309. #***********************************************************************************************************
  4310. #***********************************************************************************************************
  4311. #***********************************************************************************************************
  4312. # Function: outputText()
  4313. # Arguments: hash %deids (key=PHI start index, value=PHI end index), hash %phiT (key=PHI start-end indices, value=PHI type)
  4314. # Returns: None
  4315. # Called by: deid()
  4316. # Description: Creates the de-identified version of the text
  4317. # Replaces dates with shifted dates, and other PHI with their PHI types
  4318. sub outputText {
  4319.     my %deids = %{$_[0]};
  4320.     my %phiT = %{$_[1]};
  4321.  
  4322.     # These are the date and PID of the medical record. The year needs to be initialized.
  4323.    # my $checkYear = "2005";
  4324.     my $checkYear;
  4325.     my $checkDate;
  4326.     my $checkID;
  4327.  
  4328.     # Forms associations between printed months and numerical ones
  4329.     # This is because the date-shifting function only accepts numerical months, and months in the text can be non-numerical as well
  4330.     my %months;
  4331.     $months{"jan"} = 1;
  4332.     $months{"feb"} = 2;
  4333.     $months{"mar"} = 3;
  4334.     $months{"apr"} = 4;
  4335.     $months{"may"} = 5;
  4336.     $months{"jun"} = 6;
  4337.     $months{"jul"} = 7;
  4338.     $months{"aug"} = 8;
  4339.     $months{"sep"} = 9;
  4340.     $months{"oct"} = 10;
  4341.     $months{"nov"} = 11;
  4342.     $months{"dec"} = 12;
  4343.  
  4344.     #TF is the .res file (de-identified corpus), and OF is the (.phi file) list of PHIs
  4345.     # open TF, ">$deid_text_file" or die "Cannot open $deid_text_file";
  4346.     open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file";   #now open in append mode
  4347.     my $lastEnd = 0;
  4348.     #open OF, ">>$output_file_old" or die "Cannot open $output_file_old";
  4349.  
  4350.     my $phiType = "";
  4351.  
  4352.     # Prints the PHI locations to the output file (.phi)
  4353.     foreach $k (sort numerically keys %deids) {        
  4354.     my @deidsval = @{$deids{$k}};
  4355.     local $added = 0;
  4356.     my $phiID;
  4357.  
  4358.     # Loops over each PHI recorded in %deids
  4359.     # %deids maps each PHI start index to an array of 3 items
  4360.     # These items are in order: the PHI end index, the PID of the PHI, the record date of the PHI    
  4361.     #if (exists ${@{$deids{$k}}}[0]) {
  4362.     if (exists $deidsval[0]){
  4363.         # Sets some key variables for the PHI
  4364.  
  4365.         #the following no longer works with perl v5.10
  4366.         #$deidsend = ${@{$deids{$k}}}[0]; # End index
  4367.         #$checkID = ${@{$deids{$k}}}[1]; # PID
  4368.         #$checkDate = ${@{$deids{$k}}}[2]; # Record date
  4369.  
  4370.         $deidsend = $deidsval[0]; # End index
  4371.         $checkID = $deidsval[1]; # PID
  4372.         $checkDate = $deidsval[2]; # Record date
  4373.         $checkYear = (substr $checkDate, 6, 4); # Record year
  4374.         if (length($checkYear) ==0) {
  4375.         $checkYear = extractYear($DEFAULT_DATE);
  4376.         print "Warning, in outputText(), cannot extract year from noteDate, setting year to default year $checkYear.";
  4377.         }
  4378.  
  4379.         # Immediately prints the start and end indices of the PHI to the .phi file
  4380.         #print OF "$k\t$deidsend\n";
  4381.  
  4382.         # Sets the $key to the current PHI
  4383.         my $key = $k."-".$deidsend;
  4384.         my $lastlast = $lastEnd;
  4385.         my $phiText;
  4386.        
  4387.         # If this PHI is a date element, shifts the date and replaces it in the text
  4388.         # Output format is YYYY/MM/DD always
  4389.         # For month/date formats, assumes that year = record year; for month/year formats, assumes that day = 1
  4390.  
  4391.         # Needs to go over %phiT for every PHI in %deids
  4392.         # %phiT maps each PHI key to its type, e.g. "Mary" -> "First Name"
  4393.         # This part is necessary because re-identification depends on the PHI's type
  4394.         foreach $ky (sort numerically keys %phiT) {
  4395.  
  4396.         my $ky = $ky;
  4397.         ($startp, $endp) = split "-", $ky;
  4398.         $notAmbig = 0; 
  4399.        
  4400.         # Checks to see if PHI type matches any of the date patterns
  4401.         # Each PHI may have more than one listed type, e.g. First Name AND Last Name
  4402.         # For each PHI type listed for the specific PHI
  4403.         foreach $t (@{$phiT{$ky}}) {
  4404.  
  4405.             $datephi1 = "Year/Month/Day"; # e.g. 1999/2/23
  4406.             $datephi2 = "Year/Day/Month"; # e.g. 1999/23/2, note: pattern currently not filtered in sub date()!
  4407.             $datephi3 = "Month/Day/Year"; # e.g. 2/23/1999
  4408.             $datephi5 = "Day/Month/Year"; # 23/2/1999, note: pattern currently not filtered in sub date()!
  4409.             $datephi4 = "Month/Day"; # e.g. 2/23, using record year as year
  4410.             $datephi6 = "Day/Month"; # e.g. 23/2, using record current year as year, note: pattern currently not filtered in sub date()!
  4411.             $datephi7 = "Month/Year";  # e.g. 2/1999, using 1 as day
  4412.             $datephi8 = "Year/Month";  # e.g. 1999/2, using 1 as day
  4413.             $datephi9 = "Day Month Year"; # e.g. 23 february 1999
  4414.             $datephi10 = "Month Day Year"; # e.g. feb 23 1999 or feb. 23rd 1999
  4415.             $datephi11 = "Month Day"; # e.g. feb 23, using record year as year
  4416.             $datephi12 = "Day Month"; # e.g. 23 february, using record year as year
  4417.             $datephi13 = "Month Year"; # e.g. feb 1999, or february of 1999 or feb. 1999, using 1 as day
  4418.             $datephi14 = "Header Date"; # not important
  4419.             $datephi15 = "4 digits"; # 4-digit year, e.g. 1999
  4420.             $datephi16 = "2 digits"; # 2-digit year, e.g. '99
  4421.             $datephi17 = "Day Month Year 2"; # e.g. 23rd february 1999
  4422.  
  4423.            
  4424.             #if ($ky =~/$key/) {$phiType = $t;}
  4425.             if ($ky eq $key) {$phiType = $t;}
  4426.  
  4427.             # Calls the date-shifting function alterdate() with a date argument appropriate for the date pattern
  4428.             # This is because alterdate() accepts an argument of a fixed date pattern
  4429.             # Prints the resulting shifted date in deid_text_file (.res)
  4430.            
  4431.             # If the current PHI has not been output to the .res file yet, checks if PHI type is date
  4432.             # Shifts the date and writes the shifted date to TF (.res file)
  4433.             if ($added == 0) {
  4434.            
  4435.             #if (($t =~ /$datephi1/)  && ($ky=~/$key/)){
  4436.             if (($t =~ /$datephi1/)  && ($ky eq $key)){        
  4437.                 $date = (substr $allText, $k, ($deidsend-$k));             
  4438.                 $date =~ s/\-/\//g;    
  4439.                 $date =~ s/\./\//g;
  4440.                 $altereddate = &alterdate($date, $checkID);
  4441.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4442.                 $longyear = $1;
  4443.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";                        
  4444.                 $added = 1;
  4445.                 $lastEnd = $deidsend;
  4446.                
  4447.             }
  4448.             elsif (($t =~ /$datephi17/)  && ($ky eq $key)){  
  4449.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4450.                
  4451.                 $date =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b([A-Za-z]+)\.?,?)\s+(\d{2,4}))\b/ig; # 12-Apr, or Second of April
  4452.  
  4453.                 $mon = $6;
  4454.                 $day = $3;
  4455.                 $year = $7;
  4456.                 foreach $m (sort keys %months) {
  4457.                 if ($mon =~ /$m/ig) {
  4458.                     $month = $months{$m};
  4459.                 }
  4460.                 }              
  4461.                 $date =  "$year/$month/$day";
  4462.  
  4463.                 $altereddate = &alterdate($date, $checkID );
  4464.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4465.                
  4466.                 $added = 1;
  4467.                 $lastEnd = $deidsend;
  4468.             }              
  4469.             #elsif (($t =~ /$datephi9/)  && ($ky=~/$key/)){
  4470.             elsif (($t =~ /$datephi9/)  && ($ky eq $key)){  
  4471.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4472.                 $date =~ /(\d+) ([A-Za-z]+)\,? (\d+)/;
  4473.                 $mon = $2;
  4474.                 $day = $1;
  4475.                 $year = $3;
  4476.                 foreach $m (sort keys %months) {
  4477.                 if ($mon =~ /$m/ig) {
  4478.                     $month = $months{$m};
  4479.                 }
  4480.                 }              
  4481.                 $date =  "$year/$month/$day";
  4482.                 $longyear = $5;
  4483.                 $altereddate = &alterdate($date, $checkID );
  4484.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4485.                
  4486.                 #local $added = 1;
  4487.                 $added = 1;
  4488.                 $lastEnd = $deidsend;
  4489.             }          
  4490.             #elsif (($t =~ /$datephi10/)  && ($ky=~/$key/)){  
  4491.             elsif (($t =~ /$datephi10/)  && ($ky eq $key)){
  4492.  
  4493.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4494.                
  4495.                 $date =~ /\b(([A-Za-z]+)\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?(\d{2,4}))\b/ig;
  4496.                 $mon = $2;
  4497.                 $day = $3;
  4498.                 $year = $5;
  4499.                 #$date =~ /([A-Za-z]+) (\d+)\,? (\d+)/;
  4500.                 #$mon = $1;
  4501.                 #$day = $2;
  4502.                 #$year = $3;
  4503.                 #print "DatePHI10: Before date shift: month= $mon, day = $day , year = $year \n";
  4504.  
  4505.                 foreach $m (sort keys %months) {
  4506.                 if ($mon =~ /$m/ig) {
  4507.                     $month = $months{$m};
  4508.                 }
  4509.                 }
  4510.                 $date =  "$year/$month/$day";
  4511.                 $altereddate = &alterdate($date, $checkID);
  4512.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4513.                
  4514.                 #local $added = 1;
  4515.                 $added = 1;
  4516.                 $lastEnd = $deidsend;
  4517.             }                  
  4518.             #elsif (($t =~ /$datephi2/)  && ($ky=~/$key/)){  
  4519.             elsif (($t =~ /$datephi2/)  && ($ky eq $key)){  
  4520.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4521.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4522.                 $date =  "$1/$5/$3";
  4523.                 $longyear = $1;
  4524.                 $altereddate = &alterdate($date, $checkID);
  4525.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4526.                 #local $added = 1;
  4527.                 $added = 1;
  4528.                 $lastEnd = $deidsend;
  4529.             }      
  4530.             #elsif (($t =~ /$datephi3/)  && ($ky=~/$key/)){  
  4531.             elsif (($t =~ /$datephi3/)  && ($ky eq $key)){  
  4532.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4533.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4534.                 $date =  "$5/$1/$3";
  4535.                 $altereddate = &alterdate($date,  $checkID);
  4536.                 $longyear = $5;
  4537.                
  4538.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4539.                 $added = 1;
  4540.                 $lastEnd = $deidsend;              
  4541.             }
  4542.             #elsif (($t =~ /$datephi5/)  && ($ky=~/$key/)){    
  4543.             elsif (($t =~ /$datephi5/)  && ($ky eq $key)){                 
  4544.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4545.                 $date =~ /(\d+)(.)(\d+)(.)(\d+)/;
  4546.                 $date =  "$5/$3/$1";
  4547.                 $altereddate = &alterdate($date, $checkID );
  4548.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4549.                 $longyear = $5;
  4550.                 $added = 1;
  4551.                 $lastEnd = $deidsend;              
  4552.             }  
  4553.             #elsif (($t =~ /$datephi4/)  && ($ky=~/$key/)){  
  4554.             elsif (($t =~ /$datephi4/)  && ($ky eq $key)){                 
  4555.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4556.                 $date =~ s/\-/\//g;    
  4557.                 $date = "$checkYear/$date";            
  4558.                 $altereddate = &alterdate($date, $checkID);
  4559.                 $altereddate = (substr $altereddate, 5, (length($altereddate)-5));
  4560.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4561.                 $added = 1;
  4562.                 $lastEnd = $deidsend;              
  4563.             }  
  4564.             #elsif (($t =~ /$datephi11/)  && ($ky=~/$key/)){  
  4565.             elsif (($t =~ /$datephi11/)  && ($ky eq $key)){                
  4566.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4567.                 #$date =~ /([A-Za-z]+) (\d+)/;
  4568.                 #$mon = $1;
  4569.                 #$day = $2;
  4570.                 $date =~ /\b(([A-Za-z]+)\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig;  # Apr. 12
  4571.                 $mon = $2;
  4572.                 $day = $3;
  4573.                
  4574.                 foreach $m (sort keys %months) {
  4575.                 if ($mon =~ /$m/ig) {
  4576.                     $month = $months{$m};
  4577.                 }
  4578.                 }
  4579.                 $date =  "$checkYear/$month/$day";
  4580.                 $altereddate = &alterdate($date,  $checkID);
  4581.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));              
  4582.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4583.                 $added = 1;
  4584.                 $lastEnd = $deidsend;
  4585.             }
  4586.                
  4587.             #elsif (($t =~ /$datephi6/)  && ($ky=~/$key/)){  
  4588.             elsif (($t =~ /$datephi6/)  && ($ky eq $key)){                 
  4589.                 $date = (substr $allText, $k, ($deidsend-$k));                     
  4590.                 $date =~ /(\d+)(.)(\d+)/;
  4591.                 #print "DatePHI6: date is $date, 3 is $3, 1 is $1\n";
  4592.                 $date =  "$3/$1";
  4593.                 $date = "$checkYear/$date";
  4594.                 $altereddate = &alterdate($date,  $checkID);
  4595.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));
  4596.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4597.                 $added = 1;
  4598.                 $lastEnd = $deidsend;              
  4599.             }
  4600.             #elsif (($t =~ /$datephi12/)  && ($ky=~/$key/)){    
  4601.             elsif (($t =~ /$datephi12/)  && ($ky eq $key)){                
  4602.                 $date = (substr $allText, $k, ($deidsend-$k)); 
  4603.                 #$date =~ /(\d+) ([A-Za-z]+)/;
  4604.                 #$mon = $2;
  4605.                 #$day = $1;
  4606.                 $date =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b([A-Za-z]+))\b/ig;
  4607.                
  4608.                 $mon = $5;
  4609.                 $day = $2;
  4610.                 #print "month is $month, day is $day\n";
  4611.                 foreach $m (sort keys %months) {
  4612.                 if ($mon =~ /$m/ig) {
  4613.                     $month = $months{$m};
  4614.                 }
  4615.                 }
  4616.                 $date =  "$checkYear/$month/$day";
  4617.                 $altereddate = &alterdate($date,  $checkID);
  4618.                 $altereddate = substr($altereddate, 5, (length($altereddate)-5));              
  4619.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4620.                 $added = 1;
  4621.                 $lastEnd = $deidsend;
  4622.             }
  4623.             #elsif (($t =~ /$datephi13/)  && ($ky=~/$key/)){  
  4624.             elsif (($t =~ /$datephi13/)  && ($ky eq $key)){  
  4625.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4626.                 $date =~ /([A-Za-z]+)(\.)?(\s+)(of\s+)?(\d+)/ig;
  4627.                 $mon = $1;
  4628.                 $year = $5;
  4629.                 foreach $m (sort keys %months) {
  4630.                 if ($mon =~ /$m/ig) {              
  4631.                     $month = $months{$m};
  4632.                 }
  4633.                 }
  4634.  
  4635.                 $date = "$year/$month/1";
  4636.                 $longyear = $3;
  4637.                 $altereddate = &alterdate($date, $checkID );
  4638.                
  4639.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4640.                 $added = 1;
  4641.                 $lastEnd = $deidsend;
  4642.             }
  4643.             #elsif (($t =~ /$datephi7/)  && ($ky=~/$key/)){    
  4644.             elsif (($t =~ /$datephi7/)  && ($ky eq $key)){                 
  4645.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4646.                 $date =~ /(\d+)(.)(\d+)/;      
  4647.                 $date =  $3.'/'.$1.'/1';
  4648.                 $altereddate = &alterdate($date,  $checkID);
  4649.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
  4650.                 $longyear = $3;
  4651.                 $added = 1;
  4652.                 $lastEnd = $deidsend;
  4653.             }
  4654.             #elsif (($t =~ /$datephi8/)  && ($ky=~/$key/)){  
  4655.             elsif (($t =~ /$datephi8/)  && ($ky eq $key)){                 
  4656.                 $date = (substr $allText, $k, ($deidsend-$k));         
  4657.                 $date =~ /(\d+)(.)(\d+)/;      
  4658.                 $date =  $1.'/'.$3.'/1';
  4659.                 $altereddate = &alterdate($date,  $checkID);
  4660.                 $longyear = $1;
  4661.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";            
  4662.                 $added = 1;
  4663.                 $lastEnd = $deidsend;
  4664.             }
  4665.             #elsif (($t =~ /$datephi14/)  && ($ky=~/$key/)) {
  4666.             elsif (($t =~ /$datephi14/)  && ($ky eq $key)) {
  4667.                 $date = (substr $allText, $k, ($deidsend-$k));
  4668.                 $date =~ /(\d+)(\-)(\d+)(\-)(\d+)/;
  4669.                 $date = $1.'/'.$3.'/'.$5;
  4670.                 $altereddate = &alterdate($date,  $checkID);
  4671.                 $longyear = $1;
  4672.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."$date";
  4673.                 $added = 1;
  4674.                 $lastEnd = $deidsend;
  4675.             }
  4676.             #elsif (($t =~ /$datephi15/)  && ($ky=~/$key/)) {
  4677.             elsif (($t =~ /$datephi15/)  && ($ky eq $key)) {
  4678.                 $year  = (substr $allText, $k, ($deidsend-$k));
  4679.                 $date = $year.'/1/1';
  4680.                 $altereddate = &alterdate($date, $checkID);
  4681.                 $alteredyear = (substr $altereddate, 0, 4);
  4682.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
  4683.                 $added = 1;
  4684.                 $lastEnd = $deidsend;
  4685.             }  
  4686.             #elsif (($t =~ /$datephi16/)  && ($ky=~/$key/)) {
  4687.             elsif (($t =~ /$datephi16/)  && ($ky eq $key)) {
  4688.                 $year  = (substr $allText, $k, ($deidsend-$k));
  4689.  
  4690.                 #if ($year > 20) {
  4691.                 #   $year = '19'.$year;
  4692.                 #} else {
  4693.                 #   #$year = '20.'.$year;  
  4694.                 #   $year = '20'.$year;
  4695.                 #    }
  4696.              
  4697.                 $date = $year.'/1/1';
  4698.  
  4699.                 $altereddate = &alterdate($date, $checkID);
  4700.                 $alteredyear = (substr $altereddate, 2, 2);
  4701.                 print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
  4702.                 $added = 1;
  4703.                 $lastEnd = $deidsend;
  4704.             }
  4705.             # If the current PHI is not a date, then indicates that it has not been output yet
  4706.             else { my $added = 0;}         
  4707.             }
  4708.         }
  4709.         }    
  4710.        
  4711.         # If the PHI is not a date, replaces it in deid_text_file (.res) by its PHI type tag
  4712.         if ($added==0) {
  4713.        
  4714.         if ($k > $lastEnd || ($k==0)) {
  4715.             $phiText = (substr $allText, $k, ($deidsend-$k));
  4716.        
  4717.  
  4718.             # Parentheses are eliminated so that they do not trip up the run
  4719.             $phiText =~ s/\(//g;
  4720.             $phiText =~ s/\)//g;
  4721.             $phiText =~ s/\+//g;       
  4722.            
  4723.             $found = 0;
  4724.  
  4725.             # Assigns a unique ID to each PHI, e.g. all instances of "Mary" may be assigned "1", but "John" may be assigned "2"
  4726.             # %ID maps each PHI (e.g. "Mary") to its ID (e.g. "1")
  4727.             foreach $phik (keys %ID) {
  4728.             if ($phik =~/$phiText/ig) {
  4729.                 $found = 1;
  4730.             }
  4731.             }
  4732.  
  4733.             # If the current PHI to be added to .res file is already recorded in %ID, then retrieves its unique ID
  4734.             if ($found==1) {
  4735.             $phiID = $ID{$phiText};
  4736.             }
  4737.  
  4738.             # If the current PHI is not recorded in %ID, records it in %ID and assigns the PHI a unique ID
  4739.             else {
  4740.             $ID{$phiText} = keys(%ID) + 1;
  4741.             $phiID = $ID{$phiText};
  4742.             }
  4743.                        
  4744.             # Prints the PHI type and PHI ID in place of the original PHI in the .res file
  4745.             print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$phiType $phiID**]";
  4746.             $lastEnd = $deidsend;
  4747.         }
  4748.  
  4749.         else {
  4750.             if ($lastEnd < $deidsend) {
  4751.             $lastEnd = $deidsend;
  4752.             }
  4753.         }
  4754.        
  4755.         if ($lastEnd == 0) {
  4756.             $lastEnd = $lastlast;
  4757.         }
  4758.         }      
  4759.     }
  4760.     }
  4761.     #close OF;
  4762.     #print "Finished outputing to the .phi file.";
  4763.    
  4764.     # Prints the remaining non-PHI text to the .res file    
  4765.     print TF (substr $allText, $lastEnd);  
  4766.     #print "finished outputing to the .res file";
  4767.  
  4768.     close TF;  
  4769. }
  4770. # End of outputText()
  4771.  
  4772.  
  4773. #***********************************************************************************************************
  4774. #***********************************************************************************************************
  4775. #***********************************************************************************************************
  4776. # Function: alterdate()
  4777. # Arguments: string $date (yyyy/mm/dd), int $pid (patient ID)
  4778. # Returns: string $year-$month-$day
  4779. # Called by: outputText()
  4780. # Description: Converts given date to shifted date, depending on date-shift-mode, by calling doalterdate().
  4781. # Returns shifted date
  4782.  
  4783. sub alterdate {
  4784.     # Separates date fields by splitting along "/" or "-"
  4785.     # Then calls doalterdate on the resulting array of date elements
  4786.     $d = $_[0];  
  4787.     if (substr($d,0,6) =~ /(\/)/) {
  4788.     @d = split '/', $d;
  4789.     }
  4790.     elsif (substr($d,0,6) =~ /(\-)/) {
  4791.     @d = split '-', $d;
  4792.     }
  4793.  
  4794.     ($entryyear, $entrymonth)=@d[0..1];
  4795.     return join "-", &doalterdate(@d, $_[1]);
  4796. }
  4797. # End of alterdate()
  4798. #***********************************************************************************************************
  4799. #***********************************************************************************************************
  4800. #***********************************************************************************************************
  4801. # Function: doalterdate()
  4802. # Arguments: int $year, int $month, int $day, int $pid (patient ID)
  4803. # Returns: array @($year, $month, $day)
  4804. # Called by: alterdate()
  4805. # Description: Converts given date, depending on date-shift-mode, to shifted date.
  4806. # Returns shifted date in an array format to calling function alterdate(). alterdate() converts the shifted date to a string.
  4807.  
  4808. sub doalterdate {
  4809.     my $year=$_[0];
  4810.     if (length $year == 2) {
  4811.  
  4812.             # Limits 2-digit years between 1900 and 2020
  4813.         # Converts them to 4-digit years
  4814.         #if ($year<=10) {$year = 2000+$year;}  
  4815.         #if ($year<=20) {$year = 2000+$year;}
  4816.         if ($year<=$TWO_DIGIT_YEAR_THRESHOLD) {$year = 2000+$year;}
  4817.         else {$year = 1900+$year;}
  4818.     }
  4819.     my $month=$_[1];
  4820.     my $day=$_[2];
  4821.     my $pid=$_[3];          
  4822.    
  4823.     if ($pid_dateshift_list =~ /y/) {
  4824.         open SF, $date_shift_file or die "Cannot open $date_shift_file";
  4825.         while ($line = <SF>) {
  4826.         chomp $line;
  4827.         if ($line =~ /\A($pid)\|\|\|\|([0-9\-]+)/) {
  4828.             $offset = $2;
  4829.         }      
  4830.         }
  4831.         close SF;
  4832.     }
  4833.  
  4834.     my $ml=&monthlength($month, $year);
  4835.    
  4836.     # $offset = days of offset (positive or negative shift)
  4837.     # Sets the shifted year
  4838.     $offset_local = $offset;
  4839.     if ($offset_local>0) {
  4840.         $year += 4*int($offset_local/1461);
  4841.         $offset_local -=1461*int($offset_local/1461);
  4842.     }
  4843.     if ($offset_local<0){
  4844.         $year -= 4*int(-$offset_local/1461);
  4845.         $offset_local +=1461*int(-$offset_local/1461);
  4846.     }
  4847.    
  4848.     # Shifts number of days
  4849.     $day +=$offset_local;
  4850.    
  4851.     $ml=&monthlength($month, $year);
  4852.  
  4853.     # Changes $day, $month, $year based on the remaining offset after shifting $year
  4854.     while ($day>$ml) {
  4855.         $ml=&monthlength($month, $year);
  4856.         $day=$day-$ml;
  4857.         $month++;
  4858.         if ($month>12) {
  4859.         $month -=12;
  4860.         $year++;
  4861.         }
  4862.         $ml = &monthlength($month, $year);
  4863.     }
  4864.     while ($day<1) {
  4865.         $ml=&monthlength($month-1, $year);
  4866.         $day=$day + $ml;
  4867.         $month--;
  4868.         if ($month<1) {
  4869.         $month +=12;
  4870.         $year--;
  4871.         }
  4872.     }
  4873.    
  4874.     # Formats the output of single-digit month and day: "2" becomes "02"
  4875.     if (length($month)<2) {
  4876.         $month="0".$month;
  4877.     }
  4878.     if (length($day)<2) {
  4879.         $day="0".$day;
  4880.     }
  4881.    
  4882.     # Returns the shifted date as an array
  4883.     return ($year, $month, $day);    
  4884.     }
  4885. # End of doalterdate()
  4886. #***********************************************************************************************************
  4887. #***********************************************************************************************************
  4888. #***********************************************************************************************************
  4889. # Function: altermonthdate()
  4890. # Arguments: int $month, int $day, int $pid (patient ID)
  4891. # Returns: call doalterdate(), returns array ($year, $month, $day)
  4892. # Called by: None
  4893. # Description: Calls doalterdate() which performs dateshift
  4894. # This function is not used
  4895.  
  4896. sub altermonthdate {
  4897.     my ($month, $day, $pid)=@_;
  4898.     my $year;
  4899.    
  4900.     if (($month-$entrymonth)%12<6) {
  4901.     if ($month<$entrymonth) {
  4902.         $year=$entryyear+1;
  4903.     }
  4904.     else {
  4905.         $year=$entryyear;
  4906.     }
  4907.     }
  4908.     else {
  4909.     if ($month>$entrymonth) {
  4910.         $year=$entryyear-1;
  4911.     }
  4912.     else {
  4913.         $year=$entryear;
  4914.     }
  4915.     }
  4916.     return (&doalterdate($year, $month, $day, $pid))[1,2];    
  4917. }
  4918. # End of altermonthdate()
  4919. #***********************************************************************************************************
  4920. #***********************************************************************************************************
  4921. #***********************************************************************************************************
  4922. # Function: getcentury()
  4923. # Arguments: int 2-digit $year
  4924. # Returns: int
  4925. # Called by: None
  4926. # Description: Converts a 2-digit year into a 4-digit year based on entryyear
  4927. # This function is not used
  4928.  
  4929. sub getcentury {
  4930.     my $year=$_[0];
  4931.  
  4932.     if (($year-$entryyear)%100<10) {
  4933.     $year=$entryyear+(($year-$entryyear)%100);
  4934.     }
  4935.     else {
  4936.     $year=$entryyear-(($entryyear-$year)%100);
  4937.     }
  4938. }
  4939. # End of getcentury()
  4940. #***********************************************************************************************************
  4941. #***********************************************************************************************************
  4942. #***********************************************************************************************************
  4943. # Function: monthlength()
  4944. # Arguments: int $m, int $y
  4945. # Returns: int
  4946. # Called by: doalterdate()
  4947. # Description: Returns the number of days in given month-year
  4948.  
  4949. sub monthlength {
  4950.     my($m, $y)=@_;
  4951.  
  4952.     while ($m<=0) {
  4953.     $m += 12;
  4954.     $y --;
  4955.     }
  4956.     while ($m>=13) {
  4957.     $m -= 12;
  4958.     $y ++;
  4959.     }    
  4960.     # Checks for February
  4961.     if ($m==2) {
  4962.     if ($y % 4 ==0) {
  4963.         if($y % 100 ==0) {
  4964.         if ($y % 400 ==0){
  4965.             return 29;
  4966.         }
  4967.         else {
  4968.             return 28;
  4969.         }
  4970.         }
  4971.         else {     
  4972.         return 29;
  4973.         }
  4974.     }
  4975.     else {     
  4976.         return 28;
  4977.     }
  4978.     }
  4979.     # Checks for months consisting of 30 days    
  4980.     elsif (($m==4) || ($m==6) || ($m==9) || ($m==11)) {
  4981.     return 30;
  4982.     }
  4983.     # Checks for months consisting of 31 days    
  4984.     else {     
  4985.     return 31;
  4986.     }
  4987. }
  4988. # End of monthlength()
  4989.  
  4990.  
  4991.  
  4992.  
  4993.  
  4994.  
  4995.  
  4996.  
  4997.  
  4998. #***********************************************************************************************************
  4999. #***********************************************************************************************************
  5000. #***********************************************************************************************************
  5001. # Application-specific code follows. Code may contain patterns specific to our medical notes.
  5002. # Customize by replacing with your application-specific filters.
  5003. #***********************************************************************************************************
  5004. #***********************************************************************************************************
  5005. #***********************************************************************************************************
  5006.  
  5007.  
  5008. #***********************************************************************************************************
  5009. #***********************************************************************************************************
  5010. #***********************************************************************************************************
  5011. #Function: isProbablyPhone
  5012. #Argument: context string
  5013. #Returns: Always returns 1 (true) unless the context is one of the words defined in
  5014. #phone_pre_disqualifier. For future extensions, can add qualifier words
  5015. #such as "phone", "pager", etc.
  5016.  
  5017. sub isProbablyPhone {
  5018.     @phone_pre_disqualifier = ("HR","Heart", "BP", "SVR", "STV", "VT", "Tidal Volumes", "Tidal Volume", "TV", "CKS");
  5019.     $context = $_[0];
  5020.     foreach $i (@phone_pre_disqualifier) {
  5021.     if ($context =~ /\b$i\b/i){
  5022.         return 0;
  5023.     }
  5024.     }      
  5025.     return 1;
  5026. }
  5027. #end of isProbablyPhone()
  5028.  
  5029. #***********************************************************************************************************
  5030. #***********************************************************************************************************
  5031. #***********************************************************************************************************
  5032. # Function: wardname()
  5033. # Argument: string
  5034. # Searches for ward names specific to our hospital
  5035. sub wardname() {
  5036.     $text = $_[0];
  5037.    
  5038.     #Added to catch gs specific wardnames
  5039.     if ($gs_specific_filter =~ /y/){
  5040.        
  5041.     @ward_indicators = ("Quartermain");
  5042.     foreach $ward_ind (@ward_indicators){
  5043.         while ($text =~ /\b(($ward_ind) ?(\d))\b/ig){
  5044.         $wardname = $1;
  5045.         $start = length($`);
  5046.         $end = $start + length($wardname);
  5047.         $key = $start."-".$end;
  5048.         addType ($key, "Wardname");
  5049.         }
  5050.     }
  5051.     }
  5052.  
  5053. }
  5054.  
  5055. # End of function wardname()
  5056.  
  5057.  
  5058.  
  5059. #***********************************************************************************************************
  5060. #***********************************************************************************************************
  5061. #***********************************************************************************************************
  5062. #Function: isProbablyMeasurement
  5063. #Argument: context string
  5064. #Returns: Returns true if the text passed in contains (more specifically, ends with) any of the measurement indicators.
  5065. sub isProbablyMeasurement {
  5066.  @measurement_indicators_pre = ("increased to","decreased from","rose to","fell from", "down to",
  5067.                "increased from", "dropped to", "dec to", "changed to","remains on", "change to");
  5068.     $context = $_[0];
  5069.  
  5070.     foreach $i (@measurement_indicators_pre) {
  5071.  
  5072.     if ($context =~ /\b$i\b/i){  #only match if it ends with the phrase
  5073.         return 1;
  5074.     }
  5075.     }      
  5076.  
  5077.     return 0;
  5078. }
  5079.  
  5080. #end isProbablyMeasurement()
  5081.  
  5082. #***********************************************************************************************************
  5083. #***********************************************************************************************************
  5084. #***********************************************************************************************************
  5085. #Function: isProbablyDate
  5086. #Argument: textBefore string and textAfter string
  5087. #Returns: returns 1 if the context check determines that textBefore or textAfter
  5088. #are most likely strings preceding or following a date PHI; returns 0 otherwise.
  5089. #Description: Subroutine performs a context check on the textBefore and textAfter. If it contains one of the
  5090. #keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
  5091. #Called by dateWithContextCheck on partial dates MM-DD
  5092. sub isProbablyDate{
  5093.     my ($textBefore, $textAfter) = @_;
  5094.    
  5095.     if ((!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|dose|doses|range|ranged|pad|rate|bipap|pap|unload|ventilation|scale|cultures|blood|at|up|with|in|of|RR|ICP|CVP|strength|PSV|SVP|PCWP|PCW|BILAT|SRR|VENT|PEEP\/PS|flowby|drinks|stage) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|beers|per|esophagus|tabs|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns|units|amp|qd|chest pain|intensity)\b/i)) {
  5096.      return 1;
  5097.  }
  5098.  
  5099.  return 0;
  5100.  
  5101. }
  5102. #end isProbablyDate()
  5103.  
  5104. #***********************************************************************************************************
  5105. #***********************************************************************************************************
  5106. #***********************************************************************************************************
  5107. #Function: isProbablyDate2
  5108. #Argument: textBefore string and textAfter string
  5109. #Returns: returns 1 if the context check determines that textBefore or textAfter
  5110. #are most likely strings preceding or following a date PHI; returns 0 otherwise.
  5111. #Subroutine performs a context check on the textBefore and textAfter.  If it contains one of the
  5112. #keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
  5113. #Called by dateWithContextCheck on partial dates MM/DD.  
  5114. sub isProbablyDate2{
  5115.  
  5116. if ( (!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|range|ranged|pad|rate|rating|bipap|pap|unload|ventilation|scale|blood|at|up|with|RR|ICP|CVP|strength|PSV|SVP|PCWP|BILAT|SRR|VENT|PEEP\/PS|flowby) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|drinks|beers|per|esophagus|tabs|tab|tablet|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|drops|breaths|wbcs|beat|beats|ns|units|amp)\b/i)) {
  5117.      return 1;
  5118.  }
  5119.  return 0;
  5120.  
  5121.  
  5122. }
  5123. #end isProbablyDate2()
  5124.  
  5125. #***********************************************************************************************************
  5126. #***********************************************************************************************************
  5127. #Function: extractYear
  5128. #Argument: a date string in the format MM/DD/YYYY
  5129. #Returns: the 4-digit year if the date is in the correct format
  5130. #returns 0000 otherwise.
  5131. sub extractYear{
  5132.  
  5133.   $date = $_[0];
  5134.  
  5135.   if ($date =~ /\b(\d\d)\/(\d\d)\/(\d\d\d\d)\b/){
  5136.       $year = $3;
  5137.   } else{
  5138.       $year = 0000;
  5139.   }
  5140.   return $year;
  5141. }
  5142.  
  5143. #***********************************************************************************************************
  5144. #***********************************************************************************************************
  5145. #***********************************************************************************************************
  5146. #Function: dateWithContextCheck
  5147. #Argument: text string
  5148. #Returns: none
  5149. #Description: Find date patterns.  Performs context check on text before and after the date patterns
  5150. #to determine if its an actual date.
  5151. sub dateWithContextCheck{
  5152.     #$text = $_[0];    
  5153.     my ($text, $date) = @_;
  5154.     my $year = extractYear($date);
  5155.  
  5156.     #my $year = substr $date, 0, 4;  
  5157.     #**********************************************************************************************
  5158.     # Searches for numerical date formats
  5159.     # Checks if dates should be filtered
  5160.     if ($datefilter =~ /y/) {
  5161.    
  5162.     # Searches for mm/dd or mm/yy
  5163.     while ($text =~ /\b([A-Za-z0-9%\/]+ +)?(\d\d?)([\/\-])(\d\d?)\/?\/?( +[A-Za-z]+)?\b/g) {               
  5164.    
  5165.         $pre = $1;
  5166.         $post = $5;
  5167.         my $first_num = $2;
  5168.         my $divider = $3;
  5169.         my $second_num = $4;
  5170.         my $postdate = $5;
  5171.            
  5172.         my $startI = length($`) + length($pre);
  5173.         my $endI = $startI + length($first_num)+length($divider)+length($second_num);
  5174.         my $key = $startI."-".$endI;
  5175.        
  5176.         my $beginr = substr $text, ($startI - 2), 2;
  5177.         my $endr = substr $text, $endI, 2;
  5178.        
  5179.         #Excludes nn/nn formats when preceded by any in array @prev
  5180.         #@prev = ("cvp", "noc", "peep/ps","%");
  5181.         @prev = ("cvp", "noc", "%", "RR", "PCW");
  5182.         my $dateorno = 1;
  5183.         foreach $j (@prev) {
  5184.         if ((!($pre =~ /\b$j/ig)) && (!($post =~ /\bpersantine\b/ig))) {
  5185.         }
  5186.         else {
  5187.             $dateorno = 0;
  5188.         }
  5189.         }
  5190.  
  5191.         my $context_len = 12; #number of characters we extract before the date
  5192.  
  5193.         if ($dateorno == 1) {
  5194.         if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%]/) && ($endr !~ /\S\d/)) {
  5195.            
  5196.             # Checks if date identified is valid as mm/dd; then adds the date as PHI
  5197.             if (isValidDate ($first_num, $second_num, -1)) {
  5198.            
  5199.             if ($second_num == 5) {
  5200.                
  5201.                 my $start_pos = $startI - $context_len;
  5202.                 if ($start_pos < 0) {
  5203.                 $start_pos = 0;
  5204.                 }
  5205.                 my $len = $context_len;
  5206.                 if  (length($text) < ($endI + $context_len)){
  5207.                 $len = length($text) - $endI;
  5208.                 }
  5209.                    
  5210.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5211.                 my $textAfter = substr $text, $endI, $len;
  5212.  
  5213.                
  5214.                 if (  (!isProbablyMeasurement($textBefore)) &&   ($textBefore !~ /\bPSV? \Z/i) && ($textBefore !~ /\b(CPAP|PS|range|bipap|pap|pad|rate|unload|ventilation|scale|strength|drop|up|cc|rr|cvp|at|up|in|with|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?(packs|psv|puffs|pts|patients|range|scale|mls|liters|litres|drinks|beers|per|esophagus|tabs|pts|tablets|systolic|sem|strength|times|bottles|drop|drops|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|L|hr|hrs|hour|hours|dose|doses|cultures|blood|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
  5215.  
  5216.                 addType ($key, "Month/Day (1)");
  5217.                 }
  5218.            
  5219.             } elsif ($second_num == 2) {
  5220.            
  5221.                 my $start_pos = $startI - $context_len;
  5222.                 if ($start_pos < 0) {
  5223.                 $start_pos = 0;
  5224.                 }
  5225.                 my $len = $context_len;
  5226.                 if  (length ($text) < ($endI + $context_len)){
  5227.                 $len = length($text) - $endI;
  5228.                 }
  5229.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5230.                 my $textAfter = substr $text, $endI, $len;
  5231.  
  5232.                
  5233.                 if (   (!isProbablyMeasurement($textBefore)) &&   ($textAfter !~ /\A ?hour\b/i) && ($textBefore !~ /\b(with|drop|bipap|pap|range|pad|rate|unload|ventilation|scale|strength|up|cc|rr|cvp|at|up|with|in|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?hr\b/i) && ($textAfter !~ /\A ?(packs|L|psv|puffs|pts|patients|range|scale|dose|doses|cultures|blood|mls|liters|litres|pts|drinks|beers|per|esophagus|tabs|tablets|systolic|sem|strength|bottles|times|drop|cc|up|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
  5234.                
  5235.                
  5236.                 addType ($key, "Month/Day (2)");
  5237.                 }
  5238.             #} elsif (($divider eq "-") && ($startI > 4)) {
  5239.             } elsif (($divider eq "-")) {
  5240.                 my $start_pos = $startI - $context_len;
  5241.                 if ($start_pos < 0) {
  5242.                 $start_pos = 0;
  5243.                 }
  5244.                 my $len = $context_len;
  5245.                 if  (length ($text) < ($endI + $context_len)){
  5246.                 $len = length($text) - $endI;
  5247.                 }
  5248.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5249.                 my $textAfter = substr $text, $endI, $len;
  5250.                                
  5251.                 if (isProbablyDate($textBefore, $textAfter)){
  5252.                
  5253.                 addType ($key, "Month/Day (3)");
  5254.                 }
  5255.             }
  5256.             else {
  5257.  
  5258.                 my $start_pos = $startI - $context_len;
  5259.                 if ($start_pos < 0) {
  5260.                 $start_pos = 0;
  5261.                 }
  5262.                 my $len = $context_len;
  5263.                 if  (length ($text) < ($endI + $context_len)){
  5264.                 $len = length($text) - $endI;
  5265.                 }
  5266.                 my $textAfter = substr $text, $endI, $len;  
  5267.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5268.                            
  5269.                
  5270.                 if (isProbablyDate($textBefore, $textAfter)){  
  5271.                 addType ($key, "Month/Day (4)");
  5272.                 }
  5273.             }
  5274.             }
  5275.            
  5276.             # Checks if date identified is valid as mm/yy; then adds the date as PHI
  5277.                     # Checks for years of length 2, restricted to 1950-2030
  5278.             if (($first_num <= 12) && ($first_num > 0) && ((length $second_num) == 2)
  5279.             && (($second_num>=50) || ($second_num<=30))) {     
  5280.  
  5281.                 #my $textAfter = substr $text, $endI, 9;
  5282.                 my $start_pos = $startI - $context_len;
  5283.                 if ($start_pos < 0) {
  5284.                 $start_pos = 0;
  5285.                 }
  5286.                 my $len = $context_len;
  5287.                 if  (length ($text) < ($endI + $context_len)){
  5288.                 $len = length($text) - $endI;
  5289.                 }
  5290.                 my $textAfter = substr $text, $endI, $len;  
  5291.                 my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
  5292.                #  print "checking mm/yy text before is $textBefore, text after is $textAfter\n";
  5293.                 if (isProbablyDate($textBefore, $textAfter)){
  5294.                 addType ($key, "Month/Year (2)");  
  5295.                 }
  5296.                } #end if the first num and second num are month/year
  5297.  
  5298.         }
  5299.         }  #end if dateno ==1          
  5300.  
  5301.     } #end while the pattern match
  5302.     } # end if datefilter is on
  5303. }
  5304.  
  5305. #end dateWithContextCheck()
  5306.  
  5307. #***********************************************************************************************************
  5308. #***********************************************************************************************************
  5309. #***********************************************************************************************************
  5310. #Function: yearWithContextCheck
  5311. #Argument: text string
  5312. #Returns: none
  5313. #Description: Find date patterns.  Performs context check on text before and after the year patterns
  5314. #to determine if it is a year.
  5315. sub yearWithContextCheck {
  5316.     #$text = $_[0];    
  5317.     my ($text, $date) = @_;
  5318.     #my $year = substr $date, 0, 4;  
  5319.     my $year = extractYear($date);
  5320.  
  5321.  
  5322.     # Checks for 2-digit year written as '01, &c, only when preceded by the following medical terms
  5323.     while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTC|PTCA|stent|since|surgery|year) + *(\')?)(\d\d)\b/ig) {
  5324.         my $num = $1;
  5325.         #print "2-digit date, 1 is $1, 2 is $2, 3 is $3, 4 is $4, 5 is $5\n";
  5326.         #my $key = (1 + (length($`))+length($1))."-".(pos $text);
  5327.         my $startd =  (length($`))+length($1);
  5328.         my $endd = $startd + length($4);
  5329.         my $key =  $startd."-".$endd;
  5330.  
  5331.         #check if the match is part of number followed by decimal point and a number
  5332.         my $textAfter = substr $text, $endd, 2;
  5333.        
  5334.         if ($textAfter !~ /\.\d/){
  5335.         addType($key, "Year (2 digits)");
  5336.        
  5337.         }
  5338.      }
  5339.        
  5340.  
  5341.      while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|surgery|year)(\'+))(\d\d)\b/ig) {
  5342.         my $num = $1;
  5343.  
  5344.         my $startd =  (length($`))+length($1);
  5345.         my $endd = $startd + length($4);
  5346.         my $key =  $startd."-".$endd;
  5347.  
  5348.         #check if the match is part of number followed by decimal point and a number
  5349.         #or if it is a time (HH:MM)
  5350.         my $textAfter = substr $text, $endd, 2;
  5351.        
  5352.         if ($textAfter !~ /(\.|\:)\d/){
  5353.         addType($key, "Year (2 digits)");
  5354.         }
  5355.      }
  5356.  
  5357.      # Checks for 4-digit year written as 2001, &c, only when preceded by the following medical terms
  5358.      while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)(\d{4})((\,? )\d{4})?\b/ig) {
  5359.         my $num1 = $1;
  5360.         $s1 = length($`) + length($1);
  5361.         $e1 = $s1+length($3);
  5362.         $s2 = $e1+length($5);
  5363.         $e2 = $e1+length($4);
  5364.         $k1 = "$s1-$e1";
  5365.         $k2 = "$s2-$e2";
  5366.  
  5367.        
  5368.         #for 4-digit year, check if the matched number is in the range of [$VALID_YEAR_LOW,$VALID_YEAR_HIGH]
  5369.         #if ($3 <= 2030 && $3 >= 1950){
  5370.         if ($3 <= $VALID_YEAR_HIGH && $3 >= $VALID_YEAR_LOW){
  5371.             addType($k1, "Year (4 digits)");
  5372.             addType($k2, "Year (4 digits)");
  5373.             }
  5374.        }
  5375.        
  5376.       # Looks for year only (esp Patient Medical History): looks for year numbers within the 30 years before
  5377.       # and 2 years after the date passed in as an argument.
  5378.        
  5379.       # for $n (($year - 30) .. $year) {
  5380.       for $n (($year - 30) .. ($year+2)) {
  5381.         my $short = substr $n, 2, 2;
  5382.         if ($n =~ /\d\d\d\d/) {
  5383.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)$n\b/ig) {
  5384.             my $key = (length($`)+length($1))."-".(pos $text);
  5385.             addType ($key, "Year (4 digits)");
  5386.         }}
  5387.         if ($short =~ /\d\d/) {
  5388.         #while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year) + *(\'?))$short\b/ig) {
  5389.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year) + *(\'?))$short\b/ig) {
  5390.             my $key = (length($`)+length($1))."-".(pos $text);
  5391.             addType ($key, "Year (2 digits)");
  5392.         }}
  5393.         if ($short =~ /\d\d/) {
  5394.     #   while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year)(\'+))$short\b/ig) {
  5395.         while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year)(\'+))$short\b/ig) {
  5396.             my $key = (length($`)+length($1))."-".(pos $text);
  5397.             addType ($key, "Year (2 digits)");
  5398.         }}
  5399.      }
  5400.  
  5401. }
  5402.  
  5403. #end yearWithContextCheck()
  5404.  
  5405.  
  5406.  
  5407.  
  5408. #***********************************************************************************************************
  5409. #***********************************************************************************************************
  5410. #***********************************************************************************************************
  5411. # End of De-Identification Algorithm
  5412. #***********************************************************************************************************
  5413. #***********************************************************************************************************
  5414. #***********************************************************************************************************
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement