Advertisement
oshkoshbagoshh

resume / pdf parser

Feb 28th, 2018
349
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.97 KB | None | 0 0
  1. """
  2. The script parses PDF and DOCX files (predominantly resumes) and extracts the relevant information from them.
  3. The extracted information is stored in a Django model; this can be replaced to suit your needs.
  4. """
  5. __author__ = "ssharad"
  6. __license__ = "GPL v.3.0"
  7. # -*- coding: utf-8 -*-
  8. import pyPdf
  9. import docx
  10. import string
  11.  
  12. #Extract text from PDF
  13. def getPDFContent(path):
  14.     content = ""
  15.     # Load PDF into pyPDF
  16.     pdf = pyPdf.PdfFileReader(file(path, "rb"))
  17.     # Iterate pages
  18.     for i in range(0, pdf.getNumPages()):
  19.         # Extract text from page and add to content
  20.         content += pdf.getPage(i).extractText() + "\n"
  21.     # Collapse whitespace
  22.     content = " ".join(content.replace(u"\xa0", " ").strip().split())
  23.     return content
  24.  
  25. #Extract text from DOCX
  26. def getText(filename):
  27.     doc = docx.Document(filename)
  28.     fullText = ""
  29.     for para in doc.paragraphs:
  30.         fullText += para.text
  31.     return fullText
  32.  
  33. #To store extracted resumes
  34. resume = ""
  35. #Select a path to the file - code needs os.path #to be addded
  36. filename = raw_input("Enter file name / path : ")
  37. #Invoking document parsers based on file format
  38. #Note: for TXT - do a normal f.read()
  39. if filename.endswith(".pdf"):
  40.     resume = getPDFContent(filename).encode("ascii", "ignore")
  41. elif filename.endswith(".docx"):
  42.      resume = getText(filename).encode("ascii", "ignore")  
  43. else:
  44.     print "File format is currently not supported"
  45.     exit(0)
  46.  
  47. print "processing..... \nplease wait...."
  48. #Importing NLTK for stopword removal and tokenizing
  49. from nltk.tokenize import word_tokenize
  50. from nltk.corpus import stopwords
  51.  
  52. #Tokenizing/ Filtering the resume off stopwords and punctuations
  53. print "tokenizing the given file ......"
  54. tokens = word_tokenize(resume)
  55. punctuations = ['(',')',';',':','[',']',',']
  56. stop_words = stopwords.words('english')
  57. #storing the cleaned resume
  58. filtered = [w for w in tokens if not w in stop_words and  not w in string.punctuation]
  59. print "removing the stop words....\nCleaning the resumes....\nExtracting Text ......."
  60. print filtered
  61. #get the name from the resume
  62. name  = str(filtered[0])+' ' +str(filtered[1])
  63. print "Name : " + name
  64.  
  65. #using regular expressions we extract phone numbers and mail ids
  66. import re
  67. #get contact info - from resume
  68. #email
  69. email = ""
  70. match_mail = re.search(r'[\w\.-]+@[\w\.-]+', resume)
  71. #handling the cases when mobile number is not given
  72. if(match_mail != None):
  73.     email = match_mail.group(0)
  74. print "Email : " + email
  75.  
  76. #mobile number
  77. mobile = ""
  78. match_mobile = re.search(r'((?:\(?\+91\)?)?\d{9})',resume)
  79. #handling the cases when mobile number is not given
  80. if(match_mobile != None):
  81.     mobile = match_mobile.group(0)
  82. print "Mobile : " +  mobile
  83.  
  84. parsed_resume = ' '.join(filtered)
  85. print "Parsed Resume in plain Text : ", parsed_resume
  86. r = str(parsed_resume)
  87.  
  88. #shingles - for eeach parsed resume
  89. shingle = []
  90. # form n-grams - basically the singles for LSH
  91. from nltk.util import ngrams
  92. #form the shingles of the filtered resume - the length of each shingle is 10
  93. make_shingle = ngrams(filtered,10)
  94. #print the shingles
  95. for s in make_shingle:
  96.     shingle.append(s)  
  97.  
  98. print "Shingles for the resume : ",shingle
  99. #save the name and contact details in separate fields - the parsed resume in anohter field
  100. # the parsed information is stored in a database (Django Model)
  101. import django
  102. #configure the Django envronment to the location of your app
  103. import os
  104. import sys
  105. sys.path.append('/home/sharad/resumes/')
  106. os.environ['DJANGO_SETTINGS_MODULE']='resumes.settings'
  107. django.setup()
  108. #os.environ.setdefault(“DJANGO_SETTINGS_MODULE”, “resumes.settings”)
  109. from django.conf import settings
  110. #Edit the django model
  111. from view_db.models import parsed_resume
  112. #add the new entries to the table
  113. r = parsed_resume(name = name,email = email, mobile = mobile, parsed_resume = r, shingles = shingle)
  114. #commit the changes
  115. r.save()
  116.  
  117. From <https://raw.githubusercontent.com/lovelyk0910/resume-parser-in-python/master/parser.py>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement