Advertisement
Guest User

parser

a guest
Nov 20th, 2019
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 24.03 KB | None | 0 0
  1. import csv
  2. import datetime
  3. import unicodedata
  4.  
  5. from django.http import HttpResponse
  6.  
  7. import dateparser
  8.  
  9.  
  10. class LinkedinScraping:
  11.  
  12.     def __init__(self, csv_file=None, accepted_profiles=None, required_skills=None,
  13.                  nice_to_haves=None, last_workplace=None, location=None):
  14.         if required_skills is None:
  15.             required_skills = []
  16.         if nice_to_haves is None:
  17.             nice_to_haves = []
  18.         if last_workplace is None:
  19.             last_workplace = []
  20.         if accepted_profiles is None:
  21.             self.accepted_profiles = []
  22.         else:
  23.             self.accepted_profiles = accepted_profiles
  24.  
  25.         if required_skills and not isinstance(required_skills, list):
  26.             required_skills = [required_skills]
  27.         if nice_to_haves and not isinstance(nice_to_haves, list):
  28.             nice_to_haves = [nice_to_haves]
  29.         if last_workplace and not isinstance(last_workplace, list):
  30.             last_workplace = [last_workplace]
  31.  
  32.         self.csv_file = csv_file
  33.         self.required_skills = required_skills
  34.         self.nice_to_haves = nice_to_haves
  35.         self.last_workplace = last_workplace
  36.         self.location = location
  37.  
  38.         self.mapping = {
  39.             'cpp': 'c++'}
  40.         self.keywords = [
  41.             "programista", "inżynier oprogramowania", "projektant oprogramowania", "developer",
  42.             "engineer", "programmer", "software developer", "software engineer"]
  43.  
  44.         self.profile_scores = {}
  45.  
  46.     def check_profiles_for_skills(self, operator=None):
  47.         required_skills = self.required_skills
  48.         mapping = self.mapping
  49.         keywords = self.keywords
  50.         csv_file = self.csv_file
  51.  
  52.         csv_reader = csv.DictReader(csv_file, delimiter=',')
  53.         for row in csv_reader:
  54.             # PARSES THE 'SKILLS' SECTION AND LISTS 5 MOST ENDORSED SKILLS TO CHECK AGAINST.
  55.             most_endorsed_skills = []
  56.             if "Skills" in row.keys() and row["Skills"]:
  57.                 comma_split_skills = row["Skills"].lower().strip().strip('""').strip().split(",")
  58.  
  59.                 partially_parsed_skills = []
  60.                 incomplete_skill = ''
  61.                 for skill in comma_split_skills:
  62.                     skill = skill.strip().strip('""').strip()
  63.                     if skill:
  64.                         if skill[-1].isalpha():
  65.                             incomplete_skill += skill + ', '
  66.                         elif skill[-1].isdigit():
  67.                             if skill[-2].isalpha():
  68.                                 incomplete_skill += skill + ', '
  69.                             else:
  70.                                 full_skill_name = skill
  71.                                 if incomplete_skill:
  72.                                     full_skill_name = incomplete_skill + skill
  73.                                 partially_parsed_skills.append(full_skill_name)
  74.                                 incomplete_skill = ''
  75.  
  76.                 skills_and_endorsements_parsed = []
  77.                 for string in partially_parsed_skills:
  78.                     string = string.strip('""').strip()
  79.                     if string:
  80.                         string_split = string.split(" : ")
  81.                         skill = string_split[0].strip('""').strip()
  82.                         endorsements = int(string_split[1].strip('""').strip())
  83.                         skills_and_endorsements_parsed.append(dict(skill=skill, endorsements=endorsements))
  84.                 most_endorsed_skills = sorted(
  85.                     skills_and_endorsements_parsed, key=lambda k: k['endorsements'], reverse=True)[:5]
  86.  
  87.             # CHECKS PROFILE SECTIONS FOR THE REQUIRED SKILLS
  88.             incomplete_job_experience_info = False
  89.             has_skills = []
  90.             for required_skill in required_skills:
  91.                 # SEPARATES THE SKILL FROM EXPERIENCE (IF PROVIDED).
  92.                 skill_and_experience = required_skill.split('_')
  93.                 required_skill = skill_and_experience[0].strip().lower()
  94.                 required_experience = 0
  95.                 if len(skill_and_experience) == 2:
  96.                     required_experience = float(skill_and_experience[1])
  97.  
  98.                 # CHECKS SKILL MAPPING
  99.                 if required_skill in mapping.keys():
  100.                     required_skill = mapping[required_skill]
  101.  
  102.                 # CHECKS THE 5 MOST ENDORSED SKILLS FOR THE SKILL.
  103.                 skill_in_skills_section = False
  104.                 if most_endorsed_skills:
  105.                     for endorsed_skill in most_endorsed_skills:
  106.                         if required_skill in endorsed_skill["skill"]:
  107.                             skill_in_skills_section = True
  108.  
  109.                 # CHECKS IF THE SKILL CAN BE FOUND IN A 'ORGANIZATION TITLE' OR 'ORGANIZATION DESCRIPTION',
  110.                 # AND CHECKS IF THE EXPERIENCE IS SUFFICIENT.
  111.                 numeration_of_jobs = []
  112.                 for key, value in row.items():
  113.                     if 'Organization Title' in key or 'Organization Description' in key:
  114.                         number = ''.join(filter(lambda x: x.isdigit(), key))
  115.                         if number not in numeration_of_jobs:
  116.                             numeration_of_jobs.append(number)
  117.                 numeration_of_jobs.sort()
  118.  
  119.                 numeration_of_matching_jobs = []
  120.                 for number in numeration_of_jobs:
  121.                     job_title = row["Organization Title {}".format(number)].lower()
  122.                     job_description = row["Organization Description {}".format(number)].lower()
  123.                     if (required_skill in job_title) or (required_skill in job_description):
  124.                         if number not in numeration_of_matching_jobs:
  125.                             numeration_of_matching_jobs.append(number)
  126.                     elif ((any(kw in job_title for kw in keywords) or any(kw in job_description for kw in keywords))
  127.                             and skill_in_skills_section is True):
  128.                         if number not in numeration_of_matching_jobs:
  129.                             incomplete_job_experience_info = True
  130.                             numeration_of_matching_jobs.append(number)
  131.                 numeration_of_matching_jobs.sort()
  132.  
  133.                 organization_start_end = []
  134.                 for number in numeration_of_matching_jobs:
  135.                     organization_start_end.append(
  136.                         dict(
  137.                             start=row['Organization Start {}'.format(number)].lower(),
  138.                             end=row['Organization End {}'.format(number)].lower()
  139.                         )
  140.                     )
  141.  
  142.                 timedelta_objects = []
  143.                 for start_end_pair in organization_start_end:
  144.                     start_date = start_end_pair['start']
  145.                     end_date = start_end_pair['end']
  146.                     try:
  147.                         formatted_start = dateparser.parse(start_date, date_formats=["%b %Y"], languages=['en', 'pl'])
  148.                     except ValueError:
  149.                         formatted_start = dateparser.parse(start_date, date_formats=["%Y"], languages=['en', 'pl'])
  150.                     if end_date == 'present' or end_date == 'obecnie':
  151.                         present_day = datetime.datetime.today()
  152.                         formatted_end = present_day
  153.                     elif end_date == 'less than a year' or end_date == 'mniej niż rok':
  154.                         formatted_end = datetime.datetime(formatted_start.year, 12, formatted_start.day, 0, 0)
  155.                     else:
  156.                         try:
  157.                             formatted_end = dateparser.parse(end_date, date_formats=["%b %Y"], languages=['en', 'pl'])
  158.                         except ValueError:
  159.                             formatted_end = dateparser.parse(end_date, date_formats=["%Y"], languages=['en', 'pl'])
  160.                     timedelta_objects.append(formatted_end - formatted_start)
  161.  
  162.                 total_skill_worktime = 0
  163.                 for timedelta_object in timedelta_objects:
  164.                     days = timedelta_object.days
  165.                     total_skill_worktime += days
  166.                 total_skill_worktime = total_skill_worktime / 365
  167.  
  168.                 if total_skill_worktime >= required_experience:
  169.                     has_skills.append(True)
  170.                 else:
  171.                     has_skills.append(False)
  172.  
  173.             if incomplete_job_experience_info is False:
  174.                 row["Incomplete job experience information"] = "False"
  175.             else:
  176.                 row["Incomplete job experience information"] = "True"
  177.  
  178.             if row not in self.accepted_profiles:
  179.                 if all(has_skills) and operator == 'AND':
  180.                     self.accepted_profiles.append(row)
  181.                 elif len(has_skills) == 1 and has_skills[0] is True:
  182.                     self.accepted_profiles.append(row)
  183.                 elif any(has_skills) and operator == 'OR':
  184.                     self.accepted_profiles.append(row)
  185.         return self.accepted_profiles
  186.  
  187.     def check_profiles_for_nice_to_haves(self):
  188.         nice_to_haves = self.nice_to_haves
  189.         mapping = self.mapping
  190.         keywords = self.keywords
  191.         csv_file = self.csv_file
  192.         csv_reader = csv.DictReader(csv_file, delimiter=',')
  193.  
  194.         for row in csv_reader:
  195.             results = {"summary": [],
  196.                        "title/description": [],
  197.                        "skills": []}
  198.             incomplete_title_or_description = False
  199.  
  200.             # PARSES THE 'SKILLS' SECTION AND LISTS 5 MOST ENDORSED SKILLS TO CHECK AGAINST
  201.             most_endorsed_skills = []
  202.             if row["Skills"]:
  203.                 comma_split_skills = row["Skills"].lower().strip().strip('""').strip().split(",")
  204.  
  205.                 partially_parsed_skills = []
  206.                 incomplete_skill = ''
  207.                 for skill in comma_split_skills:
  208.                     skill = skill.strip().strip('""').strip()
  209.                     if skill:
  210.                         if skill[-1].isalpha():
  211.                             incomplete_skill += skill + ', '
  212.                         elif skill[-1].isdigit():
  213.                             if skill[-2].isalpha():
  214.                                 incomplete_skill += skill + ', '
  215.                             else:
  216.                                 full_skill_name = skill
  217.                                 if incomplete_skill:
  218.                                     full_skill_name = incomplete_skill + skill
  219.                                 partially_parsed_skills.append(full_skill_name)
  220.                                 incomplete_skill = ''
  221.  
  222.                 skills_and_endorsements_parsed = []
  223.                 for string in partially_parsed_skills:
  224.                     string = string.strip('""').strip()
  225.                     if string:
  226.                         string_split = string.split(" : ")
  227.                         skill = string_split[0].strip('""').strip()
  228.                         endorsements = int(string_split[1].strip('""').strip())
  229.                         skills_and_endorsements_parsed.append(dict(skill=skill, endorsements=endorsements))
  230.                 most_endorsed_skills = sorted(
  231.                     skills_and_endorsements_parsed, key=lambda k: k['endorsements'], reverse=True)[:5]
  232.  
  233.             for nice_to_have in nice_to_haves:
  234.                 # SEPARATES THE SKILL FROM EXPERIENCE (IF PROVIDED).
  235.                 skill_and_experience = nice_to_have.split(':')
  236.                 nice_to_have = skill_and_experience[0].strip().lower()
  237.                 required_experience = 0
  238.                 if len(skill_and_experience) == 2:
  239.                     required_experience = float(skill_and_experience[1])
  240.  
  241.                 # SKILL MAPPING
  242.                 if nice_to_have in mapping.keys():
  243.                     nice_to_have = mapping[nice_to_have]
  244.  
  245.                 # CHECKS THE 5 MOST ENDORSED SKILLS FOR THE SKILL.
  246.                 if most_endorsed_skills:
  247.                     for endorsed_skill in most_endorsed_skills:
  248.                         if nice_to_have in endorsed_skill["skill"].lower():
  249.                             results["skills"].append(nice_to_have)
  250.                             break
  251.  
  252.                 # CHECKS 'SUMMARY' FOR THE SKILL.
  253.                 for key, value in row.items():
  254.                     if 'Summary' in key:
  255.                         if nice_to_have in value.lower():
  256.                             results["summary"].append(nice_to_have)
  257.  
  258.                 # CHECKS IF THE SKILL CAN BE FOUND IN A 'ORGANIZATION TITLE' OR 'ORGANIZATION DESCRIPTION',
  259.                 # AND CHECKS IF THE EXPERIENCE IS SUFFICIENT.
  260.                 numeration_of_jobs = []
  261.                 for key, value in row.items():
  262.                     if 'Organization Title' in key or 'Organization Description' in key:
  263.                         number = ''.join(filter(lambda x: x.isdigit(), key))
  264.                         if number not in numeration_of_jobs:
  265.                             numeration_of_jobs.append(number)
  266.                 numeration_of_jobs.sort()
  267.  
  268.                 numeration_of_matching_jobs = []
  269.                 for number in numeration_of_jobs:
  270.                     title = row["Organization Title {}".format(number)]
  271.                     description = row["Organization Description {}".format(number)]
  272.                     if (nice_to_have in title or nice_to_have in description):
  273.                         if number not in numeration_of_matching_jobs:
  274.                             numeration_of_matching_jobs.append(number)
  275.                     elif ((any(kw in title for kw in keywords) or any(kw in description for kw in keywords))
  276.                             and nice_to_have in results["skills"]):
  277.                         if number not in numeration_of_matching_jobs:
  278.                             incomplete_title_or_description = True
  279.                             numeration_of_matching_jobs.append(number)
  280.                 numeration_of_matching_jobs.sort()
  281.  
  282.                 # numeration_of_matching_jobs = []
  283.                 # for key, value in row.items():
  284.                 #     if 'Organization Title' in key or 'Organization Description' in key:
  285.                 #         number = ''.join(filter(lambda x: x.isdigit(), key))
  286.                 #         if nice_to_have in value.lower() and number not in numeration_of_matching_jobs:
  287.                 #             numeration_of_matching_jobs.append(number)
  288.                 #         elif any(kw in value.lower() for kw in keywords) and nice_to_have in results["skills"]:
  289.                 #             if ("Organization Title" in key
  290.                 #                     and nice_to_have in row["Organization Description {}".format(number)]):
  291.                 #                 pass
  292.                 #             elif ("Organization Description" in key
  293.                 #                     and nice_to_have in row["Organization Title {}".format(number)]):
  294.                 #                 pass
  295.                 #             else:
  296.                 #                 incomplete_title_or_description = True
  297.                 #                 numeration_of_matching_jobs.append(number)
  298.                 # numeration_of_matching_jobs.sort()
  299.  
  300.                 organization_start_end = []
  301.                 for number in numeration_of_matching_jobs:
  302.                     organization_start_end.append(dict(
  303.                         start=row['Organization Start {}'.format(number)].lower(),
  304.                         end=row['Organization End {}'.format(number)].lower()))
  305.  
  306.                 timedelta_objects = []
  307.                 for start_end_pair in organization_start_end:
  308.                     start_date = start_end_pair['start']
  309.                     end_date = start_end_pair['end']
  310.                     try:
  311.                         formatted_start = dateparser.parse(start_date, date_formats=["%b %Y"], languages=['en', 'pl'])
  312.                     except ValueError:
  313.                         formatted_start = dateparser.parse(start_date, date_formats=["%Y"], languages=['en', 'pl'])
  314.                     if end_date == 'present' or end_date == 'obecnie':
  315.                         formatted_end = datetime.datetime.today()
  316.                     elif end_date == 'less than a year' or end_date == 'mniej niż rok':
  317.                         formatted_end = datetime.datetime(formatted_start.year, 12, formatted_start.day, 0, 0)
  318.                     else:
  319.                         try:
  320.                             formatted_end = dateparser.parse(end_date, date_formats=["%b %Y"], languages=['en', 'pl'])
  321.                         except ValueError:
  322.                             formatted_end = dateparser.parse(end_date, date_formats=["%Y"], languages=['en', 'pl'])
  323.                     try:
  324.                         timedelta_objects.append(formatted_end - formatted_start)
  325.                     except TypeError:
  326.                         print("TypeError")
  327.  
  328.                 total_skill_worktime = 0
  329.                 for timedelta_object in timedelta_objects:
  330.                     total_skill_worktime += timedelta_object.days
  331.                 total_skill_worktime = total_skill_worktime / 365
  332.  
  333.                 if total_skill_worktime >= required_experience:
  334.                     results["title/description"].append(nice_to_have)
  335.  
  336.             # TAKES RESULTS AND CONVERTS THEM TO PERCENTAGE
  337.             results_in_percents = {}
  338.             for section_name, found_skills in results.items():
  339.                 if len(found_skills) == 0:
  340.                     percentage = 0
  341.                     results_in_percents[section_name] = percentage
  342.                 else:
  343.                     percentage = 100 * float(len(found_skills)) / float(len(nice_to_haves))
  344.                     results_in_percents[section_name] = percentage
  345.  
  346.             # CALCULATES PROFILE MATCH PERCENTAGE AND ADDS IT AS A FIELD TO ROW (PROFILE).
  347.             skills = 3 * results_in_percents["skills"]
  348.             summary = 1 * results_in_percents["summary"]
  349.             title_description = 6 * results_in_percents["title/description"]
  350.             weighted_average_divisor = 10
  351.             if incomplete_title_or_description is True:
  352.                 title_description = 4 * results_in_percents["title/description"]
  353.                 weighted_average_divisor = 8
  354.             profile_match_percentage = (summary + title_description + skills) / weighted_average_divisor
  355.             row["Profile match percentage"] = profile_match_percentage
  356.  
  357.             self.accepted_profiles.append(row)
  358.  
  359.     def check_profiles_for_last_workplace(self):
  360.         last_workplace = self.last_workplace
  361.         csv_file = self.csv_file
  362.  
  363.         csv_reader = csv.DictReader(csv_file, delimiter=',')
  364.  
  365.         for row in csv_reader:
  366.             matching_organization_numbers = []
  367.             for workplace in last_workplace:
  368.                 workplace = workplace.strip()
  369.                 for key, value in row.items():
  370.                     correct_key = key.startswith('Organization') and len(key) <= 15 and key[-1].isdigit()
  371.                     correct_workplace = workplace.lower() in value.lower()
  372.                     if correct_key and correct_workplace:
  373.                         number = ''.join(filter(lambda x: x.isdigit(), key))
  374.                         if number not in matching_organization_numbers:
  375.                             matching_organization_numbers.append(number)
  376.             matching_organization_numbers.sort()
  377.  
  378.             present_workplace_numbers = []
  379.             for number in matching_organization_numbers:
  380.                 organization_end = row['Organization End {}'.format(number)].lower()
  381.                 if organization_end == 'present' or organization_end == 'obecnie':
  382.                     present_workplace_numbers.append(number)
  383.  
  384.             if not present_workplace_numbers:
  385.                 self.accepted_profiles.append(row)
  386.         return self.accepted_profiles
  387.  
  388.     def check_profiles_for_location(self):
  389.         csv_file = self.csv_file
  390.  
  391.         csv_reader = csv.DictReader(csv_file, delimiter=',')
  392.  
  393.         for row in csv_reader:
  394.             location = unicodedata.normalize('NFKD', self.location).encode('ASCII', 'ignore').decode("utf-8")
  395.             if location.lower() in row['Location'].lower():
  396.                 self.accepted_profiles.append(row)
  397.         return self.accepted_profiles
  398.  
  399.     def generate_csv(self):
  400.         response = HttpResponse(content_type='text/csv')
  401.         response['Content-Disposition'] = 'attachment; filename="accepted_profiles.csv"'
  402.  
  403.         # fieldnames = [
  404.         #     "id", "Full name", "Email", "Profile url", "First name", "Last name", "Title", "Avatar",
  405.         #     "Location", "Address", "Birthday", "Summary", "Twitter", "Phone 1", "Phone 1 type", "Phone 2",
  406.         #     "Phone 2 type", "Phone 3", "Phone 3 type", "Messenger 1", "Messenger 1 type", "Messenger 2",
  407.         #     "Messenger 2 type", "Messenger 3", "Messenger 3 type", "Website 1", "Website 2", "Website 3",
  408.         #     "Organization 1", "Organization Title 1", "Organization Start 1", "Organization End 1",
  409.         #     "Organization Description 1", "Organization Location 1", "Organization LI URL 1", "Organization LI ID 1",
  410.         #     "Organization WWW 1", "Organization Domain 1", "Organization 2", "Organization Title 2",
  411.         #     "Organization Start 2", "Organization End 2", "Organization Description 2", "Organization Location 2",
  412.         #     "Organization LI URL 2", "Organization LI ID 2", "Organization WWW 2", "Organization Domain 2",
  413.         #     "Organization 3", "Organization Title 3", "Organization Start 3", "Organization End 3",
  414.         #     "Organization Description 3", "Organization Location 3", "Organization LI URL 3", "Organization LI ID 3",
  415.         #     "Organization WWW 3", "Organization Domain 3", "Organization 4", "Organization Title 4",
  416.         #     "Organization Start 4", "Organization End 4", "Organization Description 4", "Organization Location 4",
  417.         #     "Organization LI URL 4", "Organization LI ID 4", "Organization WWW 4", "Organization Domain 4",
  418.         #     "Organization 5", "Organization Title 5", "Organization Start 5", "Organization End 5",
  419.         #     "Organization Description 5", "Organization Location 5", "Organization LI URL 5", "Organization LI ID 5",
  420.         #     "Organization WWW 5", "Organization Domain 5", "Organization 6", "Organization Title 6",
  421.         #     "Organization Start 6", "Organization End 6", "Organization Description 6", "Organization Location 6",
  422.         #     "Organization LI URL 6", "Organization LI ID 6", "Organization WWW 6", "Organization Domain 6",
  423.         #     "Organization 7", "Organization Title 7", "Organization Start 7", "Organization End 7",
  424.         #     "Organization Description 7", "Organization Location 7", "Organization LI URL 7", "Organization LI ID 7",
  425.         #     "Organization WWW 7", "Organization Domain 7", "Education 1", "Education Degree 1",
  426.         #     "Education FOS 1", "Education Grade 1", "Education Start 1", "Education End 1", "Education Description 1",
  427.         #     "Education 2", "Education Degree 2", "Education FOS 2", "Education Grade 2", "Education Start 2",
  428.         #     "Education End 2", "Education Description 2", "Education 3", "Education Degree 3", "Education FOS 3",
  429.         #     "Education Grade 3", "Education Start 3", "Education End 3", "Education Description 3", "Skills",
  430.         #     "Followers", "Relationship", "Connected at", "Industry", "Mutual Count", "Mutual", "Mutual 1",
  431.         #     "Mutual 2", "Interests"]
  432.  
  433.         fieldnames = []
  434.         for profile in self.accepted_profiles:
  435.             for key in profile.keys():
  436.                 if key not in fieldnames:
  437.                     fieldnames.append(key)
  438.  
  439.         for profile in self.accepted_profiles:
  440.             profile_keys = []
  441.             for key in profile.keys():
  442.                 if key not in profile_keys:
  443.                     profile_keys.append(key)
  444.             if sorted(profile_keys) != sorted(fieldnames):
  445.                 print("your code is shit")
  446.  
  447.         csv_writer = csv.DictWriter(response, fieldnames=fieldnames, delimiter=",")
  448.         csv_writer.writeheader()
  449.         for profile in self.accepted_profiles:
  450.             csv_writer.writerow(profile)
  451.  
  452.         return response
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement