Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import pandas as pd

# Load the job postings. pd.read_csv already returns a DataFrame, so the
# original pd.DataFrame(f1) re-wrapping was redundant and has been removed.
df = pd.read_csv('/home/kopal/Desktop/jobs.csv')
df.head()

# Spot-check specific rows: the job with id 13, and all Mumbai-based jobs.
df.loc[df['id'] == 13]
df.loc[df['location'] == 'Mumbai']

# Load the candidate details.
df1 = pd.read_csv('/home/kopal/Desktop/candidate_details.csv')
df1.head()

# Keep only the candidate columns relevant for matching.
# NOTE(review): 'relevantExperiance' is presumably the CSV's own (misspelled)
# column name — confirm against the file before "fixing" it.
df2 = df1[['candidateName', 'skillSet', 'relevantExperiance',
           'qualification', 'location', 'industryType']]
df2

# Work with the first 1000 candidates only.
a = df2.head(1000)
a

# Keep only the job columns relevant for matching.
df3 = df[['job_title', 'keyskills', 'location']]
df3
df3.loc[df['id'] == 13]
import requests
import matplotlib.pyplot as plt  # visualization
import nltk  # NLP
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer  # machine learning
import re, math
from collections import Counter
import numpy as np

# Sentiment blobs for one sample job's skills and location (row index 4).
blob = TextBlob(df3.keyskills[4], analyzer=NaiveBayesAnalyzer())
blob1 = TextBlob(df3.location[4], analyzer=NaiveBayesAnalyzer())

# Candidate-side frame: each candidate's skills and preferred location.
df4 = pd.DataFrame()
df4['skills'] = a.skillSet
df4['location_prefer'] = a.location
df4

# Reference texts from job row 4. df3's column is lowercase 'keyskills'
# (see its construction from df), so the original 'df3.keySkills' raised
# AttributeError — fixed here.
text1 = df3.keyskills[4]
text4 = df3.location[4]
class Similarity():
    """Text-similarity helpers: cosine similarity over word-count vectors,
    and Jaccard similarity over token collections."""

    def compute_cosine_similarity(self, string1, string2):
        """Cosine similarity between two word-frequency mappings
        (e.g. Counters), rounded to 4 decimal places.

        Returns 0.0 when either vector is empty (zero denominator).
        """
        # Words common to both vectors contribute to the dot product.
        intersection = set(string1.keys()) & set(string2.keys())
        numerator = sum(string1[x] * string2[x] for x in intersection)
        # Euclidean norms of each vector.
        sum1 = sum(v ** 2 for v in string1.values())
        sum2 = sum(v ** 2 for v in string2.values())
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        return round(numerator / float(denominator), 4)

    def text_to_vector(self, text):
        """Convert text to a word-frequency Counter.

        The original pattern r'w+' (backslash lost in the paste) matched
        runs of the letter 'w' only; r'\w+' matches whole words as intended.
        """
        WORD = re.compile(r'\w+')
        words = WORD.findall(text)
        return Counter(words)

    def tokenize(self, string):
        """Lowercase, whitespace-split token list."""
        return string.lower().split(" ")

    def jaccard_similarity(self, string1, string2):
        """Jaccard similarity: |intersection| / |union| of the token sets."""
        intersection = set(string1).intersection(set(string2))
        union = set(string1).union(set(string2))
        return len(intersection) / float(len(union))
# Cosine similarity between the reference job's skills (text1) and each
# candidate's skill set, in the same row order as `a`.
# The original re-created a Similarity instance and re-vectorized text1 on
# every iteration, and computed tokenizations it never used — hoisted/removed.
similarity = Similarity()
vector1 = similarity.text_to_vector(text1)
cosine = [
    similarity.compute_cosine_similarity(vector1, similarity.text_to_vector(text2))
    for text2 in a.skillSet
]
class Similarity():
    """Re-declaration of Similarity used by the location loop below.

    The pasted re-definition dropped tokenize(), yet the location loop still
    calls similarity1.tokenize(...) — that raised AttributeError at runtime.
    tokenize() is restored here so the following code works.
    """

    def compute_cosine_similarity(self, string1, string2):
        """Cosine similarity between two word-frequency mappings,
        rounded to 4 decimal places; 0.0 if either vector is empty."""
        # Dot product over the shared words.
        intersection = set(string1.keys()) & set(string2.keys())
        numerator = sum(string1[x] * string2[x] for x in intersection)
        # Euclidean norms.
        sum1 = sum(v ** 2 for v in string1.values())
        sum2 = sum(v ** 2 for v in string2.values())
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        return round(numerator / float(denominator), 4)

    def text_to_vector(self, text):
        """Word-frequency Counter for *text*. r'\w+' matches words; the
        pasted r'w+' (lost backslash) only matched runs of 'w'."""
        WORD = re.compile(r'\w+')
        words = WORD.findall(text)
        return Counter(words)

    def tokenize(self, string):
        """Lowercase, whitespace-split token list (called by the location loop)."""
        return string.lower().split(" ")
# Cosine similarity between the reference job's location (text4) and each
# candidate's preferred location.
# BUG FIX: the original iterated str(a.location), i.e. the individual
# *characters* of the Series' printed representation, producing per-character
# scores misaligned with the rows of `a`. Iterate the actual values instead.
similarity1 = Similarity()
vector4 = similarity1.text_to_vector(text4)
cosine1 = []
for text3 in a.location:
    # str() guards against non-string cells (e.g. NaN) in the column.
    vector3 = similarity1.text_to_vector(str(text3))
    cosine1.append(similarity1.compute_cosine_similarity(vector4, vector3))
print(str(a.location))
# Attach the two similarity scores to the candidate frame.
df4['similarity_for_skills'] = cosine
se = pd.Series(cosine1)
df4['similarity_for_location'] = se
df4

# Bring the candidate names back alongside their scores, then rank the
# candidates by skill similarity with the best matches first.
df5 = pd.concat([a.candidateName, df4], axis=1)
result = df5.sort_values('similarity_for_skills', ascending=False)
df6 = pd.DataFrame(result)
df6
Add Comment
Please sign in to add a comment.