Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # coding: utf-8
- # # Import libraries
- # In[1]:
- import pyodbc
- import pandas as pd
- import re
- from sklearn.feature_extraction.text import TfidfVectorizer
- # # Create Connection
- # In[2]:
- try:
- conn = pyodbc.connect(driver='{SQL Server}',
- server='100.100.100.102',
- database='DWH_UMC',
- trusted_connection='yes')
- print('Connection Successful')
- except:
- print('Something went WRONG')
- # # Import your Data Query into Pandas and Describe
- # In[43]:
# Pull the Municipio column into pandas; close the connection even when the
# query raises, so the SQL Server session is never leaked.
query = "SELECT Municipio FROM SolicitudesUMC"
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()
df.describe()  # notebook-style summary; the value is discarded when run as a script
- # In[29]:
# Normalise every text column in one pass: trim surrounding whitespace and
# upper-case, so spacing/casing variants of the same name compare equal.
def _normalize_text(column):
    # Non-string columns pass through untouched.
    return column.str.strip().str.upper() if column.dtype == "object" else column

df = df.apply(_normalize_text)
- # # All unique items to list
- # In[44]:
# Take the column we want to group on, drop duplicate values, and make sure
# the resulting array holds Unicode strings (the vectorizer expects text).
unique_municipios = df['Municipio'].unique()
vals = unique_municipios.astype('U')
- # # Ngrams Function
- # In[31]:
- # Write a function for cleaning strings and returning an array of ngrams
def ngrams_analyzer(string, n=3):
    """Clean *string* and return the list of its character n-grams.

    Punctuation (',', '-', '.', '/') is removed first, so values such as
    "ST. LOUIS" and "ST LOUIS" produce overlapping grams.

    Parameters
    ----------
    string : str
        Raw value to analyze.
    n : int, optional
        N-gram length. Defaults to 3, preserving the original behaviour
        (the old inline comment claimed 5, but ``range(3)`` made trigrams).

    Returns
    -------
    list[str]
        Every contiguous n-character window of the cleaned string; empty
        when the cleaned string is shorter than ``n``.
    """
    string = re.sub(r'[,-./]', r'', string)
    # zip over n staggered copies of the string yields every n-char window.
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
- # In[32]:
- # Construct your vectorizer for building the TF-IDF matrix
# Vectorizer that represents each string by its character n-grams; TF-IDF
# down-weights grams that appear in almost every name.
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)

# Fit on the unique names and produce the sparse TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(vals)
- # In[33]:
- # Import IGN's awesome_cossim_topn module
from sparse_dot_topn import awesome_cossim_topn

# awesome_cossim_topn multiplies the TF-IDF matrix by its own transpose to
# obtain a sparse pairwise cosine-similarity matrix.  Arguments:
#   1. the TF-IDF matrix,
#   2. its transpose,
#   3. per-row top_n cap (vals.size, i.e. effectively uncapped here),
#   4. similarity threshold -- only scores above 0.8 are kept.
cosine_matrix = awesome_cossim_topn(
    tfidf_matrix, tfidf_matrix.transpose(), vals.size, 0.8
)
- # In[34]:
# COO (coordinate) format exposes parallel .row/.col index arrays, which
# makes iterating over the non-zero similarity pairs trivial.
coo_matrix = cosine_matrix.tocoo()

# Hash table mapping each raw string to its group's canonical name.
group_lookup = {}
def find_group(row, col):
    """Return the group already assigned to *row* or *col*, else None.

    *row* takes precedence when both strings are already known.
    """
    # dict.get returns the stored group whenever the key is present and only
    # falls back to the second lookup (and finally None) when it is not.
    return group_lookup.get(row, group_lookup.get(col))
def add_vals_to_lookup(group, row, col):
    """Record *group* as the canonical name for both *row* and *col*."""
    group_lookup.update({row: group, col: group})
def add_pair_to_lookup(row, col):
    """Register *row* and *col* as members of the same group.

    Reuses an existing group when either string is already known; otherwise
    starts a new group whose (arbitrary) canonical name is *row*.
    """
    existing = find_group(row, col)
    group = row if existing is None else existing
    add_vals_to_lookup(group, row, col)
# Walk every above-threshold similarity pair.  Diagonal entries compare a
# string with itself and are skipped.  Note that what goes into the lookup
# is the *string* at each index, not the index itself.
for row_idx, col_idx in zip(coo_matrix.row, coo_matrix.col):
    if row_idx == col_idx:
        continue
    add_pair_to_lookup(vals[row_idx], vals[col_idx])
- # In[40]:
# Canonical ("homogenised") name per row: the group name when the raw value
# belongs to a group, otherwise the raw value itself.
mapped = df['Municipio'].map(group_lookup)
df['Municipio_H'] = mapped.where(mapped.notna(), df['Municipio'])

# Persist the result next to the script.
df.to_csv('./SolicitudesUMC_Municipio_H.csv', encoding="utf-8")

df.describe(include='all')
- # In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement