Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Import libraries
- import pyodbc
- import pandas as pd
- import re
- from sklearn.feature_extraction.text import TfidfVectorizer
# --- Create connection and pull the raw data -------------------------------
# Connect to the warehouse; fail loudly instead of masking the error.
# (The old bare `except:` printed a message and fell through, after which the
# script crashed anyway with a NameError on `conn`.)
try:
    conn = pyodbc.connect(
        driver='{SQL Server}',
        server='100.100.100.102',
        database='DWH_UMC',
        trusted_connection='yes',
    )
    print('Connection Successful')
except pyodbc.Error:
    print('Something went WRONG')
    raise  # without a connection the rest of the script cannot run

# Import the Municipio column into pandas; always release the connection,
# even if the read fails.
query = "SELECT Municipio FROM SolicitudesUMC"
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()
df.describe()
# --- Normalise the free-text Municipio values ------------------------------
# Trim surrounding whitespace and upper-case every object (string) column.
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
df = df.apply(lambda col: col.str.upper() if col.dtype == "object" else col)

# Keep only the text before the first comma
# (e.g. "MONTERREY, NUEVO LEON" -> "MONTERREY").
# This replaces the removed-in-modern-pandas `split(',', 1).str` unpacking
# plus the temporary A/B columns and the no-op comma replace: the segment
# before the first comma can never itself contain a comma.
df['Municipio_Comma'] = df['Municipio'].str.split(',', n=1).str[0]

# Grab the column we want to group, drop duplicates, and make sure the
# values are Unicode for the vectorizer.
vals = df['Municipio_Comma'].unique().astype('U')
- # Ngrams Function
- # Write a function for cleaning strings and returning an array of ngrams
def ngrams_analyzer(string, n=2):
    """Clean *string* and return its character n-grams.

    Commas, hyphens, periods and slashes are stripped first so that
    variants such as "N.L." and "NL" yield the same grams.

    Args:
        string: Text to decompose.
        n: Gram length. Defaults to 2 (bigrams) — note the original
           comment claimed length 5, but ``range(2)`` always built bigrams.

    Returns:
        List of ``n``-character substrings; empty when the cleaned string
        is shorter than ``n``.
    """
    # Hyphen escaped explicitly: the old `[,-./]` relied on the ','..'.'
    # ordinal range happening to include '-'.
    cleaned = re.sub(r'[,\-./]', '', string)
    # zip over n shifted copies yields every window of length n.
    grams = zip(*[cleaned[i:] for i in range(n)])
    return [''.join(gram) for gram in grams]
# --- Build the TF-IDF matrix and pairwise cosine similarities --------------
# Vectorizer over character bigrams of the unique municipality strings.
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
tfidf_matrix = vectorizer.fit_transform(vals)

# sparse_dot_topn multiplies sparse matrices and keeps only the strongest
# entries, giving a memory-friendly pairwise cosine-similarity matrix.
# NOTE(review): ideally this import lives at the top of the file.
from sparse_dot_topn import awesome_cossim_topn

# Arguments to awesome_cossim_topn:
#   1. the TF-IDF matrix
#   2. its transpose (yields the pairwise cosine matrix)
#   3. top_n filter — vals.size keeps every match, we don't want to cut any
#   4. similarity threshold — only pairs scoring above 0.7 are kept
#      (the original comment said 0.8, contradicting the code; 0.7 is what runs)
cosine_matrix = awesome_cossim_topn(
    tfidf_matrix,
    tfidf_matrix.transpose(),
    vals.size,
    0.7,
)

# Coordinate (COO) format exposes the (row, col) index pairs directly.
coo_matrix = cosine_matrix.tocoo()
# Hash table mapping each string to the canonical name of its group.
group_lookup = {}


def find_group(row, col):
    """Return the group already assigned to *row* or *col*, or None.

    *row* wins when both strings are already known.
    """
    if row in group_lookup:
        return group_lookup[row]
    if col in group_lookup:
        return group_lookup[col]
    return None


def add_vals_to_lookup(group, row, col):
    """Record *group* as the canonical name for both strings."""
    group_lookup[row] = group_lookup[col] = group


def add_pair_to_lookup(row, col):
    """Put *row* and *col* into the same group.

    Reuses an existing group when either string is already known;
    otherwise starts a new group arbitrarily named after *row*.
    """
    existing = find_group(row, col)
    add_vals_to_lookup(existing if existing is not None else row, row, col)
# --- Apply the grouping and write the result -------------------------------
# Walk every similar pair in the sparse matrix. Equal indices are the
# diagonal (self-similarity) and carry no grouping information.
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        # Pass the actual strings at each index, not the indices themselves.
        add_pair_to_lookup(vals[row], vals[col])

# Map every cleaned municipality to its canonical group name; strings that
# never matched anything simply keep their own value.
df['Municipio_H'] = df['Municipio_Comma'].map(group_lookup).fillna(df['Municipio_Comma'])

# Strip the state suffix, then collapse short leftovers (4 chars or fewer,
# except the real municipality 'MINA') into a placeholder.
df['Municipio_H'] = df['Municipio_H'].str.replace(' NUEVO LEON', '')
df['Municipio_H'] = df['Municipio_H'].apply(
    lambda v: v if len(v) > 4 or v == 'MINA' else 'Sin Información'
)

df.to_csv('./SolicitudesUMC_Municipio_H.csv', encoding="utf-8-sig")
df.describe(include='all')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement