Advertisement
Guest User

Untitled

a guest
Apr 5th, 2020
185
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.11 KB | None | 0 0
  1. # Import libraries
  2.  
  3. import pyodbc
  4. import pandas as pd
  5. import re
  6. from sklearn.feature_extraction.text import TfidfVectorizer
  7.  
  8. # Create Connection
  9.  
  10. try:
  11.     conn = pyodbc.connect(driver='{SQL Server}',
  12.                           server='100.100.100.102',
  13.                           database='DWH_UMC',
  14.                           trusted_connection='yes')
  15.     print('Connection Successful')
  16.    
  17. except:
  18.     print('Something went WRONG')
  19.  
  20. # Import your Data Query into Pandas and Describe
  21.  
  22. #query = "SELECT Top 1000 Municipio FROM SolicitudesUMC where Municipio like '%,%'"
  23. query = "SELECT Municipio FROM SolicitudesUMC"
  24.  
  25.  
  26.  
  27. df = pd.read_sql_query(query,conn)
  28.  
  29. df.describe()
  30. conn.close()
  31.  
  32. df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
  33. df = df.apply(lambda x: x.str.upper() if x.dtype == "object" else x)
  34. df['A'], df['B'] = df['Municipio'].str.split(',', 1).str
  35. df['Municipio_Comma'] = df['A'].str.replace(",","")
  36. df.drop(['A','B'],axis=1,inplace=True)
  37. df
  38.  
  39. # All unique items to list
  40.  
  41. # Grab the column you'd like to group, filter out duplicate values
  42. # and make sure the values are Unicode
  43. vals = df['Municipio_Comma'].unique().astype('U')
  44.  
  45. # Ngrams Function
  46.  
  47. # Write a function for cleaning strings and returning an array of ngrams
  48. def ngrams_analyzer(string):
  49.     string = re.sub(r'[,-./]', r'', string)
  50.     ngrams = zip(*[string[i:] for i in range(2)])  # N-Gram length is 5
  51.     return [''.join(ngram) for ngram in ngrams]
  52.  
  53. # Construct your vectorizer for building the TF-IDF matrix
  54. vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
  55.  
  56. # Build the matrix!!!
  57. tfidf_matrix = vectorizer.fit_transform(vals)
  58.  
  59. # Import IGN's awesome_cossim_topn module
  60. from sparse_dot_topn import awesome_cossim_topn
  61.  
  62. # The arguments for awesome_cossim_topn are as follows:
  63. ### 1. Our TF-IDF matrix
  64. ### 2. Our TF-IDF matrix transposed (allowing us to build a pairwise cosine matrix)
  65. ### 3. A top_n filter, which allows us to filter the number of matches returned, which isn't useful for our purposes
  66. ### 4. This is our similarity threshold. Only values over 0.8 will be returned
  67. cosine_matrix = awesome_cossim_topn(
  68.   tfidf_matrix,
  69.   tfidf_matrix.transpose(),
  70.   vals.size,
  71.   0.7
  72. )
  73.  
  74. # Build a coordinate matrix from a cosine matrix
  75. coo_matrix = cosine_matrix.tocoo()
  76.  
  77. # Instaniate our lookup hash table
  78. group_lookup = {}
  79.  
  80.  
  81. def find_group(row, col):
  82.     # If either the row or the col string have already been given
  83.     # a group, return that group. Otherwise return none
  84.     if row in group_lookup:
  85.         return group_lookup[row]
  86.     elif col in group_lookup:
  87.         return group_lookup[col]
  88.     else:
  89.         return None
  90.  
  91.  
  92. def add_vals_to_lookup(group, row, col):
  93.     # Once we know the group name, set it as the value
  94.     # for both strings in the group_lookup
  95.     group_lookup[row] = group
  96.     group_lookup[col] = group
  97.  
  98.  
  99. def add_pair_to_lookup(row, col):
  100.     # in this function we'll add both the row and the col to the lookup
  101.     group = find_group(row, col)  # first, see if one has already been added
  102.     if group is not None:
  103.         # if we already know the group, make sure both row and col are in lookup
  104.         add_vals_to_lookup(group, row, col)
  105.     else:
  106.         # if we get here, we need to add a new group.
  107.         # The name is arbitrary, so just make it the row
  108.         add_vals_to_lookup(row, row, col)
  109.  
  110. # for each row and column in coo_matrix
  111. # if they're not the same string add them to the group lookup
  112. for row, col in zip(coo_matrix.row, coo_matrix.col):
  113.     if row != col:
  114.         # Note that what is passed to add_pair_to_lookup is the string at each index
  115.         # (eg: the names in the legal_name column) not the indices themselves
  116.         add_pair_to_lookup(vals[row], vals[col])
  117.  
  118. df['Municipio_H'] = df['Municipio_Comma'].map(group_lookup).fillna(df['Municipio_Comma'])
  119.  
  120. df['Municipio_H'] = df['Municipio_H'].str.replace(' NUEVO LEON','')
  121.  
  122. df['Municipio_H'] = df['Municipio_H'].apply(lambda x: 'Sin Información' if (len(x) <= 4 and x != 'MINA') else x)
  123.  
  124. df.to_csv('./SolicitudesUMC_Municipio_H.csv', encoding="utf-8-sig")
  125.  
  126. df.describe(include='all')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement