Advertisement
Guest User

Untitled

a guest
Apr 4th, 2020
194
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.92 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3.  
  4. # # Import libraries
  5.  
  6. # In[1]:
  7.  
  8.  
  9. import pyodbc
  10. import pandas as pd
  11. import re
  12. from sklearn.feature_extraction.text import TfidfVectorizer
  13.  
  14.  
  15. # # Create Connection
  16.  
  17. # In[2]:
  18.  
  19.  
  20. try:
  21.     conn = pyodbc.connect(driver='{SQL Server}',
  22.                           server='100.100.100.102',
  23.                           database='DWH_UMC',
  24.                           trusted_connection='yes')
  25.     print('Connection Successful')
  26.    
  27. except:
  28.     print('Something went WRONG')
  29.  
  30.  
  31. # # Import your Data Query into Pandas and Describe
  32.  
  33. # In[43]:
  34.  
  35.  
  36. query = "SELECT Municipio FROM SolicitudesUMC"
  37.  
  38. df = pd.read_sql_query(query,conn)
  39.  
  40. df.describe()
  41. conn.close()
  42.  
  43.  
  44. # In[29]:
  45.  
  46.  
  47. df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
  48. df = df.apply(lambda x: x.str.upper() if x.dtype == "object" else x)
  49.  
  50.  
  51. # # All unique items to list
  52.  
  53. # In[44]:
  54.  
  55.  
  56. # Grab the column you'd like to group, filter out duplicate values
  57. # and make sure the values are Unicode
  58. vals = df['Municipio'].unique().astype('U')
  59.  
  60.  
  61. # # Ngrams Function
  62.  
  63. # In[31]:
  64.  
  65.  
  66. # Write a function for cleaning strings and returning an array of ngrams
  67. def ngrams_analyzer(string):
  68.     string = re.sub(r'[,-./]', r'', string)
  69.     ngrams = zip(*[string[i:] for i in range(3)])  # N-Gram length is 5
  70.     return [''.join(ngram) for ngram in ngrams]
  71.  
  72.  
  73. # In[32]:
  74.  
  75.  
  76. # Construct your vectorizer for building the TF-IDF matrix
  77. vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
  78.  
  79. # Build the matrix!!!
  80. tfidf_matrix = vectorizer.fit_transform(vals)
  81.  
  82.  
  83. # In[33]:
  84.  
  85.  
  86. # Import IGN's awesome_cossim_topn module
  87. from sparse_dot_topn import awesome_cossim_topn
  88.  
  89. # The arguments for awesome_cossim_topn are as follows:
  90. ### 1. Our TF-IDF matrix
  91. ### 2. Our TF-IDF matrix transposed (allowing us to build a pairwise cosine matrix)
  92. ### 3. A top_n filter, which allows us to filter the number of matches returned, which isn't useful for our purposes
  93. ### 4. This is our similarity threshold. Only values over 0.8 will be returned
  94. cosine_matrix = awesome_cossim_topn(
  95.   tfidf_matrix,
  96.   tfidf_matrix.transpose(),
  97.   vals.size,
  98.   0.8
  99. )
  100.  
  101.  
  102. # In[34]:
  103.  
  104.  
  105. # Build a coordinate matrix from a cosine matrix
  106. coo_matrix = cosine_matrix.tocoo()
  107.  
  108. # Instaniate our lookup hash table
  109. group_lookup = {}
  110.  
  111.  
  112. def find_group(row, col):
  113.     # If either the row or the col string have already been given
  114.     # a group, return that group. Otherwise return none
  115.     if row in group_lookup:
  116.         return group_lookup[row]
  117.     elif col in group_lookup:
  118.         return group_lookup[col]
  119.     else:
  120.         return None
  121.  
  122.  
  123. def add_vals_to_lookup(group, row, col):
  124.     # Once we know the group name, set it as the value
  125.     # for both strings in the group_lookup
  126.     group_lookup[row] = group
  127.     group_lookup[col] = group
  128.  
  129.  
  130. def add_pair_to_lookup(row, col):
  131.     # in this function we'll add both the row and the col to the lookup
  132.     group = find_group(row, col)  # first, see if one has already been added
  133.     if group is not None:
  134.         # if we already know the group, make sure both row and col are in lookup
  135.         add_vals_to_lookup(group, row, col)
  136.     else:
  137.         # if we get here, we need to add a new group.
  138.         # The name is arbitrary, so just make it the row
  139.         add_vals_to_lookup(row, row, col)
  140.  
  141. # for each row and column in coo_matrix
  142. # if they're not the same string add them to the group lookup
  143. for row, col in zip(coo_matrix.row, coo_matrix.col):
  144.     if row != col:
  145.         # Note that what is passed to add_pair_to_lookup is the string at each index
  146.         # (eg: the names in the legal_name column) not the indices themselves
  147.         add_pair_to_lookup(vals[row], vals[col])
  148.  
  149.  
  150. # In[40]:
  151.  
  152.  
  153. df['Municipio_H'] = df['Municipio'].map(group_lookup).fillna(df['Municipio'])
  154.  
  155. df.to_csv('./SolicitudesUMC_Municipio_H.csv', encoding="utf-8")
  156.  
  157.  
  158. # In[36]:
  159.  
  160.  
  161. df.describe(include='all')
  162.  
  163.  
  164. # In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement