Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # coding: utf-8
- # # Import libraries
- # In[1]:
- import pyodbc
- import pandas as pd
- import re
- from sklearn.feature_extraction.text import TfidfVectorizer
- # # Create Connection
- # In[2]:
- try:
- conn = pyodbc.connect(driver='{SQL Server}',
- server='100.100.100.102',
- database='DWH_UMC',
- trusted_connection='yes')
- print('Connection Successful')
- except:
- print('Something went WRONG')
- # # Import your Data Query into Pandas and Describe
- # In[43]:
# Pull the Municipio column into pandas; close the connection even when the
# query raises, so the SQL Server session is never leaked.
query = "SELECT Municipio FROM SolicitudesUMC"
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()
df.describe()  # notebook-style summary; the value is discarded when run as a script
- # In[29]:
# Normalise every text column in one pass: trim surrounding whitespace and
# upper-case, so spacing/casing variants of the same name compare equal.
def _normalize_text(column):
    # Non-string columns pass through untouched.
    return column.str.strip().str.upper() if column.dtype == "object" else column

df = df.apply(_normalize_text)
- # # All unique items to list
- # In[44]:
# Take the column we want to group on, drop duplicate values, and make sure
# the resulting array holds Unicode strings (the vectorizer expects text).
unique_municipios = df['Municipio'].unique()
vals = unique_municipios.astype('U')
- # # Ngrams Function
- # In[31]:
- # Write a function for cleaning strings and returning an array of ngrams
def ngrams_analyzer(string, n=3):
    """Clean *string* and return the list of its character n-grams.

    Punctuation (',', '-', '.', '/') is removed first, so values such as
    "ST. LOUIS" and "ST LOUIS" produce overlapping grams.

    Parameters
    ----------
    string : str
        Raw value to analyze.
    n : int, optional
        N-gram length. Defaults to 3, preserving the original behaviour
        (the old inline comment claimed 5, but ``range(3)`` made trigrams).

    Returns
    -------
    list[str]
        Every contiguous n-character window of the cleaned string; empty
        when the cleaned string is shorter than ``n``.
    """
    string = re.sub(r'[,-./]', r'', string)
    # zip over n staggered copies of the string yields every n-char window.
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
- # In[32]:
- # Construct your vectorizer for building the TF-IDF matrix
# Vectorizer that represents each string by its character n-grams; TF-IDF
# down-weights grams that appear in almost every name.
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)

# Fit on the unique names and produce the sparse TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(vals)
- # In[33]:
- # Import IGN's awesome_cossim_topn module
from sparse_dot_topn import awesome_cossim_topn

# awesome_cossim_topn multiplies the TF-IDF matrix by its own transpose to
# obtain a sparse pairwise cosine-similarity matrix.  Arguments:
#   1. the TF-IDF matrix,
#   2. its transpose,
#   3. per-row top_n cap (vals.size, i.e. effectively uncapped here),
#   4. similarity threshold -- only scores above 0.8 are kept.
cosine_matrix = awesome_cossim_topn(
    tfidf_matrix, tfidf_matrix.transpose(), vals.size, 0.8
)
- # In[34]:
# COO (coordinate) format exposes parallel .row/.col index arrays, which
# makes iterating over the non-zero similarity pairs trivial.
coo_matrix = cosine_matrix.tocoo()

# Hash table mapping each raw string to its group's canonical name.
group_lookup = {}
def find_group(row, col):
    """Return the group already assigned to *row* or *col*, else None.

    *row* takes precedence when both strings are already known.
    """
    # dict.get returns the stored group whenever the key is present and only
    # falls back to the second lookup (and finally None) when it is not.
    return group_lookup.get(row, group_lookup.get(col))
def add_vals_to_lookup(group, row, col):
    """Record *group* as the canonical name for both *row* and *col*."""
    group_lookup.update({row: group, col: group})
def add_pair_to_lookup(row, col):
    """Register *row* and *col* as members of the same group.

    Reuses an existing group when either string is already known; otherwise
    starts a new group whose (arbitrary) canonical name is *row*.
    """
    existing = find_group(row, col)
    group = row if existing is None else existing
    add_vals_to_lookup(group, row, col)
# Walk every above-threshold similarity pair.  Diagonal entries compare a
# string with itself and are skipped.  Note that what goes into the lookup
# is the *string* at each index, not the index itself.
for row_idx, col_idx in zip(coo_matrix.row, coo_matrix.col):
    if row_idx == col_idx:
        continue
    add_pair_to_lookup(vals[row_idx], vals[col_idx])
- # In[40]:
# Canonical ("homogenised") name per row: the group name when the raw value
# belongs to a group, otherwise the raw value itself.
mapped = df['Municipio'].map(group_lookup)
df['Municipio_H'] = mapped.where(mapped.notna(), df['Municipio'])

# Persist the result next to the script.
df.to_csv('./SolicitudesUMC_Municipio_H.csv', encoding="utf-8")

df.describe(include='all')
- # In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement