Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Import libraries
- import pyodbc
- import pandas as pd
- import re
- from sklearn.feature_extraction.text import TfidfVectorizer
# --- Create connection and pull the raw data -------------------------------
# Connect to the warehouse; fail loudly instead of masking the error.
# (The old bare `except:` printed a message and fell through, after which the
# script crashed anyway with a NameError on `conn`.)
try:
    conn = pyodbc.connect(
        driver='{SQL Server}',
        server='100.100.100.102',
        database='DWH_UMC',
        trusted_connection='yes',
    )
    print('Connection Successful')
except pyodbc.Error:
    print('Something went WRONG')
    raise  # without a connection the rest of the script cannot run

# Import the Municipio column into pandas; always release the connection,
# even if the read fails.
query = "SELECT Municipio FROM SolicitudesUMC"
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()
df.describe()
# --- Normalise the free-text Municipio values ------------------------------
# Trim surrounding whitespace and upper-case every object (string) column.
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
df = df.apply(lambda col: col.str.upper() if col.dtype == "object" else col)

# Keep only the text before the first comma
# (e.g. "MONTERREY, NUEVO LEON" -> "MONTERREY").
# This replaces the removed-in-modern-pandas `split(',', 1).str` unpacking
# plus the temporary A/B columns and the no-op comma replace: the segment
# before the first comma can never itself contain a comma.
df['Municipio_Comma'] = df['Municipio'].str.split(',', n=1).str[0]

# Grab the column we want to group, drop duplicates, and make sure the
# values are Unicode for the vectorizer.
vals = df['Municipio_Comma'].unique().astype('U')
- # Ngrams Function
- # Write a function for cleaning strings and returning an array of ngrams
def ngrams_analyzer(string, n=2):
    """Clean *string* and return its character n-grams.

    Commas, hyphens, periods and slashes are stripped first so that
    variants such as "N.L." and "NL" yield the same grams.

    Args:
        string: Text to decompose.
        n: Gram length. Defaults to 2 (bigrams) — note the original
           comment claimed length 5, but ``range(2)`` always built bigrams.

    Returns:
        List of ``n``-character substrings; empty when the cleaned string
        is shorter than ``n``.
    """
    # Hyphen escaped explicitly: the old `[,-./]` relied on the ','..'.'
    # ordinal range happening to include '-'.
    cleaned = re.sub(r'[,\-./]', '', string)
    # zip over n shifted copies yields every window of length n.
    grams = zip(*[cleaned[i:] for i in range(n)])
    return [''.join(gram) for gram in grams]
# --- Build the TF-IDF matrix and pairwise cosine similarities --------------
# Vectorizer over character bigrams of the unique municipality strings.
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
tfidf_matrix = vectorizer.fit_transform(vals)

# sparse_dot_topn multiplies sparse matrices and keeps only the strongest
# entries, giving a memory-friendly pairwise cosine-similarity matrix.
# NOTE(review): ideally this import lives at the top of the file.
from sparse_dot_topn import awesome_cossim_topn

# Arguments to awesome_cossim_topn:
#   1. the TF-IDF matrix
#   2. its transpose (yields the pairwise cosine matrix)
#   3. top_n filter — vals.size keeps every match, we don't want to cut any
#   4. similarity threshold — only pairs scoring above 0.7 are kept
#      (the original comment said 0.8, contradicting the code; 0.7 is what runs)
cosine_matrix = awesome_cossim_topn(
    tfidf_matrix,
    tfidf_matrix.transpose(),
    vals.size,
    0.7,
)

# Coordinate (COO) format exposes the (row, col) index pairs directly.
coo_matrix = cosine_matrix.tocoo()
# Hash table mapping each string to the canonical name of its group.
group_lookup = {}


def find_group(row, col):
    """Return the group already assigned to *row* or *col*, or None.

    *row* wins when both strings are already known.
    """
    if row in group_lookup:
        return group_lookup[row]
    if col in group_lookup:
        return group_lookup[col]
    return None


def add_vals_to_lookup(group, row, col):
    """Record *group* as the canonical name for both strings."""
    group_lookup[row] = group_lookup[col] = group


def add_pair_to_lookup(row, col):
    """Put *row* and *col* into the same group.

    Reuses an existing group when either string is already known;
    otherwise starts a new group arbitrarily named after *row*.
    """
    existing = find_group(row, col)
    add_vals_to_lookup(existing if existing is not None else row, row, col)
# --- Apply the grouping and write the result -------------------------------
# Walk every similar pair in the sparse matrix. Equal indices are the
# diagonal (self-similarity) and carry no grouping information.
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        # Pass the actual strings at each index, not the indices themselves.
        add_pair_to_lookup(vals[row], vals[col])

# Map every cleaned municipality to its canonical group name; strings that
# never matched anything simply keep their own value.
df['Municipio_H'] = df['Municipio_Comma'].map(group_lookup).fillna(df['Municipio_Comma'])

# Strip the state suffix, then collapse short leftovers (4 chars or fewer,
# except the real municipality 'MINA') into a placeholder.
df['Municipio_H'] = df['Municipio_H'].str.replace(' NUEVO LEON', '')
df['Municipio_H'] = df['Municipio_H'].apply(
    lambda v: v if len(v) > 4 or v == 'MINA' else 'Sin Información'
)

df.to_csv('./SolicitudesUMC_Municipio_H.csv', encoding="utf-8-sig")
df.describe(include='all')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement