Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/declass/env/bin/python
- # -*- coding: utf-8 -*-
- import os
- import sys
- import pymysql
- import pandas as pd
- import ConfigParser
- import datetime
- import codecs
- from tqdm import tqdm
- from pandas.api.types import is_string_dtype
- # get config credentials from .cnf file
- config = ConfigParser.RawConfigParser()
- config.readfp(open(r'../../common/config/.mylogin.cnf'))
- db_user = config.get('client', 'user')
- db_pass = config.get('client', 'password')
- # connect to declassification_frus_update and declassification_frus
- declassification_frus_update = pymysql.connect(host='localhost', user=db_user, passwd=db_pass, db='declassification_frus_update', charset='utf8')
- declassification_frus = pymysql.connect(host='localhost', user=db_user, passwd=db_pass, db='declassification_frus', charset='utf8')
- tables=['authorship','classification_countries','classification_doc','classification_persons',
- 'classifications','countries','country_doc','doc_counts','docs','person_doc','persons',
- 'refs','term_doc','terms','top_classifications','top_countries','top_persons','top_topics',
- 'topic_doc','topics','volumes']
- # tqdm load progress bar
- print "Reading tables from declassification_frus and declassification_frus_update as pandas dataframes..."
- for table in tqdm(tables):
- # make table_update dataframe for each table from delcassification_frus_update
- exec(table + "_update = pd.read_sql('SELECT * FROM ' + table + ';', con=declassification_frus_update)")
- # make table_prod dataframe for each table from delcassification_frus
- exec(table + "_prod = pd.read_sql('SELECT * FROM ' + table + ';', con=declassification_frus)")
- # methods
- def peep_changes():
- cols = ['table','production db rowcount','update db rowcount','% difference']
- differential_df = pd.DataFrame(columns=cols)
- for table in tables:
- exec("update_count = float(len(" + table + "_update.index))")
- exec("prod_count = float(len(" + table + "_prod.index))")
- difference = update_count - prod_count
- if difference != 0:
- difference = (difference / prod_count) * 100
- temp_df = pd.DataFrame([["`" + table + "`", prod_count, update_count, difference]], columns=cols)
- differential_df = differential_df.append(temp_df, ignore_index=True)
- differential_df['production db rowcount'] = differential_df['production db rowcount'].astype(int)
- differential_df['update db rowcount'] = differential_df['update db rowcount'].astype(int)
- differential_df['% difference'] = differential_df['% difference'].round()
- return differential_df
- def to_markdown_table(df):
- fmt = ['---' for i in range(len(df.columns))]
- df_fmt = pd.DataFrame([fmt], columns=df.columns)
- df_formatted = pd.concat([df_fmt, df])
- return df_formatted.to_csv(sep="|", index=False, encoding='utf-8', quotechar="*")
- def peep_update_tables():
- all_tables_string = "### `declassification_fus_update` preview\n\n"
- for table in tables:
- exec('table_head =' + table + '_update.head()')
- all_tables_string += "#### `" + table + "`\n\n" + to_markdown_table(table_head) + "\n\n"
- return all_tables_string
- # clean out carriage returns on docs and volumes
- docs_update = docs_update.replace('\\n','',regex=True)
- volumes_update = volumes_update.replace('\\n','',regex=True)
- # truncate colums with long XML string values
- def truncate_string_cols(df):
- for col in df.columns:
- if is_string_dtype(df[col]):
- df[col] = df[col].str[:300]
- truncate_string_cols(docs_update)
- truncate_string_cols(volumes_update)
- truncate_string_cols(persons_update)
- # get changes as md table
- changes = to_markdown_table(peep_changes())
- # get _update preview tables
- updates = peep_update_tables()
- # concat full result report as string
- result_string = "# FRUS Collection Ingest Update — " + datetime.datetime.now().strftime("%m-%d-%Y") + "\n\n\n" + "### `declassification_frus` vs `declassification_fus_update`\n\n" + changes + "\n\n\n" + updates
- results_filepath = "../FRUS-ingest-results-" + datetime.datetime.now().strftime("%m-%d-%Y") + ".md"
- print "Writing results of ingestion to " + results_filepath + "..."
- # write results to .md
- try:
- os.remove(results_filepath)
- except OSError:
- pass
- f = codecs.open(results_filepath, encoding='utf-8', mode="w+")
- f.write(unicode(result_string,'utf-8'))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement