Advertisement
Guest User

Untitled

a guest
Aug 31st, 2017
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.27 KB | None | 0 0
  1. #!/usr/declass/env/bin/python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import os
  5. import sys
  6. import pymysql
  7. import pandas as pd
  8. import ConfigParser
  9. import datetime
  10. import codecs
  11. from tqdm import tqdm
  12. from pandas.api.types import is_string_dtype
  13.  
  14. # get config credentials from .cnf file
  15. config = ConfigParser.RawConfigParser()
  16. config.readfp(open(r'../../common/config/.mylogin.cnf'))
  17. db_user = config.get('client', 'user')
  18. db_pass = config.get('client', 'password')
  19.  
  20. # connect to declassification_frus_update and declassification_frus
  21. declassification_frus_update = pymysql.connect(host='localhost', user=db_user, passwd=db_pass, db='declassification_frus_update', charset='utf8')
  22.  
  23. declassification_frus = pymysql.connect(host='localhost', user=db_user, passwd=db_pass, db='declassification_frus', charset='utf8')
  24.  
  25. tables=['authorship','classification_countries','classification_doc','classification_persons',
  26. 'classifications','countries','country_doc','doc_counts','docs','person_doc','persons',
  27. 'refs','term_doc','terms','top_classifications','top_countries','top_persons','top_topics',
  28. 'topic_doc','topics','volumes']
  29.  
  30. # tqdm load progress bar
  31. print "Reading tables from declassification_frus and declassification_frus_update as pandas dataframes..."
  32. for table in tqdm(tables):
  33. # make table_update dataframe for each table from delcassification_frus_update
  34. exec(table + "_update = pd.read_sql('SELECT * FROM ' + table + ';', con=declassification_frus_update)")
  35. # make table_prod dataframe for each table from delcassification_frus
  36. exec(table + "_prod = pd.read_sql('SELECT * FROM ' + table + ';', con=declassification_frus)")
  37.  
  38. # methods
  39.  
  40. def peep_changes():
  41. cols = ['table','production db rowcount','update db rowcount','% difference']
  42. differential_df = pd.DataFrame(columns=cols)
  43. for table in tables:
  44. exec("update_count = float(len(" + table + "_update.index))")
  45. exec("prod_count = float(len(" + table + "_prod.index))")
  46.  
  47. difference = update_count - prod_count
  48. if difference != 0:
  49. difference = (difference / prod_count) * 100
  50.  
  51. temp_df = pd.DataFrame([["`" + table + "`", prod_count, update_count, difference]], columns=cols)
  52. differential_df = differential_df.append(temp_df, ignore_index=True)
  53. differential_df['production db rowcount'] = differential_df['production db rowcount'].astype(int)
  54. differential_df['update db rowcount'] = differential_df['update db rowcount'].astype(int)
  55. differential_df['% difference'] = differential_df['% difference'].round()
  56.  
  57. return differential_df
  58.  
  59. def to_markdown_table(df):
  60. fmt = ['---' for i in range(len(df.columns))]
  61. df_fmt = pd.DataFrame([fmt], columns=df.columns)
  62. df_formatted = pd.concat([df_fmt, df])
  63. return df_formatted.to_csv(sep="|", index=False, encoding='utf-8', quotechar="*")
  64.  
  65. def peep_update_tables():
  66. all_tables_string = "### `declassification_fus_update` preview\n\n"
  67. for table in tables:
  68. exec('table_head =' + table + '_update.head()')
  69. all_tables_string += "#### `" + table + "`\n\n" + to_markdown_table(table_head) + "\n\n"
  70. return all_tables_string
  71.  
  72. # clean out carriage returns on docs and volumes
  73. docs_update = docs_update.replace('\\n','',regex=True)
  74. volumes_update = volumes_update.replace('\\n','',regex=True)
  75.  
  76. # truncate colums with long XML string values
  77. def truncate_string_cols(df):
  78. for col in df.columns:
  79. if is_string_dtype(df[col]):
  80. df[col] = df[col].str[:300]
  81.  
  82. truncate_string_cols(docs_update)
  83. truncate_string_cols(volumes_update)
  84. truncate_string_cols(persons_update)
  85.  
  86. # get changes as md table
  87. changes = to_markdown_table(peep_changes())
  88. # get _update preview tables
  89. updates = peep_update_tables()
  90. # concat full result report as string
  91. result_string = "# FRUS Collection Ingest Update — " + datetime.datetime.now().strftime("%m-%d-%Y") + "\n\n\n" + "### `declassification_frus` vs `declassification_fus_update`\n\n" + changes + "\n\n\n" + updates
  92.  
  93. results_filepath = "../FRUS-ingest-results-" + datetime.datetime.now().strftime("%m-%d-%Y") + ".md"
  94. print "Writing results of ingestion to " + results_filepath + "..."
  95.  
  96. # write results to .md
  97. try:
  98. os.remove(results_filepath)
  99. except OSError:
  100. pass
  101.  
  102. f = codecs.open(results_filepath, encoding='utf-8', mode="w+")
  103. f.write(unicode(result_string,'utf-8'))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement