{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from demyst.analytics import Analytics\n",
"import pandas as pd\n",
"\n",
"analytics = Analytics()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read in the full file\n",
"original_sample = pd.read_csv(\"/Users/jonathanedelman/Downloads/FOIA - 7(a)(FY2010-Present).csv\", encoding=\"latin-1\", low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
  31. "# Filtering the original file down to an N of 5000. \n",
  32. "# 1000 chargeoffs and 4000 paid in full \n",
  33. "\n",
  34. "# Change this number for a smaller sample\n",
  35. "N = 5000\n",
  36. "# Take the more recent loans\n",
  37. "recent_sample = original_sample.loc[ 2016 <= original_sample[\"ApprovalFiscalYear\"]]\n",
  38. "# Identify the charge offs and the paid in full\n",
  39. "recent_charge_offs = recent_sample.loc[original_sample[\"LoanStatus\"] == \"CHGOFF\"]\n",
  40. "recent_paid_in_fulls = recent_sample.loc[original_sample[\"LoanStatus\"] == \"PIF\"]\n",
  41. "# Take 1/5 of the sample as charge offs, and 4/5 as paid in full\n",
  42. "recent_charge_off_sample = recent_charge_offs.sample(int(N/5))\n",
  43. "recent_paid_in_full_sample = recent_paid_in_fulls.sample(int(N*4/5))\n",
  44. "sample = pd.concat([recent_charge_off_sample, recent_paid_in_full_sample])"
  45. ]
  46. },
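{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (illustrative addition, not part of the original workflow):\n",
"# confirm the intended 1:4 split of charge-offs to paid-in-full loans in the sample.\n",
"sample[\"LoanStatus\"].value_counts()"
]
},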
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
  53. "# Make binary variable binary numbers\n",
  54. "sample.loc[sample['LoanStatus'] == \"CHGOFF\", 'LoanStatus'] = 1\n",
  55. "sample.loc[sample['LoanStatus'] == \"PIF\", 'LoanStatus'] = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define Source List\n",
"source_list = ['attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
" 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', 'google_latlon', \n",
" 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
" 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Doing a single lookup\n",
"test_attempt_results = analytics.enrich_and_download(source_list, sample[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
  97. "# renaming existing column names to demyst types where applicable\n",
  98. "sample.rename(columns = {'BorrName':'business_name'}, inplace = True)\n",
  99. "sample.rename(columns = {'BorrStreet':'street'}, inplace = True)\n",
  100. "sample.rename(columns = {'BorrCity':'city'}, inplace = True)\n",
  101. "sample.rename(columns = {'BorrState':'state'}, inplace = True)\n",
  102. "sample.rename(columns = {'BorrZip':'post_code'}, inplace = True)\n",
  103. "\n",
  104. "# add country, since it is common and easy\n",
  105. "sample['country'] = \"us\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analytics.validate2(sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Follow advice of the validate2 output\n",
"sample['post_code'] = sample['post_code'].astype(str)"
]
},
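{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative aside (an assumption about the data, not from the original notebook):\n",
"# ZIP codes read as integers lose their leading zeros, so inspect the string lengths here\n",
"# and, if needed, left-pad back to five digits with\n",
"# sample['post_code'] = sample['post_code'].str.zfill(5)\n",
"sample['post_code'].str.len().value_counts()"
]
},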
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analytics.validate(sample, providers=source_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
  142. "# as to not waste money when doing screenshots\n",
  143. "# sample = sample[1:5]\n",
  144. "sample"
  145. ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Bing requires a \"query\" column\n",
"sample['query'] = sample['business_name'] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Append Attom ID for attom products\n",
"\n",
"attom_results = analytics.enrich_and_download(['attom_expanded_profile_report'], sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample['attom_id'] = attom_results['attom_expanded_profile_report.attom_id']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Append latitude and longitude for Walkscore\n",
"\n",
"google_latlon_inputs = pd.DataFrame(columns=['street', 'country'])\n",
"\n",
"# For google_latlon, combine all address elements for the street column. For most Demyst providers, street is simply the street address.\n",
"google_latlon_inputs['street'] = sample[\"street\"] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"] + \" \" + sample[\"post_code\"].map(str)\n",
"\n",
"# We are using US Businesses Only\n",
"google_latlon_inputs['country'] = 'us'\n",
"\n",
"google_latlon_results = analytics.enrich_and_download(['google_latlon'], google_latlon_inputs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample['longitude'] = google_latlon_results['google_latlon.longitude']\n",
"sample['latitude'] = google_latlon_results['google_latlon.latitude']"
]
},
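{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (illustrative addition, not part of the original workflow):\n",
"# fraction of rows that failed to geocode before the coordinates are fed to Walkscore.\n",
"sample[['latitude', 'longitude']].isna().mean()"
]
},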
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Already appended google_latlon, so do not need to do it again.\n",
"updated_source_list = [\n",
" 'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
" 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', \n",
" 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
" 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
  229. "results = analytics.enrich_and_download(source_list, sample, validate=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
  238. "# Create 2 data frames: 1 with the residentia data, and one with the commercial data.\n",
  239. "attom_avm_results = results.filter(regex='attom_avm', axis=1)\n",
  240. "attom_pre_foreclosure_results = results.filter(regex='attom_pre_foreclosure', axis=1)\n",
  241. "attom_tax_assessor_results = results.filter(regex='attom_tax_assessor', axis=1)\n",
  242. "attom_residential = pd.concat([attom_avm_results, attom_pre_foreclosure_results, attom_tax_assessor_results], axis=1)\n",
  243. "\n",
  244. "attom_commercial_avm_results = results.filter(regex='attom_commercial_avm', axis=1)\n",
  245. "attom_commercial_pre_foreclosure_results = results.filter(regex='attom_commercial_pre_foreclosure', axis=1)\n",
  246. "attom_commercial_tax_assessor_results = results.filter(regex='attom_commercial_tax_assessor_results', axis=1)\n",
  247. "attom_commercial = pd.concat([attom_commercial_avm_results, attom_commercial_pre_foreclosure_results, attom_commercial_tax_assessor_results], axis=1)\n",
  248. "\n",
  249. "# Rename the columns so we have identical columns from the two data frames\n",
  250. "attom_residential.columns = attom_residential.columns.str.replace(\"attom_\", \"attom_combined_\")\n",
  251. "attom_commercial.columns = attom_commercial.columns.str.replace(\"attom_commercial_\", \"attom_combined_\")\n",
  252. "\n",
  253. "# Do a combine_first \n",
  254. "attom_combined = attom_commercial.combine_first(attom_residential)\n",
  255. "\n",
  256. "# Remove original attom columns from original results\n",
  257. "attom_less_results = results[results.columns.drop(list(results.filter(regex='attom_')))]\n",
  258. "\n",
  259. "# Combine attom results with the others (and google latlon from earlier)\n",
  260. "recombined_results = pd.concat([attom_less_results, attom_combined, google_latlon_results], axis=1)"
  261. ]
  262. },
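{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (not part of the original workflow): how combine_first behaves above.\n",
"# Values from the calling frame (commercial) win; NaNs are filled from the argument\n",
"# (residential), aligned on both index and column labels. The column name is hypothetical.\n",
"toy_commercial = pd.DataFrame({'attom_combined_avm.value': [None, 200.0]})\n",
"toy_residential = pd.DataFrame({'attom_combined_avm.value': [100.0, 150.0]})\n",
"toy_commercial.combine_first(toy_residential)"
]
},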
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"no_dups = recombined_results.loc[:, ~recombined_results.columns.duplicated()]\n",
"no_dups.to_csv(\"data_robot_input.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}