{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from demyst.analytics import Analytics\n",
    "import pandas as pd\n",
    "\n",
    "analytics = Analytics()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read in the full file\n",
    "original_sample = pd.read_csv(\"/Users/jonathanedelman/Downloads/FOIA - 7(a)(FY2010-Present).csv\", encoding=\"latin-1\", low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter the original file down to an N of 5000:\n",
    "# 1000 charge-offs and 4000 paid in full\n",
    "\n",
    "# Change this number for a smaller sample\n",
    "N = 5000\n",
    "# Take the more recent loans\n",
    "recent_sample = original_sample.loc[original_sample[\"ApprovalFiscalYear\"] >= 2016]\n",
    "# Identify the charge-offs and the paid-in-fulls\n",
    "recent_charge_offs = recent_sample.loc[recent_sample[\"LoanStatus\"] == \"CHGOFF\"]\n",
    "recent_paid_in_fulls = recent_sample.loc[recent_sample[\"LoanStatus\"] == \"PIF\"]\n",
    "# Take 1/5 of the sample as charge-offs, and 4/5 as paid in full\n",
    "recent_charge_off_sample = recent_charge_offs.sample(int(N/5))\n",
    "recent_paid_in_full_sample = recent_paid_in_fulls.sample(int(N*4/5))\n",
    "sample = pd.concat([recent_charge_off_sample, recent_paid_in_full_sample])"
   ]
  },
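  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: the sample should hold roughly 1000 CHGOFF and\n",
    "# 4000 PIF rows, matching the 1/5 vs 4/5 split above\n",
    "sample[\"LoanStatus\"].value_counts()"
   ]
  },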
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the binary target as numbers: charge-off = 1, paid in full = 0\n",
    "sample.loc[sample['LoanStatus'] == \"CHGOFF\", 'LoanStatus'] = 1\n",
    "sample.loc[sample['LoanStatus'] == \"PIF\", 'LoanStatus'] = 0"
   ]
  },
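  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional (assumption): the assignments above leave LoanStatus as an\n",
    "# object column, so an explicit integer cast may help downstream tools\n",
    "sample['LoanStatus'] = sample['LoanStatus'].astype(int)"
   ]
  },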
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define Source List\n",
    "source_list = [\n",
    "    'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure',\n",
    "    'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', 'google_latlon',\n",
    "    'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company',\n",
    "    'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Do a single test lookup before enriching the whole sample\n",
    "test_attempt_results = analytics.enrich_and_download(source_list, sample[:1])"
   ]
  },
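  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: peek at the columns that came back from the test lookup\n",
    "# before spending credits on the full sample\n",
    "test_attempt_results.columns"
   ]
  },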
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the original column names before renaming them\n",
    "sample.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename existing columns to the Demyst input types where applicable\n",
    "sample.rename(columns={\n",
    "    'BorrName': 'business_name',\n",
    "    'BorrStreet': 'street',\n",
    "    'BorrCity': 'city',\n",
    "    'BorrState': 'state',\n",
    "    'BorrZip': 'post_code'\n",
    "}, inplace=True)\n",
    "\n",
    "# Add country, since it is common and easy\n",
    "sample['country'] = \"us\""
   ]
  },
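  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: confirm the input columns set up above are present.\n",
    "# (The exact inputs each provider needs may differ; this only covers the\n",
    "# common name/address fields used in this notebook.)\n",
    "expected_inputs = ['business_name', 'street', 'city', 'state', 'post_code', 'country']\n",
    "missing = [col for col in expected_inputs if col not in sample.columns]\n",
    "print(\"missing input columns:\", missing)"
   ]
  },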
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the inputs; the next cell follows this output's advice\n",
    "analytics.validate2(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Follow the advice of the validate2 output and cast post codes to strings\n",
    "sample['post_code'] = sample['post_code'].astype(str)"
   ]
  },
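  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional (assumption): if BorrZip was read as a number, leading zeros were\n",
    "# lost, so pad the string back out to 5 digits\n",
    "sample['post_code'] = sample['post_code'].str.zfill(5)"
   ]
  },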
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validate the sample against the specific providers in source_list\n",
    "analytics.validate(sample, providers=source_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To avoid wasting money when taking screenshots, optionally shrink the sample\n",
    "# sample = sample[1:5]\n",
    "sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bing requires a \"query\" column\n",
    "sample['query'] = sample['business_name'] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"]"
   ]
  },
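  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: a missing business_name, city, or state makes the whole\n",
    "# query NaN, so count how many rows that affects\n",
    "sample['query'].isna().sum()"
   ]
  },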
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the Attom ID (via the expanded profile report) for the Attom products\n",
    "attom_results = analytics.enrich_and_download(['attom_expanded_profile_report'], sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the Attom ID onto the sample so the Attom products can use it\n",
    "sample['attom_id'] = attom_results['attom_expanded_profile_report.attom_id']"
   ]
  },
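  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: share of records that actually got an Attom ID back\n",
    "sample['attom_id'].notna().mean()"
   ]
  },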
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append latitude and longitude for Walkscore\n",
    "\n",
    "google_latlon_inputs = pd.DataFrame(columns=['street', 'country'])\n",
    "\n",
    "# For google_latlon, combine all address elements for the street column.\n",
    "# For most Demyst providers, street is simply the street address.\n",
    "google_latlon_inputs['street'] = sample[\"street\"] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"] + \" \" + sample[\"post_code\"].map(str)\n",
    "\n",
    "# We are using US businesses only\n",
    "google_latlon_inputs['country'] = 'us'\n",
    "\n",
    "google_latlon_results = analytics.enrich_and_download(['google_latlon'], google_latlon_inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the coordinates onto the sample for Walkscore\n",
    "sample['longitude'] = google_latlon_results['google_latlon.longitude']\n",
    "sample['latitude'] = google_latlon_results['google_latlon.latitude']"
   ]
  },
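  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: how many rows are still missing coordinates\n",
    "sample[['latitude', 'longitude']].isna().sum()"
   ]
  },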
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Already appended google_latlon, so we do not need to do it again\n",
    "updated_source_list = [\n",
    "    'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure',\n",
    "    'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company',\n",
    "    'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company',\n",
    "    'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Enrich the full sample, using the updated source list from above\n",
    "results = analytics.enrich_and_download(updated_source_list, sample, validate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create two data frames: one with the residential data and one with the commercial data\n",
    "attom_avm_results = results.filter(regex='attom_avm', axis=1)\n",
    "attom_pre_foreclosure_results = results.filter(regex='attom_pre_foreclosure', axis=1)\n",
    "attom_tax_assessor_results = results.filter(regex='attom_tax_assessor', axis=1)\n",
    "attom_residential = pd.concat([attom_avm_results, attom_pre_foreclosure_results, attom_tax_assessor_results], axis=1)\n",
    "\n",
    "attom_commercial_avm_results = results.filter(regex='attom_commercial_avm', axis=1)\n",
    "attom_commercial_pre_foreclosure_results = results.filter(regex='attom_commercial_pre_foreclosure', axis=1)\n",
    "attom_commercial_tax_assessor_results = results.filter(regex='attom_commercial_tax_assessor', axis=1)\n",
    "attom_commercial = pd.concat([attom_commercial_avm_results, attom_commercial_pre_foreclosure_results, attom_commercial_tax_assessor_results], axis=1)\n",
    "\n",
    "# Rename the columns so the two data frames have identical column names\n",
    "attom_residential.columns = attom_residential.columns.str.replace(\"attom_\", \"attom_combined_\")\n",
    "attom_commercial.columns = attom_commercial.columns.str.replace(\"attom_commercial_\", \"attom_combined_\")\n",
    "\n",
    "# Do a combine_first: commercial values win, residential values fill the gaps\n",
    "attom_combined = attom_commercial.combine_first(attom_residential)\n",
    "\n",
    "# Remove the original attom columns from the results\n",
    "attom_less_results = results[results.columns.drop(list(results.filter(regex='attom_')))]\n",
    "\n",
    "# Combine the attom results with the others (and google_latlon from earlier)\n",
    "recombined_results = pd.concat([attom_less_results, attom_combined, google_latlon_results], axis=1)"
   ]
  },
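  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: share of rows with any data from each Attom frame,\n",
    "# to see what the residential/commercial merge actually buys\n",
    "print(\"residential:\", attom_residential.notna().any(axis=1).mean())\n",
    "print(\"commercial: \", attom_commercial.notna().any(axis=1).mean())\n",
    "print(\"combined:   \", attom_combined.notna().any(axis=1).mean())"
   ]
  },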
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop duplicated columns and write the enriched data set to CSV\n",
    "no_dups = recombined_results.loc[:, ~recombined_results.columns.duplicated()]\n",
    "no_dups.to_csv(\"data_robot_input.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}