Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from demyst.analytics import Analytics \n",
- "import pandas as pd\n",
- "\n",
- "analytics = Analytics()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# read in the full file\n",
- "# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR so the notebook runs on other machines\n",
- "original_sample = pd.read_csv(\"/Users/jonathanedelman/Downloads/FOIA - 7(a)(FY2010-Present).csv\", encoding=\"latin-1\", low_memory=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Filtering the original file down to an N of 5000. \n",
- "# 1000 chargeoffs and 4000 paid in full \n",
- "\n",
- "# Change this number for a smaller sample\n",
- "N = 5000\n",
- "# Take the more recent loans\n",
- "recent_sample = original_sample.loc[ 2016 <= original_sample[\"ApprovalFiscalYear\"]]\n",
- "# Identify the charge offs and the paid in full\n",
- "recent_charge_offs = recent_sample.loc[recent_sample[\"LoanStatus\"] == \"CHGOFF\"]\n",
- "recent_paid_in_fulls = recent_sample.loc[recent_sample[\"LoanStatus\"] == \"PIF\"]\n",
- "# Take 1/5 of the sample as charge offs, and 4/5 as paid in full\n",
- "recent_charge_off_sample = recent_charge_offs.sample(int(N/5))\n",
- "recent_paid_in_full_sample = recent_paid_in_fulls.sample(int(N*4/5))\n",
- "sample = pd.concat([recent_charge_off_sample, recent_paid_in_full_sample])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Make binary variable binary numbers\n",
- "sample.loc[sample['LoanStatus'] == \"CHGOFF\", 'LoanStatus'] = 1\n",
- "sample.loc[sample['LoanStatus'] == \"PIF\", 'LoanStatus'] = 0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Define Source List\n",
- "source_list = ['attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
- " 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', 'google_latlon', \n",
- " 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
- " 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Doing a single lookup\n",
- "test_attempt_results = analytics.enrich_and_download(source_list, sample[:1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# renaming existing column names to demyst types where applicable\n",
- "sample.rename(columns = {'BorrName':'business_name'}, inplace = True)\n",
- "sample.rename(columns = {'BorrStreet':'street'}, inplace = True)\n",
- "sample.rename(columns = {'BorrCity':'city'}, inplace = True)\n",
- "sample.rename(columns = {'BorrState':'state'}, inplace = True)\n",
- "sample.rename(columns = {'BorrZip':'post_code'}, inplace = True)\n",
- "\n",
- "# add country, since it is common and easy\n",
- "sample['country'] = \"us\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "analytics.validate2(sample)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Follow advice of the validate2 output\n",
- "sample['post_code'] = sample['post_code'].astype(str)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "analytics.validate(sample, providers=source_list)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# as to not waste money when doing screenshots\n",
- "# sample = sample[1:5]\n",
- "sample"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Bing requires a \"query\" column\n",
- "sample['query'] = sample['business_name'] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Append Attom ID for attom products\n",
- "\n",
- "attom_results = analytics.enrich_and_download(['attom_expanded_profile_report'], sample)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample['attom_id'] = attom_results['attom_expanded_profile_report.attom_id']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Append latitude and longitude for Walkscore\n",
- "\n",
- "google_latlon_inputs = pd.DataFrame(columns=['street', 'country'])\n",
- "\n",
- "# For google_latlon, combine all address elements for the street column. For most Demyst providers, street is simply the street address.\n",
- "google_latlon_inputs['street'] = sample[\"street\"] + \" \" + sample[\"city\"] + \", \" + sample[\"state\"] + \" \" + sample[\"post_code\"].map(str)\n",
- "\n",
- "# We are using US Businesses Only\n",
- "google_latlon_inputs['country'] = 'us'\n",
- "\n",
- "google_latlon_results = analytics.enrich_and_download(['google_latlon' ], google_latlon_inputs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample['longitude'] = google_latlon_results['google_latlon.longitude']\n",
- "sample['latitude'] = google_latlon_results['google_latlon.latitude']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Already appended google_latlon, so do not need to do it again.\n",
- "updated_source_list = [\n",
- " 'attom_avm', 'attom_commercial_avm', 'attom_pre_foreclosure', 'attom_commercial_pre_foreclosure', \n",
- " 'attom_tax_assessor', 'attom_commercial_tax_assessor', 'info_connect_company', \n",
- " 'infutor_property_append', 'utilityscore_bill', 'utilityscore_savings', 'dnb_find_company', \n",
- " 'acxiom_place', 'walkscore', 'housecanary_property_mortgage_lien', 'bing'\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "results = analytics.enrich_and_download(updated_source_list, sample, validate=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create 2 data frames: 1 with the residential data, and one with the commercial data.\n",
- "attom_avm_results = results.filter(regex='attom_avm', axis=1)\n",
- "attom_pre_foreclosure_results = results.filter(regex='attom_pre_foreclosure', axis=1)\n",
- "attom_tax_assessor_results = results.filter(regex='attom_tax_assessor', axis=1)\n",
- "attom_residential = pd.concat([attom_avm_results, attom_pre_foreclosure_results, attom_tax_assessor_results], axis=1)\n",
- "\n",
- "attom_commercial_avm_results = results.filter(regex='attom_commercial_avm', axis=1)\n",
- "attom_commercial_pre_foreclosure_results = results.filter(regex='attom_commercial_pre_foreclosure', axis=1)\n",
- "attom_commercial_tax_assessor_results = results.filter(regex='attom_commercial_tax_assessor', axis=1)\n",
- "attom_commercial = pd.concat([attom_commercial_avm_results, attom_commercial_pre_foreclosure_results, attom_commercial_tax_assessor_results], axis=1)\n",
- "\n",
- "# Rename the columns so we have identical columns from the two data frames\n",
- "attom_residential.columns = attom_residential.columns.str.replace(\"attom_\", \"attom_combined_\")\n",
- "attom_commercial.columns = attom_commercial.columns.str.replace(\"attom_commercial_\", \"attom_combined_\")\n",
- "\n",
- "# Do a combine_first \n",
- "attom_combined = attom_commercial.combine_first(attom_residential)\n",
- "\n",
- "# Remove original attom columns from original results\n",
- "attom_less_results = results[results.columns.drop(list(results.filter(regex='attom_')))]\n",
- "\n",
- "# Combine attom results with the others (and google latlon from earlier)\n",
- "recombined_results = pd.concat([attom_less_results, attom_combined, google_latlon_results], axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "no_dups = recombined_results.loc[:, ~recombined_results.columns.duplicated()]\n",
- "no_dups.to_csv(\"data_robot_input.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment