SHARE
TWEET

Untitled

a guest Oct 14th, 2019 96 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. {
  2.  "cells": [
  3.   {
  4.    "cell_type": "markdown",
  5.    "metadata": {},
  6.    "source": [
  7.     "# Accessing Free Hourly Weather Data\n",
  8.     "NOAA stores Free Hourly Weather Data at: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/\n",
  9.     "\n",
  10.     "It is my understanding that this is the data underlying most of the websites that charge for historical weather data at the hourly level (at least when needed in bulk).\n",
  11.     "\n",
  12.     "The most detailed data is in a very cumbersome format, but a subset of easy to parse data can be found at: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/\n",
  13.     "\n",
  14.     "To make this topic approachable to a large audience, I've added some notes on how one could access this data simply using Excel at the end of this notebook."
  15.    ]
  16.   },
  17.   {
  18.    "cell_type": "code",
  19.    "execution_count": 160,
  20.    "metadata": {},
  21.    "outputs": [],
  22.    "source": [
  23.     "# Column Names were determined from ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/isd-lite-format.pdf\n",
  24.     "# That pdf describes what data is contained in the subset of data that I'll focus on.\n",
  25.     "isd_fwf_cols = ['year', 'month', 'day', 'hour', 'air_temp_c', 'dew_pt_temp_c',\n",
  26.     "                 'sea_lvl_press_hectoPa', 'wnd_dir_360', 'wnd_spd_mtrpersec',\n",
  27.     "                 'sky_condition', 'precip_hrly', 'precip_6hr_accum']"
  28.    ]
  29.   },
  30.   {
  31.    "cell_type": "code",
  32.    "execution_count": 161,
  33.    "metadata": {},
  34.    "outputs": [],
  35.    "source": [
  36.     "# Importing the python libraries that I use.\n",
  37.     "import pandas as pd\n",
  38.     "import numpy as np"
  39.    ]
  40.   },
  41.   {
  42.    "cell_type": "code",
  43.    "execution_count": 162,
  44.    "metadata": {},
  45.    "outputs": [],
  46.    "source": [
  47.     "# Importing the table defining the available data. \n",
  48.     "# There is a row for each station and it includes the begin and end date of available data.\n",
  49.     "isd_stations_data = pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv')"
  50.    ]
  51.   },
  52.   {
  53.    "cell_type": "code",
  54.    "execution_count": 163,
  55.    "metadata": {},
  56.    "outputs": [],
  57.    "source": [
  58.     "# I want data for DC, so I've chosen to search for the local airport. Reagan National Airport (DCA).\n",
  59.     "# Note that all of the Station Names are uppercase.\n",
  60.     "DCA_search = isd_stations_data.loc[(isd_stations_data['STATION NAME'].isna() == False) \n",
  61.     "                                   & (isd_stations_data['STATION NAME'].str.contains('REAGAN'))]"
  62.    ]
  63.   },
  64.   {
  65.    "cell_type": "code",
  66.    "execution_count": 164,
  67.    "metadata": {},
  68.    "outputs": [],
  69.    "source": [
  70.     "# Slicing out the BEGIN and END years to create the range of years for which I'll download data.\n",
  71.     "start_year = str(list(DCA_search.BEGIN)[0])[0:4]\n",
  72.     "end_year = str(list(DCA_search.END)[0])[0:4]\n",
  73.     "year_range = range(int(start_year), int(end_year)+1)"
  74.    ]
  75.   },
  76.   {
  77.    "cell_type": "code",
  78.    "execution_count": 165,
  79.    "metadata": {},
  80.    "outputs": [],
  81.    "source": [
  82.     "# Creating the station ID by which the ftp site is organized.\n",
  83.     "# Note that it is the concatenation of two columns separated by a hyphen.\n",
  84.     "station_id = str(list(DCA_search.USAF)[0])+'-'+str(list(DCA_search.WBAN)[0])"
  85.    ]
  86.   },
  87.   {
  88.    "cell_type": "code",
  89.    "execution_count": 144,
  90.    "metadata": {},
  91.    "outputs": [],
  92.    "source": [
  93.     "# Function to loop through a given station ID for a given range of years.\n",
  94.     "def download_isd_lite(station_id, year_range):\n",
  95.     "    isd_df = pd.DataFrame()\n",
  96.     "    for year in year_range:\n",
  97.     "        # There can be gaps of missing years in the data, so try and except were required. \n",
  98.     "        # The gaps that I've seen are only from decades ago.\n",
  99.     "        try:\n",
  100.     "            new_isd_df = pd.read_fwf('ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/'\n",
  101.     "                                     +str(year)+'/'\n",
  102.     "                                     +station_id+'-'\n",
  103.     "                                     +str(year)\n",
  104.     "                                     +'.gz',\n",
  105.     "                                     header=None)\n",
  106.     "            isd_df = pd.concat([isd_df, new_isd_df])\n",
  107.     "        except:\n",
  108.     "            continue\n",
  109.     "    \n",
  110.     "    # Resetting the index of the concatenated DataFrame\n",
  111.     "    isd_df.reset_index(inplace=True, drop=True)\n",
  112.     "    \n",
  113.     "    # Setting the column names that I've derived from the format guide\n",
  114.     "    isd_df.columns = isd_fwf_cols\n",
  115.     "   \n",
  116.     "    # NOAA populates missing values with -9999, but I've chosen to replace them with NaN's.\n",
  117.     "    isd_df.replace({-9999: np.nan}, inplace=True)\n",
  118.     "    \n",
  119.     "    # Some of the columns are scaled by a factor of 10 to eliminate decimal points,\n",
  120.     "    # which would complicate the fixed width format that NOAA has chosen to utilize\n",
  121.     "    scaled_columns = ['air_temp_c', 'dew_pt_temp_c', 'sea_lvl_press_hectoPa', \n",
  122.     "                  'wnd_spd_mtrpersec', 'precip_hrly', 'precip_6hr_accum']\n",
  123.     "    scaling_factor = 10\n",
  124.     "    # Resolving the scaling factor\n",
  125.     "    isd_df[scaled_columns] = isd_df[scaled_columns] / 10\n",
  126.     "    \n",
  127.     "    # Creating a date_time column from the various time-based columns NOAA provides.\n",
  128.     "    # The first step is creating a properly formatted string that pandas can parse, and then parse them.\n",
  129.     "    isd_df['date_time'] = isd_df.day.astype('int').astype('str').str.zfill(2)+'/'\\\n",
  130.     "                         +isd_df.month.astype('int').astype('str').str.zfill(2)+'/'\\\n",
  131.     "                         +isd_df.year.astype('int').astype('str')+'/'\\\n",
  132.     "                         +isd_df.hour.astype('int').astype('str').str.zfill(2)\n",
  133.     "    isd_df['date_time'] = pd.to_datetime(isd_df['date_time'], format='%d/%m/%Y/%H')\n",
  134.     "    \n",
  135.     "    return isd_df"
  136.    ]
  137.   },
  138.   {
  139.    "cell_type": "code",
  140.    "execution_count": 145,
  141.    "metadata": {},
  142.    "outputs": [],
  143.    "source": [
  144.     "# Running the function for DCA for all years. This data frame can then be manipulated or exported.\n",
  145.     "isd_df = download_isd_lite(station_id, year_range)"
  146.    ]
  147.   },
  148.   {
  149.    "cell_type": "markdown",
  150.    "metadata": {},
  151.    "source": [
  152.     "# Excel Users\n",
  153.     "You could create the ftp URLs in Excel and then manually click all the links you create. If you had a column with the station_id and a column with the year for each file you want then you could construct the URLs as follows:\n",
  154.     "\n",
  155.     "=HYPERLINK(\"ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite/\"&reference_year_cell&\"/\"&reference_station_id_cell&\"-\"&reference_year_cell&\".gz\")\n",
  156.     "\n",
  157.     "You can then unzip each file, and open it in Excel. Using Excel's \"Text to Columns\" feature with the \"Original data type\" option set to \"Fixed width\", Excel will correctly separate the data into columns. You can then manually add the column headers and save the data as an Excel file as desired. Then you could manually aggregate the data for multiple years and/or stations as needed."
  158.    ]
  159.   }
  160.  ],
  161.  "metadata": {
  162.   "kernelspec": {
  163.    "display_name": "Python 3",
  164.    "language": "python",
  165.    "name": "python3"
  166.   },
  167.   "language_info": {
  168.    "codemirror_mode": {
  169.     "name": "ipython",
  170.     "version": 3
  171.    },
  172.    "file_extension": ".py",
  173.    "mimetype": "text/x-python",
  174.    "name": "python",
  175.    "nbconvert_exporter": "python",
  176.    "pygments_lexer": "ipython3",
  177.    "version": "3.6.9"
  178.   },
  179.   "toc": {
  180.    "base_numbering": 1,
  181.    "nav_menu": {},
  182.    "number_sections": true,
  183.    "sideBar": true,
  184.    "skip_h1_title": false,
  185.    "title_cell": "Table of Contents",
  186.    "title_sidebar": "Contents",
  187.    "toc_cell": false,
  188.    "toc_position": {},
  189.    "toc_section_display": true,
  190.    "toc_window_display": false
  191.   }
  192.  },
  193.  "nbformat": 4,
  194.  "nbformat_minor": 2
  195. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top