build_UK_boundaries.R

#######################################
# SHINY Explorer - build UK boundaries
#######################################

# The following is the R code to create arbitrary aggregation boundaries, starting from some base shapefile layer.
# For the UK the reference is the union of:
# - Output Areas in England and Wales (EW). Generalised Clipped: http://geoportal.statistics.gov.uk/datasets/09b8a48426e3482ebbc0b0c49985c0fb_2
# - Output Areas in Scotland (SC) https://www.nrscotland.gov.uk/statistics-and-data/geography/our-products/census-datasets/2011-census/2011-boundaries
# - Small Areas in Northern Ireland (NI) http://www.nisra.gov.uk/geography/SmallAreas.htm
# Once downloaded, rename each group of files with the corresponding two characters acronym.
# It's necessary to keep only the files with the four following extensions: shp, shx, prj, dbf.

# One small problem to be aware of is the different original projection of the boundaries:
#   EW: '+init=epsg:27700'  # http://spatialreference.org/ref/epsg/osgb-1936-british-national-grid/
#   SC: '+init=epsg:27700'   # http://spatialreference.org/ref/epsg/osgb-1936-british-national-grid/
#   NI: '+init=epsg:29902'   # http://spatialreference.org/ref/epsg/29902/
# but all of the above will be transformed in the standard WGS84 for web mapping.

# ONS Reference Country codes
#  - E92000001 England;
#  - W92000004 Wales;
#  - S92000003 Scotland;
#  - N92000002 Northern Ireland;

# The following are the number of features in each set of boundaries:
#   - EW: 181,408 (E 171,372 + W 10,036)
#   - SC:  46,351
#   - NI:   4,537
# for a total UK of 232,296. This is important to know because when simplifying the polygons overall, some of the features could dissolve.

# To successfully aggregate, the script needs also a lookup table that gives back a unique aggregated area code for every output/small area (from now on just OA).
# This table could be built for many types of aggregations using, for example, the Postcodes Directories:
#  - NHS Postcode Directory: http://ons.maps.arcgis.com/home/item.html?id=4e63d9b589604c0ba10fbbd3bef08e6f
#  - ONS Postcode Directory: http://ons.maps.arcgis.com/home/item.html?id=dfa0ff74981b4b228d2030d852f0b14a
# These are maintained and published every quarter by the ONS, that relates both current and terminated postcodes in the UK
# to a range of current and past statutory administrative, electoral, health and other geographies. The downside to this approach is that some OA could belong to different aggregations, and it becomes somewhat arbitrary, without more knowledge, decide which is the right destination area.

# When saving to the database, there is an added column 'type' that is part of the foreign key ('id', 'type') connecting to the primary key ('boundary_id', 'type') in the table *boundaries_ids*, used to retrieve the correct id of the feature to link to external data.
# I saved all files in s:\ICS\Projects\Data Management\R_Shiny\Resources\boundaries_aggregations\

# It is finally important to understand that when reproducing ONS content, the output must include a source accreditation to ONS.


# LOAD PACKAGES ---------------------------------------------------------------------------------------------------
pkg <- c('maptools', 'rgeos', 'rgdal', 'rmapshaper', 'RMySQL')
invisible( lapply(pkg, require, character.only = TRUE) )

# SET VARIABLES ---------------------------------------------------------------------------------------------------

boundaries.path <- 'C:/cloud/OneDrive - University College London/data/boundaries/shp'
proj.wgs <- '+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0'

# LOAD AND MERGE EW, SC AND NI OAs MAP INTO A UNIQUE UK MAP  -------------------------------------------------------

# England and Wales
# read polygons from shapefile (using rgdal, projection is automatically applied if the prj file is found)
shp.ew <- readOGR(boundaries.path, layer = 'EW')
# check the projection, and read the field to keep as future id
summary(shp.ew)
# transform the shapefile to WGS84 projection
shp.ew <- spTransform(shp.ew, CRS(proj.wgs))
# keep in the data slot only the ONS Output Area id, renaming it as 'id'
shp.ew <- shp.ew[, 'oa11cd']
colnames(shp.ew@data) <- c('id')
# reassign the polygon IDs
shp.ew <- spChFIDs(shp.ew, as.character(shp.ew$id))

# Scotland
shp.sc <- readOGR(boundaries.path, layer = 'SC')
summary(shp.sc)
shp.sc <- spTransform(shp.sc, CRS(proj.wgs))
shp.sc <- shp.sc[, 'code']
colnames(shp.sc@data) <- c('id')
shp.sc <- spChFIDs(shp.sc, as.character(shp.sc$id))

# N.Ireland
shp.ni <- readOGR(boundaries.path, layer = 'NI')
summary(shp.ni)
shp.ni <- spTransform(shp.ni, CRS(proj.wgs))
shp.ni <- shp.ni[, 'SA2011']
colnames(shp.ni@data) <- c('id')
shp.ni <- spChFIDs(shp.ni, as.character(shp.ni$id))

# UK as a merge of all previous boundaries
shp.uk <- spRbind(spRbind(shp.ew, shp.sc), shp.ni)
# reduce the complexity of the boundaries
# run the following line only on Linux with at least 32GB RAM, it should take a few hours
# shp.uk <- ms_simplify(shp.uk, keep = 0.05)
# save merged polygons as a unique shapefile
try( do.call(file.remove, as.list(paste0(boundaries.path, '/', list.files(path = boundaries.path, pattern = 'UK') ) ) ), silent = FALSE )
writeOGR(shp.uk, dsn = boundaries.path, layer = 'UK', driver = 'ESRI Shapefile')


# # # STEP 2 - CREATE AGGREGATION MAPS USING PREVIOUS UK MAP AND A LOOKUP TABLE ------------------------------------------------------------

# read polygons from shapefile
shp.uk <- readOGR(boundaries.path, layer = 'UK')
# connect to database
db_conn <- dbConnect(MySQL(), user = 'root', password = 'root', dbname = 'geography')
# retrieve lookup table
lookups <- dbReadTable(db_conn, 'lookups')
# list of aggregations to process
areas <- c('CCG', 'LAT', 'NHSR', 'CCR', 'CTRY')
for(area in areas){
    # join shapefile data slot and lookup table on the area code
    shp.uk <- merge(shp.uk, lookups[, c('OA', area)], by.x = 'id', by.y = 'OA')
    # Build the list of subareas
    subareas <- sort(unique(shp.uk[[area]]))
    # Define first area
    subarea <- subareas[1]
    # Print a processing message
    print(paste('Processing', area, 'subarea', subarea, '- number 1 out of', length(subareas)))
    # select all OA contained in first subarea
    shp.area <- subset(shp.uk, shp.uk[[area]] == subarea)
    # dissolve submap
    shp.area <- ms_dissolve(shp.area)
    # define object id
    shp.area$id <- subarea
    shp.area <- spChFIDs(shp.area, as.character(shp.area$id))
    # proceed in the same way for all other subareas, attaching every results to previous object
    for(idx in 2:length(subareas)){
        subarea <- subareas[idx]
        print(paste('Processing', area, 'subarea', subarea, '- number', idx, 'out of', length(subareas)))
        shp.tmp <- subset(shp.uk, shp.uk[[area]] == subarea)
        shp.tmp <- ms_dissolve(shp.tmp)
        shp.tmp$id <- subarea
        shp.tmp <- spChFIDs(shp.tmp, as.character(shp.tmp$id))
        shp.area <- spRbind(shp.area, shp.tmp)
    }
    print(paste0('Simplifying ', area, 's'))
    # reduce the details of the boundaries for less space (it's a statistical map, not an OS Explorer Map!)
    shp.area <- ms_simplify(shp.area, keep = 0.05)
    # Print a processing message
    print(paste('Saving', area, 'in dataframe format'))
    # save lookup (rmapshaperid, id)
    lkp <- shp.area@data
    #  adding the area type
    lkp$type <- area
    #  rename columns of lookup
    colnames(lkp) <- c('boundary_id', 'id', 'type')
    # save lookup to database
    dbSendQuery(db_conn, paste0("DELETE FROM boundaries_ids WHERE type = '", area, "'") )
    dbWriteTable(db_conn, 'boundaries_ids', lkp, row.names = FALSE, append = TRUE)
    # create dataframe suitable for ggplot
    df.area <- tidy(shp.area)
    # change names in dataframe to coordinates to avoid possible mismatching with programming languages keywords
    setnames(df.area, c('long', 'lat'), c('X_lon', 'Y_lat'))
    # add "type" column
    df.area$type <- area
    # save dataframe to database
    dbSendQuery(db_conn, paste0("DELETE FROM boundaries WHERE type = '", area, "'") )
    dbWriteTable(db_conn, 'boundaries', df.area, row.names = FALSE, append = TRUE)
    # Print a processing message
    print(paste('Saving', area, 'in shapefile format'))
    # delete the rmapshaperid from Polygons
    shp.area <- shp.area[, 'id']
    # save Polygons as shapefile
    try( do.call(file.remove, as.list(paste0(boundaries.path, '/', list.files(path = boundaries.path, pattern = area) ) ) ), silent = TRUE )
    writeOGR(shp.area, dsn = boundaries.path, layer = area, driver = 'ESRI Shapefile')
    # some polishing...
    shp.uk <- shp.uk[, 'id']
    rm(df.area)
    rm(shp.area)
    gc()
}


# clean and exit
dbDisconnect(db_conn)
rm(list = ls())
gc()