Advertisement
MetricT

US Patent Rate per 100k with nonlinear fit

Feb 5th, 2024
1,300
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 5.88 KB | None | 0 0
  1. ### Compute the ratio of patents granted per 100k US population, as a measure
  2. ### of innovation over time.  <Mathew.Binkley@Vanderbilt.edu>
  3.  
  4. library(tidyverse)
  5. library(rvest)
  6. library(scales)
  7. library(broom)
  8. library(minpack.lm)
  9. library(gt)
  10. library(cowplot)
  11.  
  12. ### US Population since 1900
  13. us_pop_pres <-
  14.   "https://multpl.com/united-states-population/table/by-year" %>%
  15.   read_html() %>%
  16.   html_nodes("table") %>%
  17.   .[[1]] %>%
  18.   html_table() %>%
  19.   filter(grepl("Jul", Date)) %>%
  20.   rename(Year = Date) %>%
  21.   mutate(
  22.     Year  = gsub("Jul 1, ", "", Year) %>% as.integer(),
  23.     Value = gsub(" million", "", Value) %>% as.numeric(),
  24.     Date  = as.Date(paste(Year, "-01-01", sep = "")),
  25.     Value = Value * 1000000
  26.   ) %>%
  27.   select(Date, Value)
  28.  
  29. # US population before 1900 from Wikipedia
  30. # https://en.wikipedia.org/wiki/Demographic_history_of_the_United_States
  31. us_pop_hist <-
  32.   tribble(
  33.     ~year, ~Value,
  34.     1610,   350,
  35.     1620,   2302,
  36.     1630,   4646,
  37.     1640,   26634,
  38.     1650,   50368,
  39.     1660,   75058,
  40.     1670,   111935,
  41.     1680,   151507,
  42.     1690,   210372,
  43.     1700,   250888,
  44.     1710,   331711,
  45.     1720,   466185,
  46.     1730,   629445,
  47.     1740,   905563,
  48.     1750,   1170760,
  49.     1760,   1593625,
  50.     1770,   2148076,
  51.     1780,   2780369,
  52.     1790,   3929214,
  53.     1800,   5308483,
  54.     1810,   7239881,
  55.     1820,   9638453,
  56.     1830,   12866020,
  57.     1840,   17069453,
  58.     1850,   23191876,
  59.     1860,   31443321,
  60.     1870,   38558371,
  61.     1880,   50189209,
  62.     1890,   62979766
  63.   ) %>%
  64.   mutate(Date = as.Date(paste(year, "-01-01", sep = ""))) %>%
  65.   select(Date, Value)
  66.  
  67. ### Combine past/present history into one dataset
  68. us_pop <-
  69.   us_pop_pres %>%
  70.   bind_rows(us_pop_hist) %>%
  71.   arrange(Date) %>%
  72.   rename(date = Date, us_pop = Value)
  73.  
  74. # Info on number of patents from US Patent & Trademark Office
  75. patents <-
  76.   "https://www.uspto.gov/web/offices/ac/ido/oeip/taf/h_counts.htm" %>%
  77.   read_html() %>%
  78.   html_elements("table") %>%
  79.   .[[3]] %>%
  80.   html_table() %>%
  81.   janitor::clean_names() %>%
  82.   mutate_all(~sub(",", "", .)) %>%
  83.   select(-notes) %>%
  84.   rename(
  85.     year = 1,
  86.     utility_patent_applications = 2,
  87.     design_patent_applications = 3,
  88.     plant_patent_applications = 4,
  89.     utility_patents = 5,
  90.     design_patents = 6,
  91.     plant_patents = 7,
  92.     patents_to_foreign_residents = 8
  93.   ) %>%
  94.   mutate(
  95.     year = ifelse(year == "1836 (c)", "1836", year),
  96.     design_patents = ifelse(design_patents == "(b)", "", design_patents)
  97.     ) %>%
  98.   mutate_all(~sub("n/a", "", .)) %>%
  99.   mutate_all(as.numeric) %>%
  100.   replace(is.na(.), 0) %>%
  101.   mutate(
  102.     date = as.Date(paste(year, "-01-01", sep = "")),
  103.     total_patents = utility_patents + design_patents + plant_patents
  104.     ) %>%
  105.   select(date, total_patents)
  106.  
  107. # Combine patents and population into one table so we can compute the rate of
  108. # patents per 100k population
  109. combo <-
  110.   us_pop %>%
  111.   full_join(patents, by = "date") %>%
  112.   filter(!is.na(total_patents), !is.na(us_pop)) %>%
  113.   mutate(
  114.     year = year(date),
  115.     patent_rate_100k = 100000 * total_patents / us_pop)
  116.  
  117. # Do a non-linear fit and derive a fit.   Starting points originally derived by eye-balling it.
  118. fit <-
  119.   combo %>%
  120.   nlsLM(patent_rate_100k ~ a + (b * year) + exp(c + d * year) + amplitude * sin((2 * pi / period) * (year + phase)),
  121.         data = .,
  122.         start = list(a = -178.75411547, b = 0.10855252, c = -179.64201187, d = 0.09114342, amplitude = 50, period = 68.9, phase = 40))
  123.  
  124. a <- fit %>% tidy() %>% filter(term == "a") %>% pull(estimate)
  125. b <- fit %>% tidy() %>% filter(term == "b") %>% pull(estimate)
  126. c <- fit %>% tidy() %>% filter(term == "c") %>% pull(estimate)
  127. d <- fit %>% tidy() %>% filter(term == "d") %>% pull(estimate)
  128.  
  129. amplitude <- fit %>% tidy() %>% filter(term == "amplitude") %>% pull(estimate)
  130. period    <- fit %>% tidy() %>% filter(term == "period")    %>% pull(estimate)
  131. phase     <- fit %>% tidy() %>% filter(term == "phase")     %>% pull(estimate)
  132.  
  133. combo <-
  134.   combo %>%
  135.   mutate(
  136.    
  137.     # Fit the non-cyclical parts so we can subtract out
  138.     fit_lin_exp     = a + b*year + exp(c + d * year),
  139.     detrend_lin_exp = patent_rate_100k - fit_lin_exp,
  140.    
  141.     # Fit the cyclical parts so we can also subtract them to look at residuals
  142.     fit_sin         = amplitude * sin((2 * pi / period) * (year + phase)),
  143.     detrend_sin     = detrend_lin_exp - fit_sin
  144.     )
  145.  
  146. # Create a table of the fit info to append to our graph.   "gt" doesn't allow
  147. # converting tables to grobs, so I save as a png and add to the graph in GIMP.
  148. fit_info <-
  149.   fit %>%
  150.   tidy() %>%
  151.   gt() %>%
  152.   tab_header(title = "patent_rate_100k ~ (a + b * year) + exp(c + d * year) + amplitude * sin((2 * pi / period) * (year + phase)") %>%
  153.   fmt_number(decimals = 3)
  154. print(fit_info)
  155.  
  156. # Graph the raw patent per 100k rate, showing the trend line we will subtract
  157. g_patents <-
  158.   ggplot(data = combo) +
  159.   theme_bw() +
  160.   geom_line(aes(x = date, y = patent_rate_100k)) +
  161.   geom_line(aes(x = date, y = fit_lin_exp + fit_sin), color = "firebrick2") +
  162.   scale_x_date(breaks = pretty_breaks(10)) +
  163.   scale_y_continuous(breaks = pretty_breaks(10)) +
  164.   labs(x = "", y = "Patent Rate per 100k population", title = "US Patent Rate per 100k population")
  165. print(g_patents)
  166.  
  167. # Subtract out the non-cyclical linear/exponential components and graph the
  168. # z-score of the detrended patent rate
  169. g_patents_detrend <-
  170.   ggplot(data = combo) +
  171.   theme_bw() +
  172.   geom_line(aes(x = date, y = (detrend_lin_exp - mean(detrend_lin_exp))/sd(detrend_lin_exp))) +
  173.   geom_hline(yintercept = 0, linetype = "dashed", color = "steelblue2") +
  174.   scale_x_date(breaks = pretty_breaks(10), limits = c(as.Date("1790-01-01"), Sys.Date())) +
  175.   scale_y_continuous(breaks = pretty_breaks(10)) +
  176.   labs(x = "", y = "Z-Score",
  177.        title = "Z-Score of linear/exponential-detrended US Patent Rate per 100k population")
  178. print(g_patents_detrend)
  179.  
  180. plot_grid(g_patents, g_patents_detrend, ncol = 1, align = "hv")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement