Guest User

Untitled

a guest
Apr 25th, 2018
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 70.50 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "## Import Modules"
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 1,
  13. "metadata": {},
  14. "outputs": [],
  15. "source": [
  16. "import pandas\n",
  17. "import numpy as np"
  18. ]
  19. },
  20. {
  21. "cell_type": "markdown",
  22. "metadata": {},
  23. "source": [
  24. "## Import Data"
  25. ]
  26. },
  27. {
  28. "cell_type": "code",
  29. "execution_count": 2,
  30. "metadata": {},
  31. "outputs": [],
  32. "source": [
  33. "data1 = pandas.read_csv(\"C://Users/admin-vicvphq/Desktop/testing/ipgod101.csv\", low_memory = False)"
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": 3,
  39. "metadata": {},
  40. "outputs": [],
  41. "source": [
  42. "data2_part1 = pandas.read_csv(\"C://Users/admin-vicvphq/Desktop/testing/ipgod102_part1.csv\", low_memory = False)"
  43. ]
  44. },
  45. {
  46. "cell_type": "code",
  47. "execution_count": 4,
  48. "metadata": {},
  49. "outputs": [],
  50. "source": [
  51. "data2_part2 = pandas.read_csv(\"C://Users/admin-vicvphq/Desktop/testing/ipgod102_part2.csv\", low_memory = False)"
  52. ]
  53. },
  54. {
  55. "cell_type": "markdown",
  56. "metadata": {},
  57. "source": [
  58. "## Union data2 into one dataframe"
  59. ]
  60. },
  61. {
  62. "cell_type": "code",
  63. "execution_count": 5,
  64. "metadata": {},
  65. "outputs": [],
  66. "source": [
  67. "data2 = pandas.concat([data2_part1, data2_part2], axis = 0)"
  68. ]
  69. },
  70. {
  71. "cell_type": "markdown",
  72. "metadata": {},
  73. "source": [
  74. "## Check Data "
  75. ]
  76. },
  77. {
  78. "cell_type": "code",
  79. "execution_count": 6,
  80. "metadata": {},
  81. "outputs": [
  82. {
  83. "name": "stdout",
  84. "output_type": "stream",
  85. "text": [
  86. "<class 'pandas.core.frame.DataFrame'>\n",
  87. "RangeIndex: 1381534 entries, 0 to 1381533\n",
  88. "Data columns (total 12 columns):\n",
  89. "australian_appl_no 1381534 non-null object\n",
  90. "application_year 1381534 non-null int64\n",
  91. "sealing_year 1381527 non-null float64\n",
  92. "patent_type 1381534 non-null object\n",
  93. "patent_status_type 972381 non-null object\n",
  94. "country 938216 non-null object\n",
  95. "australian 268134 non-null float64\n",
  96. "sealing_date 1381527 non-null object\n",
  97. "primary_ipc_mark_value 849317 non-null object\n",
  98. "entity 752815 non-null float64\n",
  99. "big 31194 non-null float64\n",
  100. "applicant_no 1381534 non-null int64\n",
  101. "dtypes: float64(4), int64(2), object(6)\n",
  102. "memory usage: 94.9+ MB\n"
  103. ]
  104. }
  105. ],
  106. "source": [
  107. "data1.info()"
  108. ]
  109. },
  110. {
  111. "cell_type": "code",
  112. "execution_count": 7,
  113. "metadata": {},
  114. "outputs": [
  115. {
  116. "name": "stdout",
  117. "output_type": "stream",
  118. "text": [
  119. "<class 'pandas.core.frame.DataFrame'>\n",
  120. "Int64Index: 1041145 entries, 0 to 516146\n",
  121. "Data columns (total 25 columns):\n",
  122. "ipa_applt_id 1041145 non-null int64\n",
  123. "australian_appl_no 1041145 non-null object\n",
  124. "appln_type 1041145 non-null object\n",
  125. "name 1041141 non-null object\n",
  126. "cleanname 1041141 non-null object\n",
  127. "country 1041141 non-null object\n",
  128. "australian 301518 non-null float64\n",
  129. "entity 806061 non-null float64\n",
  130. "ipa_id 1040415 non-null float64\n",
  131. "abn 116639 non-null float64\n",
  132. "acn 115536 non-null float64\n",
  133. "source 132897 non-null object\n",
  134. "big 32775 non-null float64\n",
  135. "ultimate 17235 non-null float64\n",
  136. "lat 276002 non-null float64\n",
  137. "lon 276002 non-null float64\n",
  138. "qg 301518 non-null float64\n",
  139. "state 301114 non-null object\n",
  140. "postcode 297944 non-null float64\n",
  141. "patstat_appln_id 572056 non-null float64\n",
  142. "applicant_type 1041145 non-null object\n",
  143. "sa2_main11 275080 non-null float64\n",
  144. "sa2_name11 275080 non-null object\n",
  145. "gcc_code11 275080 non-null object\n",
  146. "gcc_name11 275080 non-null object\n",
  147. "dtypes: float64(13), int64(1), object(11)\n",
  148. "memory usage: 162.8+ MB\n"
  149. ]
  150. }
  151. ],
  152. "source": [
  153. "data2.info()"
  154. ]
  155. },
  156. {
  157. "cell_type": "code",
  158. "execution_count": 8,
  159. "metadata": {},
  160. "outputs": [
  161. {
  162. "data": {
  163. "text/html": [
  164. "<div>\n",
  165. "<style scoped>\n",
  166. " .dataframe tbody tr th:only-of-type {\n",
  167. " vertical-align: middle;\n",
  168. " }\n",
  169. "\n",
  170. " .dataframe tbody tr th {\n",
  171. " vertical-align: top;\n",
  172. " }\n",
  173. "\n",
  174. " .dataframe thead th {\n",
  175. " text-align: right;\n",
  176. " }\n",
  177. "</style>\n",
  178. "<table border=\"1\" class=\"dataframe\">\n",
  179. " <thead>\n",
  180. " <tr style=\"text-align: right;\">\n",
  181. " <th></th>\n",
  182. " <th>application_year</th>\n",
  183. " <th>sealing_year</th>\n",
  184. " <th>australian</th>\n",
  185. " <th>entity</th>\n",
  186. " <th>big</th>\n",
  187. " <th>applicant_no</th>\n",
  188. " </tr>\n",
  189. " </thead>\n",
  190. " <tbody>\n",
  191. " <tr>\n",
  192. " <th>count</th>\n",
  193. " <td>1.381534e+06</td>\n",
  194. " <td>1.381527e+06</td>\n",
  195. " <td>268134.000000</td>\n",
  196. " <td>752815.000000</td>\n",
  197. " <td>31194.000000</td>\n",
  198. " <td>1.381534e+06</td>\n",
  199. " </tr>\n",
  200. " <tr>\n",
  201. " <th>mean</th>\n",
  202. " <td>2.283254e+03</td>\n",
  203. " <td>7.297736e+03</td>\n",
  204. " <td>1.045753</td>\n",
  205. " <td>1.037900</td>\n",
  206. " <td>3.009842</td>\n",
  207. " <td>1.074785e+00</td>\n",
  208. " </tr>\n",
  209. " <tr>\n",
  210. " <th>std</th>\n",
  211. " <td>1.509252e+03</td>\n",
  212. " <td>3.782131e+03</td>\n",
  213. " <td>0.425347</td>\n",
  214. " <td>0.387512</td>\n",
  215. " <td>3.319992</td>\n",
  216. " <td>3.447780e-01</td>\n",
  217. " </tr>\n",
  218. " <tr>\n",
  219. " <th>min</th>\n",
  220. " <td>1.900000e+03</td>\n",
  221. " <td>1.967000e+03</td>\n",
  222. " <td>1.000000</td>\n",
  223. " <td>1.000000</td>\n",
  224. " <td>0.000000</td>\n",
  225. " <td>1.000000e+00</td>\n",
  226. " </tr>\n",
  227. " <tr>\n",
  228. " <th>25%</th>\n",
  229. " <td>1.975000e+03</td>\n",
  230. " <td>2.011000e+03</td>\n",
  231. " <td>1.000000</td>\n",
  232. " <td>1.000000</td>\n",
  233. " <td>1.000000</td>\n",
  234. " <td>1.000000e+00</td>\n",
  235. " </tr>\n",
  236. " <tr>\n",
  237. " <th>50%</th>\n",
  238. " <td>1.996000e+03</td>\n",
  239. " <td>9.999000e+03</td>\n",
  240. " <td>1.000000</td>\n",
  241. " <td>1.000000</td>\n",
  242. " <td>3.000000</td>\n",
  243. " <td>1.000000e+00</td>\n",
  244. " </tr>\n",
  245. " <tr>\n",
  246. " <th>75%</th>\n",
  247. " <td>2.008000e+03</td>\n",
  248. " <td>9.999000e+03</td>\n",
  249. " <td>1.000000</td>\n",
  250. " <td>1.000000</td>\n",
  251. " <td>3.000000</td>\n",
  252. " <td>1.000000e+00</td>\n",
  253. " </tr>\n",
  254. " <tr>\n",
  255. " <th>max</th>\n",
  256. " <td>9.999000e+03</td>\n",
  257. " <td>9.999000e+03</td>\n",
  258. " <td>5.000000</td>\n",
  259. " <td>5.000000</td>\n",
  260. " <td>44.000000</td>\n",
  261. " <td>2.300000e+01</td>\n",
  262. " </tr>\n",
  263. " </tbody>\n",
  264. "</table>\n",
  265. "</div>"
  266. ],
  267. "text/plain": [
  268. " application_year sealing_year australian entity \\\n",
  269. "count 1.381534e+06 1.381527e+06 268134.000000 752815.000000 \n",
  270. "mean 2.283254e+03 7.297736e+03 1.045753 1.037900 \n",
  271. "std 1.509252e+03 3.782131e+03 0.425347 0.387512 \n",
  272. "min 1.900000e+03 1.967000e+03 1.000000 1.000000 \n",
  273. "25% 1.975000e+03 2.011000e+03 1.000000 1.000000 \n",
  274. "50% 1.996000e+03 9.999000e+03 1.000000 1.000000 \n",
  275. "75% 2.008000e+03 9.999000e+03 1.000000 1.000000 \n",
  276. "max 9.999000e+03 9.999000e+03 5.000000 5.000000 \n",
  277. "\n",
  278. " big applicant_no \n",
  279. "count 31194.000000 1.381534e+06 \n",
  280. "mean 3.009842 1.074785e+00 \n",
  281. "std 3.319992 3.447780e-01 \n",
  282. "min 0.000000 1.000000e+00 \n",
  283. "25% 1.000000 1.000000e+00 \n",
  284. "50% 3.000000 1.000000e+00 \n",
  285. "75% 3.000000 1.000000e+00 \n",
  286. "max 44.000000 2.300000e+01 "
  287. ]
  288. },
  289. "execution_count": 8,
  290. "metadata": {},
  291. "output_type": "execute_result"
  292. }
  293. ],
  294. "source": [
  295. "data1.describe()"
  296. ]
  297. },
  298. {
  299. "cell_type": "code",
  300. "execution_count": 9,
  301. "metadata": {},
  302. "outputs": [
  303. {
  304. "data": {
  305. "text/html": [
  306. "<div>\n",
  307. "<style scoped>\n",
  308. " .dataframe tbody tr th:only-of-type {\n",
  309. " vertical-align: middle;\n",
  310. " }\n",
  311. "\n",
  312. " .dataframe tbody tr th {\n",
  313. " vertical-align: top;\n",
  314. " }\n",
  315. "\n",
  316. " .dataframe thead th {\n",
  317. " text-align: right;\n",
  318. " }\n",
  319. "</style>\n",
  320. "<table border=\"1\" class=\"dataframe\">\n",
  321. " <thead>\n",
  322. " <tr style=\"text-align: right;\">\n",
  323. " <th></th>\n",
  324. " <th>ipa_applt_id</th>\n",
  325. " <th>australian</th>\n",
  326. " <th>entity</th>\n",
  327. " <th>ipa_id</th>\n",
  328. " <th>abn</th>\n",
  329. " <th>acn</th>\n",
  330. " <th>big</th>\n",
  331. " <th>ultimate</th>\n",
  332. " <th>lat</th>\n",
  333. " <th>lon</th>\n",
  334. " <th>qg</th>\n",
  335. " <th>postcode</th>\n",
  336. " <th>patstat_appln_id</th>\n",
  337. " <th>sa2_main11</th>\n",
  338. " </tr>\n",
  339. " </thead>\n",
  340. " <tbody>\n",
  341. " <tr>\n",
  342. " <th>count</th>\n",
  343. " <td>1.041145e+06</td>\n",
  344. " <td>301518.0</td>\n",
  345. " <td>806061.0</td>\n",
  346. " <td>1.040415e+06</td>\n",
  347. " <td>1.166390e+05</td>\n",
  348. " <td>1.155360e+05</td>\n",
  349. " <td>32775.000000</td>\n",
  350. " <td>1.723500e+04</td>\n",
  351. " <td>276002.000000</td>\n",
  352. " <td>276002.000000</td>\n",
  353. " <td>301518.000000</td>\n",
  354. " <td>297944.000000</td>\n",
  355. " <td>5.720560e+05</td>\n",
  356. " <td>2.750800e+05</td>\n",
  357. " </tr>\n",
  358. " <tr>\n",
  359. " <th>mean</th>\n",
  360. " <td>3.308055e+06</td>\n",
  361. " <td>1.0</td>\n",
  362. " <td>1.0</td>\n",
  363. " <td>3.529633e+05</td>\n",
  364. " <td>5.381772e+10</td>\n",
  365. " <td>6.559500e+07</td>\n",
  366. " <td>2.777727</td>\n",
  367. " <td>1.219921e+08</td>\n",
  368. " <td>-33.425968</td>\n",
  369. " <td>144.616010</td>\n",
  370. " <td>7.249236</td>\n",
  371. " <td>3515.825075</td>\n",
  372. " <td>1.021049e+08</td>\n",
  373. " <td>2.648798e+08</td>\n",
  374. " </tr>\n",
  375. " <tr>\n",
  376. " <th>std</th>\n",
  377. " <td>3.224939e+06</td>\n",
  378. " <td>0.0</td>\n",
  379. " <td>0.0</td>\n",
  380. " <td>2.450687e+05</td>\n",
  381. " <td>2.627974e+10</td>\n",
  382. " <td>8.338251e+07</td>\n",
  383. " <td>3.331155</td>\n",
  384. " <td>1.254714e+08</td>\n",
  385. " <td>4.929670</td>\n",
  386. " <td>13.768512</td>\n",
  387. " <td>2.425960</td>\n",
  388. " <td>1381.603974</td>\n",
  389. " <td>1.621904e+08</td>\n",
  390. " <td>1.665090e+08</td>\n",
  391. " </tr>\n",
  392. " <tr>\n",
  393. " <th>min</th>\n",
  394. " <td>1.000000e+00</td>\n",
  395. " <td>1.0</td>\n",
  396. " <td>1.0</td>\n",
  397. " <td>6.000000e+00</td>\n",
  398. " <td>9.006074e+09</td>\n",
  399. " <td>3.590000e+02</td>\n",
  400. " <td>0.000000</td>\n",
  401. " <td>7.790000e+02</td>\n",
  402. " <td>-45.903000</td>\n",
  403. " <td>-124.091000</td>\n",
  404. " <td>0.000000</td>\n",
  405. " <td>0.000000</td>\n",
  406. " <td>4.484250e+05</td>\n",
  407. " <td>1.010110e+08</td>\n",
  408. " </tr>\n",
  409. " <tr>\n",
  410. " <th>25%</th>\n",
  411. " <td>2.608740e+05</td>\n",
  412. " <td>1.0</td>\n",
  413. " <td>1.0</td>\n",
  414. " <td>1.643470e+05</td>\n",
  415. " <td>3.107960e+10</td>\n",
  416. " <td>5.526723e+06</td>\n",
  417. " <td>1.000000</td>\n",
  418. " <td>7.140918e+07</td>\n",
  419. " <td>-37.666000</td>\n",
  420. " <td>144.961000</td>\n",
  421. " <td>8.000000</td>\n",
  422. " <td>2258.000000</td>\n",
  423. " <td>2.036613e+06</td>\n",
  424. " <td>1.220214e+08</td>\n",
  425. " </tr>\n",
  426. " <tr>\n",
  427. " <th>50%</th>\n",
  428. " <td>5.211710e+05</td>\n",
  429. " <td>1.0</td>\n",
  430. " <td>1.0</td>\n",
  431. " <td>3.042230e+05</td>\n",
  432. " <td>5.223406e+10</td>\n",
  433. " <td>6.309276e+07</td>\n",
  434. " <td>3.000000</td>\n",
  435. " <td>1.166685e+08</td>\n",
  436. " <td>-33.868999</td>\n",
  437. " <td>149.146000</td>\n",
  438. " <td>8.000000</td>\n",
  439. " <td>3139.000000</td>\n",
  440. " <td>2.663495e+06</td>\n",
  441. " <td>2.090412e+08</td>\n",
  442. " </tr>\n",
  443. " <tr>\n",
  444. " <th>75%</th>\n",
  445. " <td>6.710670e+06</td>\n",
  446. " <td>1.0</td>\n",
  447. " <td>1.0</td>\n",
  448. " <td>5.106935e+05</td>\n",
  449. " <td>7.801041e+10</td>\n",
  450. " <td>1.022062e+08</td>\n",
  451. " <td>3.000000</td>\n",
  452. " <td>1.304912e+08</td>\n",
  453. " <td>-31.962000</td>\n",
  454. " <td>151.211000</td>\n",
  455. " <td>8.000000</td>\n",
  456. " <td>4220.000000</td>\n",
  457. " <td>2.743345e+08</td>\n",
  458. " <td>3.150214e+08</td>\n",
  459. " </tr>\n",
  460. " <tr>\n",
  461. " <th>max</th>\n",
  462. " <td>7.811173e+06</td>\n",
  463. " <td>1.0</td>\n",
  464. " <td>1.0</td>\n",
  465. " <td>9.250380e+05</td>\n",
  466. " <td>9.994756e+10</td>\n",
  467. " <td>6.166277e+08</td>\n",
  468. " <td>44.000000</td>\n",
  469. " <td>6.194705e+08</td>\n",
  470. " <td>56.441002</td>\n",
  471. " <td>174.717000</td>\n",
  472. " <td>9.000000</td>\n",
  473. " <td>9726.000000</td>\n",
  474. " <td>9.600031e+08</td>\n",
  475. " <td>8.010911e+08</td>\n",
  476. " </tr>\n",
  477. " </tbody>\n",
  478. "</table>\n",
  479. "</div>"
  480. ],
  481. "text/plain": [
  482. " ipa_applt_id australian entity ipa_id abn \\\n",
  483. "count 1.041145e+06 301518.0 806061.0 1.040415e+06 1.166390e+05 \n",
  484. "mean 3.308055e+06 1.0 1.0 3.529633e+05 5.381772e+10 \n",
  485. "std 3.224939e+06 0.0 0.0 2.450687e+05 2.627974e+10 \n",
  486. "min 1.000000e+00 1.0 1.0 6.000000e+00 9.006074e+09 \n",
  487. "25% 2.608740e+05 1.0 1.0 1.643470e+05 3.107960e+10 \n",
  488. "50% 5.211710e+05 1.0 1.0 3.042230e+05 5.223406e+10 \n",
  489. "75% 6.710670e+06 1.0 1.0 5.106935e+05 7.801041e+10 \n",
  490. "max 7.811173e+06 1.0 1.0 9.250380e+05 9.994756e+10 \n",
  491. "\n",
  492. " acn big ultimate lat lon \\\n",
  493. "count 1.155360e+05 32775.000000 1.723500e+04 276002.000000 276002.000000 \n",
  494. "mean 6.559500e+07 2.777727 1.219921e+08 -33.425968 144.616010 \n",
  495. "std 8.338251e+07 3.331155 1.254714e+08 4.929670 13.768512 \n",
  496. "min 3.590000e+02 0.000000 7.790000e+02 -45.903000 -124.091000 \n",
  497. "25% 5.526723e+06 1.000000 7.140918e+07 -37.666000 144.961000 \n",
  498. "50% 6.309276e+07 3.000000 1.166685e+08 -33.868999 149.146000 \n",
  499. "75% 1.022062e+08 3.000000 1.304912e+08 -31.962000 151.211000 \n",
  500. "max 6.166277e+08 44.000000 6.194705e+08 56.441002 174.717000 \n",
  501. "\n",
  502. " qg postcode patstat_appln_id sa2_main11 \n",
  503. "count 301518.000000 297944.000000 5.720560e+05 2.750800e+05 \n",
  504. "mean 7.249236 3515.825075 1.021049e+08 2.648798e+08 \n",
  505. "std 2.425960 1381.603974 1.621904e+08 1.665090e+08 \n",
  506. "min 0.000000 0.000000 4.484250e+05 1.010110e+08 \n",
  507. "25% 8.000000 2258.000000 2.036613e+06 1.220214e+08 \n",
  508. "50% 8.000000 3139.000000 2.663495e+06 2.090412e+08 \n",
  509. "75% 8.000000 4220.000000 2.743345e+08 3.150214e+08 \n",
  510. "max 9.000000 9726.000000 9.600031e+08 8.010911e+08 "
  511. ]
  512. },
  513. "execution_count": 9,
  514. "metadata": {},
  515. "output_type": "execute_result"
  516. }
  517. ],
  518. "source": [
  519. "data2.describe()"
  520. ]
  521. },
  522. {
  523. "cell_type": "code",
  524. "execution_count": 10,
  525. "metadata": {},
  526. "outputs": [
  527. {
  528. "data": {
  529. "text/plain": [
  530. "(1381534, 12)"
  531. ]
  532. },
  533. "execution_count": 10,
  534. "metadata": {},
  535. "output_type": "execute_result"
  536. }
  537. ],
  538. "source": [
  539. "data1.shape"
  540. ]
  541. },
  542. {
  543. "cell_type": "code",
  544. "execution_count": 11,
  545. "metadata": {},
  546. "outputs": [
  547. {
  548. "data": {
  549. "text/plain": [
  550. "(1041145, 25)"
  551. ]
  552. },
  553. "execution_count": 11,
  554. "metadata": {},
  555. "output_type": "execute_result"
  556. }
  557. ],
  558. "source": [
  559. "data2.shape"
  560. ]
  561. },
  562. {
  563. "cell_type": "code",
  564. "execution_count": 44,
  565. "metadata": {},
  566. "outputs": [
  567. {
  568. "data": {
  569. "text/plain": [
  570. "Index(['australian_appl_no', 'application_year', 'sealing_year', 'patent_type',\n",
  571. " 'patent_status_type', 'country', 'australian', 'sealing_date',\n",
  572. " 'primary_ipc_mark_value', 'entity', 'big', 'applicant_no'],\n",
  573. " dtype='object')"
  574. ]
  575. },
  576. "execution_count": 44,
  577. "metadata": {},
  578. "output_type": "execute_result"
  579. }
  580. ],
  581. "source": [
  582. "data1.columns"
  583. ]
  584. },
  585. {
  586. "cell_type": "code",
  587. "execution_count": 45,
  588. "metadata": {},
  589. "outputs": [
  590. {
  591. "data": {
  592. "text/plain": [
  593. "Index(['ipa_applt_id', 'australian_appl_no', 'appln_type', 'name', 'cleanname',\n",
  594. " 'country', 'australian', 'entity', 'ipa_id', 'abn', 'acn', 'source',\n",
  595. " 'big', 'ultimate', 'lat', 'lon', 'qg', 'state', 'postcode',\n",
  596. " 'patstat_appln_id', 'applicant_type', 'sa2_main11', 'sa2_name11',\n",
  597. " 'gcc_code11', 'gcc_name11'],\n",
  598. " dtype='object')"
  599. ]
  600. },
  601. "execution_count": 45,
  602. "metadata": {},
  603. "output_type": "execute_result"
  604. }
  605. ],
  606. "source": [
  607. "data2.columns"
  608. ]
  609. },
  610. {
  611. "cell_type": "code",
  612. "execution_count": 12,
  613. "metadata": {},
  614. "outputs": [],
  615. "source": [
  616. "data1_2016 = data1[data1[\"application_year\"] == 2016]"
  617. ]
  618. },
  619. {
  620. "cell_type": "code",
  621. "execution_count": 13,
  622. "metadata": {},
  623. "outputs": [
  624. {
  625. "data": {
  626. "text/plain": [
  627. "(36059, 12)"
  628. ]
  629. },
  630. "execution_count": 13,
  631. "metadata": {},
  632. "output_type": "execute_result"
  633. }
  634. ],
  635. "source": [
  636. "data1_2016.shape"
  637. ]
  638. },
  639. {
  640. "cell_type": "markdown",
  641. "metadata": {},
  642. "source": [
  643. "## Create merged dataset"
  644. ]
  645. },
  646. {
  647. "cell_type": "code",
  648. "execution_count": 14,
  649. "metadata": {},
  650. "outputs": [],
  651. "source": [
  652. "data_merged = data1_2016.merge(right = data2, how = \"left\", on = \"australian_appl_no\")"
  653. ]
  654. },
  655. {
  656. "cell_type": "code",
  657. "execution_count": 15,
  658. "metadata": {},
  659. "outputs": [
  660. {
  661. "data": {
  662. "text/html": [
  663. "<div>\n",
  664. "<style scoped>\n",
  665. " .dataframe tbody tr th:only-of-type {\n",
  666. " vertical-align: middle;\n",
  667. " }\n",
  668. "\n",
  669. " .dataframe tbody tr th {\n",
  670. " vertical-align: top;\n",
  671. " }\n",
  672. "\n",
  673. " .dataframe thead th {\n",
  674. " text-align: right;\n",
  675. " }\n",
  676. "</style>\n",
  677. "<table border=\"1\" class=\"dataframe\">\n",
  678. " <thead>\n",
  679. " <tr style=\"text-align: right;\">\n",
  680. " <th></th>\n",
  681. " <th>australian_appl_no</th>\n",
  682. " <th>application_year</th>\n",
  683. " <th>sealing_year</th>\n",
  684. " <th>patent_type</th>\n",
  685. " <th>patent_status_type</th>\n",
  686. " <th>country_x</th>\n",
  687. " <th>australian_x</th>\n",
  688. " <th>sealing_date</th>\n",
  689. " <th>primary_ipc_mark_value</th>\n",
  690. " <th>entity_x</th>\n",
  691. " <th>...</th>\n",
  692. " <th>lon</th>\n",
  693. " <th>qg</th>\n",
  694. " <th>state</th>\n",
  695. " <th>postcode</th>\n",
  696. " <th>patstat_appln_id</th>\n",
  697. " <th>applicant_type</th>\n",
  698. " <th>sa2_main11</th>\n",
  699. " <th>sa2_name11</th>\n",
  700. " <th>gcc_code11</th>\n",
  701. " <th>gcc_name11</th>\n",
  702. " </tr>\n",
  703. " </thead>\n",
  704. " <tbody>\n",
  705. " <tr>\n",
  706. " <th>0</th>\n",
  707. " <td>2011212055</td>\n",
  708. " <td>2016</td>\n",
  709. " <td>9999.0</td>\n",
  710. " <td>Standard</td>\n",
  711. " <td>FILED</td>\n",
  712. " <td>US</td>\n",
  713. " <td>NaN</td>\n",
  714. " <td>9999-12-31</td>\n",
  715. " <td>C11D 1/12</td>\n",
  716. " <td>1.0</td>\n",
  717. " <td>...</td>\n",
  718. " <td>NaN</td>\n",
  719. " <td>NaN</td>\n",
  720. " <td>NaN</td>\n",
  721. " <td>NaN</td>\n",
  722. " <td>NaN</td>\n",
  723. " <td>International</td>\n",
  724. " <td>NaN</td>\n",
  725. " <td>NaN</td>\n",
  726. " <td>NaN</td>\n",
  727. " <td>NaN</td>\n",
  728. " </tr>\n",
  729. " <tr>\n",
  730. " <th>1</th>\n",
  731. " <td>2011291595</td>\n",
  732. " <td>2016</td>\n",
  733. " <td>9999.0</td>\n",
  734. " <td>Standard</td>\n",
  735. " <td>FILED</td>\n",
  736. " <td>US</td>\n",
  737. " <td>NaN</td>\n",
  738. " <td>9999-12-31</td>\n",
  739. " <td>B23D 51/01</td>\n",
  740. " <td>NaN</td>\n",
  741. " <td>...</td>\n",
  742. " <td>NaN</td>\n",
  743. " <td>NaN</td>\n",
  744. " <td>NaN</td>\n",
  745. " <td>NaN</td>\n",
  746. " <td>NaN</td>\n",
  747. " <td>International</td>\n",
  748. " <td>NaN</td>\n",
  749. " <td>NaN</td>\n",
  750. " <td>NaN</td>\n",
  751. " <td>NaN</td>\n",
  752. " </tr>\n",
  753. " <tr>\n",
  754. " <th>2</th>\n",
  755. " <td>2012268660</td>\n",
  756. " <td>2016</td>\n",
  757. " <td>9999.0</td>\n",
  758. " <td>Standard</td>\n",
  759. " <td>FILED</td>\n",
  760. " <td>US</td>\n",
  761. " <td>NaN</td>\n",
  762. " <td>9999-12-31</td>\n",
  763. " <td>C09D 5/00</td>\n",
  764. " <td>1.0</td>\n",
  765. " <td>...</td>\n",
  766. " <td>NaN</td>\n",
  767. " <td>NaN</td>\n",
  768. " <td>NaN</td>\n",
  769. " <td>NaN</td>\n",
  770. " <td>NaN</td>\n",
  771. " <td>International</td>\n",
  772. " <td>NaN</td>\n",
  773. " <td>NaN</td>\n",
  774. " <td>NaN</td>\n",
  775. " <td>NaN</td>\n",
  776. " </tr>\n",
  777. " <tr>\n",
  778. " <th>3</th>\n",
  779. " <td>2012306228</td>\n",
  780. " <td>2016</td>\n",
  781. " <td>9999.0</td>\n",
  782. " <td>Standard</td>\n",
  783. " <td>FILED</td>\n",
  784. " <td>FI</td>\n",
  785. " <td>NaN</td>\n",
  786. " <td>9999-12-31</td>\n",
  787. " <td>B65D 90/12</td>\n",
  788. " <td>5.0</td>\n",
  789. " <td>...</td>\n",
  790. " <td>NaN</td>\n",
  791. " <td>NaN</td>\n",
  792. " <td>NaN</td>\n",
  793. " <td>NaN</td>\n",
  794. " <td>NaN</td>\n",
  795. " <td>International</td>\n",
  796. " <td>NaN</td>\n",
  797. " <td>NaN</td>\n",
  798. " <td>NaN</td>\n",
  799. " <td>NaN</td>\n",
  800. " </tr>\n",
  801. " <tr>\n",
  802. " <th>4</th>\n",
  803. " <td>2012306228</td>\n",
  804. " <td>2016</td>\n",
  805. " <td>9999.0</td>\n",
  806. " <td>Standard</td>\n",
  807. " <td>FILED</td>\n",
  808. " <td>FI</td>\n",
  809. " <td>NaN</td>\n",
  810. " <td>9999-12-31</td>\n",
  811. " <td>B65D 90/12</td>\n",
  812. " <td>5.0</td>\n",
  813. " <td>...</td>\n",
  814. " <td>NaN</td>\n",
  815. " <td>NaN</td>\n",
  816. " <td>NaN</td>\n",
  817. " <td>NaN</td>\n",
  818. " <td>NaN</td>\n",
  819. " <td>International</td>\n",
  820. " <td>NaN</td>\n",
  821. " <td>NaN</td>\n",
  822. " <td>NaN</td>\n",
  823. " <td>NaN</td>\n",
  824. " </tr>\n",
  825. " </tbody>\n",
  826. "</table>\n",
  827. "<p>5 rows × 36 columns</p>\n",
  828. "</div>"
  829. ],
  830. "text/plain": [
  831. " australian_appl_no application_year sealing_year patent_type \\\n",
  832. "0 2011212055 2016 9999.0 Standard \n",
  833. "1 2011291595 2016 9999.0 Standard \n",
  834. "2 2012268660 2016 9999.0 Standard \n",
  835. "3 2012306228 2016 9999.0 Standard \n",
  836. "4 2012306228 2016 9999.0 Standard \n",
  837. "\n",
  838. " patent_status_type country_x australian_x sealing_date \\\n",
  839. "0 FILED US NaN 9999-12-31 \n",
  840. "1 FILED US NaN 9999-12-31 \n",
  841. "2 FILED US NaN 9999-12-31 \n",
  842. "3 FILED FI NaN 9999-12-31 \n",
  843. "4 FILED FI NaN 9999-12-31 \n",
  844. "\n",
  845. " primary_ipc_mark_value entity_x ... lon qg state postcode \\\n",
  846. "0 C11D 1/12 1.0 ... NaN NaN NaN NaN \n",
  847. "1 B23D 51/01 NaN ... NaN NaN NaN NaN \n",
  848. "2 C09D 5/00 1.0 ... NaN NaN NaN NaN \n",
  849. "3 B65D 90/12 5.0 ... NaN NaN NaN NaN \n",
  850. "4 B65D 90/12 5.0 ... NaN NaN NaN NaN \n",
  851. "\n",
  852. " patstat_appln_id applicant_type sa2_main11 sa2_name11 gcc_code11 \\\n",
  853. "0 NaN International NaN NaN NaN \n",
  854. "1 NaN International NaN NaN NaN \n",
  855. "2 NaN International NaN NaN NaN \n",
  856. "3 NaN International NaN NaN NaN \n",
  857. "4 NaN International NaN NaN NaN \n",
  858. "\n",
  859. " gcc_name11 \n",
  860. "0 NaN \n",
  861. "1 NaN \n",
  862. "2 NaN \n",
  863. "3 NaN \n",
  864. "4 NaN \n",
  865. "\n",
  866. "[5 rows x 36 columns]"
  867. ]
  868. },
  869. "execution_count": 15,
  870. "metadata": {},
  871. "output_type": "execute_result"
  872. }
  873. ],
  874. "source": [
  875. "data_merged.head(5)"
  876. ]
  877. },
  878. {
  879. "cell_type": "code",
  880. "execution_count": 16,
  881. "metadata": {},
  882. "outputs": [
  883. {
  884. "data": {
  885. "text/plain": [
  886. "Index(['australian_appl_no', 'application_year', 'sealing_year', 'patent_type',\n",
  887. " 'patent_status_type', 'country_x', 'australian_x', 'sealing_date',\n",
  888. " 'primary_ipc_mark_value', 'entity_x', 'big_x', 'applicant_no',\n",
  889. " 'ipa_applt_id', 'appln_type', 'name', 'cleanname', 'country_y',\n",
  890. " 'australian_y', 'entity_y', 'ipa_id', 'abn', 'acn', 'source', 'big_y',\n",
  891. " 'ultimate', 'lat', 'lon', 'qg', 'state', 'postcode', 'patstat_appln_id',\n",
  892. " 'applicant_type', 'sa2_main11', 'sa2_name11', 'gcc_code11',\n",
  893. " 'gcc_name11'],\n",
  894. " dtype='object')"
  895. ]
  896. },
  897. "execution_count": 16,
  898. "metadata": {},
  899. "output_type": "execute_result"
  900. }
  901. ],
  902. "source": [
  903. "data_merged.columns"
  904. ]
  905. },
  906. {
  907. "cell_type": "markdown",
  908. "metadata": {},
  909. "source": [
  910. "## Check shape of merged dataset, note there are 39,298 patents in calendar year 2016"
  911. ]
  912. },
  913. {
  914. "cell_type": "code",
  915. "execution_count": 17,
  916. "metadata": {},
  917. "outputs": [
  918. {
  919. "data": {
  920. "text/plain": [
  921. "(39298, 36)"
  922. ]
  923. },
  924. "execution_count": 17,
  925. "metadata": {},
  926. "output_type": "execute_result"
  927. }
  928. ],
  929. "source": [
  930. "data_merged.shape"
  931. ]
  932. },
  933. {
  934. "cell_type": "markdown",
  935. "metadata": {},
  936. "source": [
  937. "## Create subset of merged dataset containing only coordinates "
  938. ]
  939. },
  940. {
  941. "cell_type": "code",
  942. "execution_count": 18,
  943. "metadata": {},
  944. "outputs": [],
  945. "source": [
  946. "lat_long_only = data_merged[['lat','lon']]"
  947. ]
  948. },
  949. {
  950. "cell_type": "code",
  951. "execution_count": 19,
  952. "metadata": {},
  953. "outputs": [
  954. {
  955. "name": "stdout",
  956. "output_type": "stream",
  957. "text": [
  958. "<class 'pandas.core.frame.DataFrame'>\n",
  959. "Int64Index: 39298 entries, 0 to 39297\n",
  960. "Data columns (total 2 columns):\n",
  961. "lat 7825 non-null float64\n",
  962. "lon 7825 non-null float64\n",
  963. "dtypes: float64(2)\n",
  964. "memory usage: 921.0 KB\n"
  965. ]
  966. }
  967. ],
  968. "source": [
  969. "lat_long_only.info()"
  970. ]
  971. },
  972. {
  973. "cell_type": "markdown",
  974. "metadata": {},
  975. "source": [
  976. "## Drop rows with null values"
  977. ]
  978. },
  979. {
  980. "cell_type": "code",
  981. "execution_count": 20,
  982. "metadata": {},
  983. "outputs": [],
  984. "source": [
  985. "lat_long_only2 = lat_long_only.dropna()"
  986. ]
  987. },
  988. {
  989. "cell_type": "markdown",
  990. "metadata": {},
  991. "source": [
  992. "## Note only 7825 rows remain"
  993. ]
  994. },
  995. {
  996. "cell_type": "code",
  997. "execution_count": 21,
  998. "metadata": {},
  999. "outputs": [
  1000. {
  1001. "data": {
  1002. "text/plain": [
  1003. "(7825, 2)"
  1004. ]
  1005. },
  1006. "execution_count": 21,
  1007. "metadata": {},
  1008. "output_type": "execute_result"
  1009. }
  1010. ],
  1011. "source": [
  1012. "lat_long_only2.shape"
  1013. ]
  1014. },
  1015. {
  1016. "cell_type": "markdown",
  1017. "metadata": {},
  1018. "source": [
  1019. "## The percentage of patent applications with usable coordinates in 2016"
  1020. ]
  1021. },
  1022. {
  1023. "cell_type": "code",
  1024. "execution_count": 22,
  1025. "metadata": {},
  1026. "outputs": [
  1027. {
  1028. "data": {
  1029. "text/plain": [
  1030. "19.9119548068604"
  1031. ]
  1032. },
  1033. "execution_count": 22,
  1034. "metadata": {},
  1035. "output_type": "execute_result"
  1036. }
  1037. ],
  1038. "source": [
  1039. "lat_long_only2.shape[0]/data_merged.shape[0] * 100"
  1040. ]
  1041. },
  1042. {
  1043. "cell_type": "markdown",
  1044. "metadata": {},
  1045. "source": [
  1046. "## Export the data to csv "
  1047. ]
  1048. },
  1049. {
  1050. "cell_type": "code",
  1051. "execution_count": 23,
  1052. "metadata": {},
  1053. "outputs": [],
  1054. "source": [
  1055. "lat_long_only2.to_csv(\"patent_coordinates_2016.csv\")"
  1056. ]
  1057. },
  1058. {
  1059. "cell_type": "code",
  1060. "execution_count": 24,
  1061. "metadata": {},
  1062. "outputs": [
  1063. {
  1064. "data": {
  1065. "text/html": [
  1066. "<div>\n",
  1067. "<style scoped>\n",
  1068. " .dataframe tbody tr th:only-of-type {\n",
  1069. " vertical-align: middle;\n",
  1070. " }\n",
  1071. "\n",
  1072. " .dataframe tbody tr th {\n",
  1073. " vertical-align: top;\n",
  1074. " }\n",
  1075. "\n",
  1076. " .dataframe thead th {\n",
  1077. " text-align: right;\n",
  1078. " }\n",
  1079. "</style>\n",
  1080. "<table border=\"1\" class=\"dataframe\">\n",
  1081. " <thead>\n",
  1082. " <tr style=\"text-align: right;\">\n",
  1083. " <th></th>\n",
  1084. " <th>australian_appl_no</th>\n",
  1085. " <th>application_year</th>\n",
  1086. " <th>sealing_year</th>\n",
  1087. " <th>patent_type</th>\n",
  1088. " <th>patent_status_type</th>\n",
  1089. " <th>country</th>\n",
  1090. " <th>australian</th>\n",
  1091. " <th>sealing_date</th>\n",
  1092. " <th>primary_ipc_mark_value</th>\n",
  1093. " <th>entity</th>\n",
  1094. " <th>big</th>\n",
  1095. " <th>applicant_no</th>\n",
  1096. " </tr>\n",
  1097. " </thead>\n",
  1098. " <tbody>\n",
  1099. " <tr>\n",
  1100. " <th>0</th>\n",
  1101. " <td>1904000001</td>\n",
  1102. " <td>1904</td>\n",
  1103. " <td>9999.0</td>\n",
  1104. " <td>Standard</td>\n",
  1105. " <td>NaN</td>\n",
  1106. " <td>NaN</td>\n",
  1107. " <td>NaN</td>\n",
  1108. " <td>9999-12-31</td>\n",
  1109. " <td>NaN</td>\n",
  1110. " <td>NaN</td>\n",
  1111. " <td>NaN</td>\n",
  1112. " <td>1</td>\n",
  1113. " </tr>\n",
  1114. " <tr>\n",
  1115. " <th>1</th>\n",
  1116. " <td>1904000004</td>\n",
  1117. " <td>1904</td>\n",
  1118. " <td>9999.0</td>\n",
  1119. " <td>Standard</td>\n",
  1120. " <td>NaN</td>\n",
  1121. " <td>NaN</td>\n",
  1122. " <td>NaN</td>\n",
  1123. " <td>9999-12-31</td>\n",
  1124. " <td>NaN</td>\n",
  1125. " <td>NaN</td>\n",
  1126. " <td>NaN</td>\n",
  1127. " <td>1</td>\n",
  1128. " </tr>\n",
  1129. " <tr>\n",
  1130. " <th>2</th>\n",
  1131. " <td>1904000006</td>\n",
  1132. " <td>1904</td>\n",
  1133. " <td>9999.0</td>\n",
  1134. " <td>Standard</td>\n",
  1135. " <td>NaN</td>\n",
  1136. " <td>NaN</td>\n",
  1137. " <td>NaN</td>\n",
  1138. " <td>9999-12-31</td>\n",
  1139. " <td>NaN</td>\n",
  1140. " <td>NaN</td>\n",
  1141. " <td>NaN</td>\n",
  1142. " <td>1</td>\n",
  1143. " </tr>\n",
  1144. " <tr>\n",
  1145. " <th>3</th>\n",
  1146. " <td>1904000009</td>\n",
  1147. " <td>1904</td>\n",
  1148. " <td>9999.0</td>\n",
  1149. " <td>Standard</td>\n",
  1150. " <td>NaN</td>\n",
  1151. " <td>NaN</td>\n",
  1152. " <td>NaN</td>\n",
  1153. " <td>9999-12-31</td>\n",
  1154. " <td>NaN</td>\n",
  1155. " <td>NaN</td>\n",
  1156. " <td>NaN</td>\n",
  1157. " <td>1</td>\n",
  1158. " </tr>\n",
  1159. " <tr>\n",
  1160. " <th>4</th>\n",
  1161. " <td>1904000010</td>\n",
  1162. " <td>1904</td>\n",
  1163. " <td>9999.0</td>\n",
  1164. " <td>Standard</td>\n",
  1165. " <td>NaN</td>\n",
  1166. " <td>NaN</td>\n",
  1167. " <td>NaN</td>\n",
  1168. " <td>9999-12-31</td>\n",
  1169. " <td>NaN</td>\n",
  1170. " <td>NaN</td>\n",
  1171. " <td>NaN</td>\n",
  1172. " <td>1</td>\n",
  1173. " </tr>\n",
  1174. " </tbody>\n",
  1175. "</table>\n",
  1176. "</div>"
  1177. ],
  1178. "text/plain": [
  1179. " australian_appl_no application_year sealing_year patent_type \\\n",
  1180. "0 1904000001 1904 9999.0 Standard \n",
  1181. "1 1904000004 1904 9999.0 Standard \n",
  1182. "2 1904000006 1904 9999.0 Standard \n",
  1183. "3 1904000009 1904 9999.0 Standard \n",
  1184. "4 1904000010 1904 9999.0 Standard \n",
  1185. "\n",
  1186. " patent_status_type country australian sealing_date primary_ipc_mark_value \\\n",
  1187. "0 NaN NaN NaN 9999-12-31 NaN \n",
  1188. "1 NaN NaN NaN 9999-12-31 NaN \n",
  1189. "2 NaN NaN NaN 9999-12-31 NaN \n",
  1190. "3 NaN NaN NaN 9999-12-31 NaN \n",
  1191. "4 NaN NaN NaN 9999-12-31 NaN \n",
  1192. "\n",
  1193. " entity big applicant_no \n",
  1194. "0 NaN NaN 1 \n",
  1195. "1 NaN NaN 1 \n",
  1196. "2 NaN NaN 1 \n",
  1197. "3 NaN NaN 1 \n",
  1198. "4 NaN NaN 1 "
  1199. ]
  1200. },
  1201. "execution_count": 24,
  1202. "metadata": {},
  1203. "output_type": "execute_result"
  1204. }
  1205. ],
  1206. "source": [
  1207. "data1.head()"
  1208. ]
  1209. },
  1210. {
  1211. "cell_type": "code",
  1212. "execution_count": 25,
  1213. "metadata": {},
  1214. "outputs": [],
  1215. "source": [
  1216. "subset = data1[[\"australian_appl_no\", \"application_year\"]]"
  1217. ]
  1218. },
  1219. {
  1220. "cell_type": "code",
  1221. "execution_count": 27,
  1222. "metadata": {},
  1223. "outputs": [
  1224. {
  1225. "data": {
  1226. "text/html": [
  1227. "<div>\n",
  1228. "<style scoped>\n",
  1229. " .dataframe tbody tr th:only-of-type {\n",
  1230. " vertical-align: middle;\n",
  1231. " }\n",
  1232. "\n",
  1233. " .dataframe tbody tr th {\n",
  1234. " vertical-align: top;\n",
  1235. " }\n",
  1236. "\n",
  1237. " .dataframe thead th {\n",
  1238. " text-align: right;\n",
  1239. " }\n",
  1240. "</style>\n",
  1241. "<table border=\"1\" class=\"dataframe\">\n",
  1242. " <thead>\n",
  1243. " <tr style=\"text-align: right;\">\n",
  1244. " <th></th>\n",
  1245. " <th>australian_appl_no</th>\n",
  1246. " <th>application_year</th>\n",
  1247. " </tr>\n",
  1248. " </thead>\n",
  1249. " <tbody>\n",
  1250. " <tr>\n",
  1251. " <th>0</th>\n",
  1252. " <td>1904000001</td>\n",
  1253. " <td>1904</td>\n",
  1254. " </tr>\n",
  1255. " <tr>\n",
  1256. " <th>1</th>\n",
  1257. " <td>1904000004</td>\n",
  1258. " <td>1904</td>\n",
  1259. " </tr>\n",
  1260. " <tr>\n",
  1261. " <th>2</th>\n",
  1262. " <td>1904000006</td>\n",
  1263. " <td>1904</td>\n",
  1264. " </tr>\n",
  1265. " <tr>\n",
  1266. " <th>3</th>\n",
  1267. " <td>1904000009</td>\n",
  1268. " <td>1904</td>\n",
  1269. " </tr>\n",
  1270. " <tr>\n",
  1271. " <th>4</th>\n",
  1272. " <td>1904000010</td>\n",
  1273. " <td>1904</td>\n",
  1274. " </tr>\n",
  1275. " </tbody>\n",
  1276. "</table>\n",
  1277. "</div>"
  1278. ],
  1279. "text/plain": [
  1280. " australian_appl_no application_year\n",
  1281. "0 1904000001 1904\n",
  1282. "1 1904000004 1904\n",
  1283. "2 1904000006 1904\n",
  1284. "3 1904000009 1904\n",
  1285. "4 1904000010 1904"
  1286. ]
  1287. },
  1288. "execution_count": 27,
  1289. "metadata": {},
  1290. "output_type": "execute_result"
  1291. }
  1292. ],
  1293. "source": [
  1294. "subset.head()"
  1295. ]
  1296. },
  1297. {
  1298. "cell_type": "code",
  1299. "execution_count": 33,
  1300. "metadata": {},
  1301. "outputs": [
  1302. {
  1303. "name": "stdout",
  1304. "output_type": "stream",
  1305. "text": [
  1306. "<class 'pandas.core.frame.DataFrame'>\n",
  1307. "Int64Index: 1041145 entries, 0 to 516146\n",
  1308. "Data columns (total 25 columns):\n",
  1309. "ipa_applt_id 1041145 non-null int64\n",
  1310. "australian_appl_no 1041145 non-null object\n",
  1311. "appln_type 1041145 non-null object\n",
  1312. "name 1041141 non-null object\n",
  1313. "cleanname 1041141 non-null object\n",
  1314. "country 1041141 non-null object\n",
  1315. "australian 301518 non-null float64\n",
  1316. "entity 806061 non-null float64\n",
  1317. "ipa_id 1040415 non-null float64\n",
  1318. "abn 116639 non-null float64\n",
  1319. "acn 115536 non-null float64\n",
  1320. "source 132897 non-null object\n",
  1321. "big 32775 non-null float64\n",
  1322. "ultimate 17235 non-null float64\n",
  1323. "lat 276002 non-null float64\n",
  1324. "lon 276002 non-null float64\n",
  1325. "qg 301518 non-null float64\n",
  1326. "state 301114 non-null object\n",
  1327. "postcode 297944 non-null float64\n",
  1328. "patstat_appln_id 572056 non-null float64\n",
  1329. "applicant_type 1041145 non-null object\n",
  1330. "sa2_main11 275080 non-null float64\n",
  1331. "sa2_name11 275080 non-null object\n",
  1332. "gcc_code11 275080 non-null object\n",
  1333. "gcc_name11 275080 non-null object\n",
  1334. "dtypes: float64(13), int64(1), object(11)\n",
  1335. "memory usage: 162.8+ MB\n"
  1336. ]
  1337. }
  1338. ],
  1339. "source": [
  1340. "data2.info()"
  1341. ]
  1342. },
  1343. {
  1344. "cell_type": "code",
  1345. "execution_count": 43,
  1346. "metadata": {},
  1347. "outputs": [
  1348. {
  1349. "data": {
  1350. "text/plain": [
  1351. "2047 1\n",
  1352. "541365 1\n",
  1353. "7631828 1\n",
  1354. "524189 1\n",
  1355. "7625681 1\n",
  1356. "6768861 1\n",
  1357. "505826 1\n",
  1358. "6524284 1\n",
  1359. "5223343 1\n",
  1360. "6555513 1\n",
  1361. "6774683 1\n",
  1362. "27098 1\n",
  1363. "235569 1\n",
  1364. "7809983 1\n",
  1365. "7807934 1\n",
  1366. "497638 1\n",
  1367. "7627734 1\n",
  1368. "18910 1\n",
  1369. "270498 1\n",
  1370. "6552748 1\n",
  1371. "6548848 1\n",
  1372. "227373 1\n",
  1373. "382202 1\n",
  1374. "522218 1\n",
  1375. "7604296 1\n",
  1376. "317071 1\n",
  1377. "110529 1\n",
  1378. "6828574 1\n",
  1379. "6540660 1\n",
  1380. "524989 1\n",
  1381. " ..\n",
  1382. "6892426 1\n",
  1383. "6890379 1\n",
  1384. "6816687 1\n",
  1385. "6818734 1\n",
  1386. "420674 1\n",
  1387. "6822828 1\n",
  1388. "6824875 1\n",
  1389. "6826922 1\n",
  1390. "537513 1\n",
  1391. "6831016 1\n",
  1392. "6833063 1\n",
  1393. "6835110 1\n",
  1394. "6837157 1\n",
  1395. "6839204 1\n",
  1396. "6841251 1\n",
  1397. "6843298 1\n",
  1398. "6845345 1\n",
  1399. "6847392 1\n",
  1400. "6914975 1\n",
  1401. "6917022 1\n",
  1402. "6919069 1\n",
  1403. "6921116 1\n",
  1404. "300736 1\n",
  1405. "366608 1\n",
  1406. "6605461 1\n",
  1407. "6610917 1\n",
  1408. "6884238 1\n",
  1409. "6886285 1\n",
  1410. "6888332 1\n",
  1411. "2049 1\n",
  1412. "Name: ipa_applt_id, Length: 1041145, dtype: int64"
  1413. ]
  1414. },
  1415. "execution_count": 43,
  1416. "metadata": {},
  1417. "output_type": "execute_result"
  1418. }
  1419. ],
  1420. "source": [
  1421. "data2[\"ipa_applt_id\"].value_counts()"
  1422. ]
  1423. },
  1424. {
  1425. "cell_type": "code",
  1426. "execution_count": 38,
  1427. "metadata": {},
  1428. "outputs": [
  1429. {
  1430. "data": {
  1431. "text/html": [
  1432. "<div>\n",
  1433. "<style scoped>\n",
  1434. " .dataframe tbody tr th:only-of-type {\n",
  1435. " vertical-align: middle;\n",
  1436. " }\n",
  1437. "\n",
  1438. " .dataframe tbody tr th {\n",
  1439. " vertical-align: top;\n",
  1440. " }\n",
  1441. "\n",
  1442. " .dataframe thead th {\n",
  1443. " text-align: right;\n",
  1444. " }\n",
  1445. "</style>\n",
  1446. "<table border=\"1\" class=\"dataframe\">\n",
  1447. " <thead>\n",
  1448. " <tr style=\"text-align: right;\">\n",
  1449. " <th></th>\n",
  1450. " <th>ipa_applt_id</th>\n",
  1451. " <th>australian_appl_no</th>\n",
  1452. " <th>appln_type</th>\n",
  1453. " <th>name</th>\n",
  1454. " <th>cleanname</th>\n",
  1455. " <th>country</th>\n",
  1456. " <th>australian</th>\n",
  1457. " <th>entity</th>\n",
  1458. " <th>ipa_id</th>\n",
  1459. " <th>abn</th>\n",
  1460. " <th>...</th>\n",
  1461. " <th>lon</th>\n",
  1462. " <th>qg</th>\n",
  1463. " <th>state</th>\n",
  1464. " <th>postcode</th>\n",
  1465. " <th>patstat_appln_id</th>\n",
  1466. " <th>applicant_type</th>\n",
  1467. " <th>sa2_main11</th>\n",
  1468. " <th>sa2_name11</th>\n",
  1469. " <th>gcc_code11</th>\n",
  1470. " <th>gcc_name11</th>\n",
  1471. " </tr>\n",
  1472. " </thead>\n",
  1473. " <tbody>\n",
  1474. " <tr>\n",
  1475. " <th>328868</th>\n",
  1476. " <td>505022</td>\n",
  1477. " <td>2011242127</td>\n",
  1478. " <td>PAT</td>\n",
  1479. " <td>Ryba, Eric</td>\n",
  1480. " <td>RYBA, ERIC</td>\n",
  1481. " <td>US</td>\n",
  1482. " <td>NaN</td>\n",
  1483. " <td>NaN</td>\n",
  1484. " <td>523838.0</td>\n",
  1485. " <td>NaN</td>\n",
  1486. " <td>...</td>\n",
  1487. " <td>NaN</td>\n",
  1488. " <td>NaN</td>\n",
  1489. " <td>NaN</td>\n",
  1490. " <td>NaN</td>\n",
  1491. " <td>365754711.0</td>\n",
  1492. " <td>International</td>\n",
  1493. " <td>NaN</td>\n",
  1494. " <td>NaN</td>\n",
  1495. " <td>NaN</td>\n",
  1496. " <td>NaN</td>\n",
  1497. " </tr>\n",
  1498. " <tr>\n",
  1499. " <th>328869</th>\n",
  1500. " <td>505018</td>\n",
  1501. " <td>2011242127</td>\n",
  1502. " <td>PAT</td>\n",
  1503. " <td>Mullins, Barry</td>\n",
  1504. " <td>MULLINS, BARRY</td>\n",
  1505. " <td>IE</td>\n",
  1506. " <td>NaN</td>\n",
  1507. " <td>NaN</td>\n",
  1508. " <td>764553.0</td>\n",
  1509. " <td>NaN</td>\n",
  1510. " <td>...</td>\n",
  1511. " <td>NaN</td>\n",
  1512. " <td>NaN</td>\n",
  1513. " <td>NaN</td>\n",
  1514. " <td>NaN</td>\n",
  1515. " <td>365754711.0</td>\n",
  1516. " <td>International</td>\n",
  1517. " <td>NaN</td>\n",
  1518. " <td>NaN</td>\n",
  1519. " <td>NaN</td>\n",
  1520. " <td>NaN</td>\n",
  1521. " </tr>\n",
  1522. " <tr>\n",
  1523. " <th>328870</th>\n",
  1524. " <td>505010</td>\n",
  1525. " <td>2011242127</td>\n",
  1526. " <td>PAT</td>\n",
  1527. " <td>Hughes, Luke</td>\n",
  1528. " <td>HUGHES, LUKE</td>\n",
  1529. " <td>IE</td>\n",
  1530. " <td>NaN</td>\n",
  1531. " <td>NaN</td>\n",
  1532. " <td>615868.0</td>\n",
  1533. " <td>NaN</td>\n",
  1534. " <td>...</td>\n",
  1535. " <td>NaN</td>\n",
  1536. " <td>NaN</td>\n",
  1537. " <td>NaN</td>\n",
  1538. " <td>NaN</td>\n",
  1539. " <td>365754711.0</td>\n",
  1540. " <td>International</td>\n",
  1541. " <td>NaN</td>\n",
  1542. " <td>NaN</td>\n",
  1543. " <td>NaN</td>\n",
  1544. " <td>NaN</td>\n",
  1545. " </tr>\n",
  1546. " <tr>\n",
  1547. " <th>328871</th>\n",
  1548. " <td>505008</td>\n",
  1549. " <td>2011242127</td>\n",
  1550. " <td>PAT</td>\n",
  1551. " <td>Donovan, Danny</td>\n",
  1552. " <td>DONOVAN, DANNY</td>\n",
  1553. " <td>IE</td>\n",
  1554. " <td>NaN</td>\n",
  1555. " <td>NaN</td>\n",
  1556. " <td>823533.0</td>\n",
  1557. " <td>NaN</td>\n",
  1558. " <td>...</td>\n",
  1559. " <td>NaN</td>\n",
  1560. " <td>NaN</td>\n",
  1561. " <td>NaN</td>\n",
  1562. " <td>NaN</td>\n",
  1563. " <td>365754711.0</td>\n",
  1564. " <td>International</td>\n",
  1565. " <td>NaN</td>\n",
  1566. " <td>NaN</td>\n",
  1567. " <td>NaN</td>\n",
  1568. " <td>NaN</td>\n",
  1569. " </tr>\n",
  1570. " <tr>\n",
  1571. " <th>328872</th>\n",
  1572. " <td>505017</td>\n",
  1573. " <td>2011242127</td>\n",
  1574. " <td>PAT</td>\n",
  1575. " <td>McMullin, Gwenda</td>\n",
  1576. " <td>MCMULLIN, GWENDA</td>\n",
  1577. " <td>IE</td>\n",
  1578. " <td>NaN</td>\n",
  1579. " <td>NaN</td>\n",
  1580. " <td>775148.0</td>\n",
  1581. " <td>NaN</td>\n",
  1582. " <td>...</td>\n",
  1583. " <td>NaN</td>\n",
  1584. " <td>NaN</td>\n",
  1585. " <td>NaN</td>\n",
  1586. " <td>NaN</td>\n",
  1587. " <td>365754711.0</td>\n",
  1588. " <td>International</td>\n",
  1589. " <td>NaN</td>\n",
  1590. " <td>NaN</td>\n",
  1591. " <td>NaN</td>\n",
  1592. " <td>NaN</td>\n",
  1593. " </tr>\n",
  1594. " <tr>\n",
  1595. " <th>328873</th>\n",
  1596. " <td>505016</td>\n",
  1597. " <td>2011242127</td>\n",
  1598. " <td>PAT</td>\n",
  1599. " <td>Leung, Mark</td>\n",
  1600. " <td>LEUNG, MARK</td>\n",
  1601. " <td>CA</td>\n",
  1602. " <td>NaN</td>\n",
  1603. " <td>NaN</td>\n",
  1604. " <td>828513.0</td>\n",
  1605. " <td>NaN</td>\n",
  1606. " <td>...</td>\n",
  1607. " <td>NaN</td>\n",
  1608. " <td>NaN</td>\n",
  1609. " <td>NaN</td>\n",
  1610. " <td>NaN</td>\n",
  1611. " <td>365754711.0</td>\n",
  1612. " <td>International</td>\n",
  1613. " <td>NaN</td>\n",
  1614. " <td>NaN</td>\n",
  1615. " <td>NaN</td>\n",
  1616. " <td>NaN</td>\n",
  1617. " </tr>\n",
  1618. " <tr>\n",
  1619. " <th>328874</th>\n",
  1620. " <td>505023</td>\n",
  1621. " <td>2011242127</td>\n",
  1622. " <td>PAT</td>\n",
  1623. " <td>Sweeney, Fiachra</td>\n",
  1624. " <td>SWEENEY, FIACHRA</td>\n",
  1625. " <td>IE</td>\n",
  1626. " <td>NaN</td>\n",
  1627. " <td>NaN</td>\n",
  1628. " <td>764070.0</td>\n",
  1629. " <td>NaN</td>\n",
  1630. " <td>...</td>\n",
  1631. " <td>NaN</td>\n",
  1632. " <td>NaN</td>\n",
  1633. " <td>NaN</td>\n",
  1634. " <td>NaN</td>\n",
  1635. " <td>365754711.0</td>\n",
  1636. " <td>International</td>\n",
  1637. " <td>NaN</td>\n",
  1638. " <td>NaN</td>\n",
  1639. " <td>NaN</td>\n",
  1640. " <td>NaN</td>\n",
  1641. " </tr>\n",
  1642. " <tr>\n",
  1643. " <th>328875</th>\n",
  1644. " <td>505012</td>\n",
  1645. " <td>2011242127</td>\n",
  1646. " <td>PAT</td>\n",
  1647. " <td>Kelly, Gary</td>\n",
  1648. " <td>KELLY, GARY</td>\n",
  1649. " <td>IE</td>\n",
  1650. " <td>NaN</td>\n",
  1651. " <td>NaN</td>\n",
  1652. " <td>594158.0</td>\n",
  1653. " <td>NaN</td>\n",
  1654. " <td>...</td>\n",
  1655. " <td>NaN</td>\n",
  1656. " <td>NaN</td>\n",
  1657. " <td>NaN</td>\n",
  1658. " <td>NaN</td>\n",
  1659. " <td>365754711.0</td>\n",
  1660. " <td>International</td>\n",
  1661. " <td>NaN</td>\n",
  1662. " <td>NaN</td>\n",
  1663. " <td>NaN</td>\n",
  1664. " <td>NaN</td>\n",
  1665. " </tr>\n",
  1666. " <tr>\n",
  1667. " <th>328876</th>\n",
  1668. " <td>505014</td>\n",
  1669. " <td>2011242127</td>\n",
  1670. " <td>PAT</td>\n",
  1671. " <td>Kelly, John</td>\n",
  1672. " <td>KELLY, JOHN</td>\n",
  1673. " <td>IE</td>\n",
  1674. " <td>NaN</td>\n",
  1675. " <td>NaN</td>\n",
  1676. " <td>717848.0</td>\n",
  1677. " <td>NaN</td>\n",
  1678. " <td>...</td>\n",
  1679. " <td>NaN</td>\n",
  1680. " <td>NaN</td>\n",
  1681. " <td>NaN</td>\n",
  1682. " <td>NaN</td>\n",
  1683. " <td>365754711.0</td>\n",
  1684. " <td>International</td>\n",
  1685. " <td>NaN</td>\n",
  1686. " <td>NaN</td>\n",
  1687. " <td>NaN</td>\n",
  1688. " <td>NaN</td>\n",
  1689. " </tr>\n",
  1690. " <tr>\n",
  1691. " <th>328877</th>\n",
  1692. " <td>505005</td>\n",
  1693. " <td>2011242127</td>\n",
  1694. " <td>PAT</td>\n",
  1695. " <td>Buckley, Naomi</td>\n",
  1696. " <td>BUCKLEY, NAOMI</td>\n",
  1697. " <td>IE</td>\n",
  1698. " <td>NaN</td>\n",
  1699. " <td>NaN</td>\n",
  1700. " <td>647038.0</td>\n",
  1701. " <td>NaN</td>\n",
  1702. " <td>...</td>\n",
  1703. " <td>NaN</td>\n",
  1704. " <td>NaN</td>\n",
  1705. " <td>NaN</td>\n",
  1706. " <td>NaN</td>\n",
  1707. " <td>365754711.0</td>\n",
  1708. " <td>International</td>\n",
  1709. " <td>NaN</td>\n",
  1710. " <td>NaN</td>\n",
  1711. " <td>NaN</td>\n",
  1712. " <td>NaN</td>\n",
  1713. " </tr>\n",
  1714. " <tr>\n",
  1715. " <th>328878</th>\n",
  1716. " <td>505027</td>\n",
  1717. " <td>2011242127</td>\n",
  1718. " <td>PAT</td>\n",
  1719. " <td>Zarins, Denise</td>\n",
  1720. " <td>ZARINS, DENISE</td>\n",
  1721. " <td>US</td>\n",
  1722. " <td>NaN</td>\n",
  1723. " <td>NaN</td>\n",
  1724. " <td>544663.0</td>\n",
  1725. " <td>NaN</td>\n",
  1726. " <td>...</td>\n",
  1727. " <td>NaN</td>\n",
  1728. " <td>NaN</td>\n",
  1729. " <td>NaN</td>\n",
  1730. " <td>NaN</td>\n",
  1731. " <td>365754711.0</td>\n",
  1732. " <td>International</td>\n",
  1733. " <td>NaN</td>\n",
  1734. " <td>NaN</td>\n",
  1735. " <td>NaN</td>\n",
  1736. " <td>NaN</td>\n",
  1737. " </tr>\n",
  1738. " <tr>\n",
  1739. " <th>328879</th>\n",
  1740. " <td>505007</td>\n",
  1741. " <td>2011242127</td>\n",
  1742. " <td>PAT</td>\n",
  1743. " <td>Cummins, Michael</td>\n",
  1744. " <td>CUMMINS, MICHAEL</td>\n",
  1745. " <td>IE</td>\n",
  1746. " <td>NaN</td>\n",
  1747. " <td>NaN</td>\n",
  1748. " <td>627347.0</td>\n",
  1749. " <td>NaN</td>\n",
  1750. " <td>...</td>\n",
  1751. " <td>NaN</td>\n",
  1752. " <td>NaN</td>\n",
  1753. " <td>NaN</td>\n",
  1754. " <td>NaN</td>\n",
  1755. " <td>365754711.0</td>\n",
  1756. " <td>International</td>\n",
  1757. " <td>NaN</td>\n",
  1758. " <td>NaN</td>\n",
  1759. " <td>NaN</td>\n",
  1760. " <td>NaN</td>\n",
  1761. " </tr>\n",
  1762. " <tr>\n",
  1763. " <th>328880</th>\n",
  1764. " <td>505024</td>\n",
  1765. " <td>2011242127</td>\n",
  1766. " <td>PAT</td>\n",
  1767. " <td>Tilotta, Vincenzo</td>\n",
  1768. " <td>TILOTTA, VINCENZO</td>\n",
  1769. " <td>IE</td>\n",
  1770. " <td>NaN</td>\n",
  1771. " <td>NaN</td>\n",
  1772. " <td>731287.0</td>\n",
  1773. " <td>NaN</td>\n",
  1774. " <td>...</td>\n",
  1775. " <td>NaN</td>\n",
  1776. " <td>NaN</td>\n",
  1777. " <td>NaN</td>\n",
  1778. " <td>NaN</td>\n",
  1779. " <td>365754711.0</td>\n",
  1780. " <td>International</td>\n",
  1781. " <td>NaN</td>\n",
  1782. " <td>NaN</td>\n",
  1783. " <td>NaN</td>\n",
  1784. " <td>NaN</td>\n",
  1785. " </tr>\n",
  1786. " <tr>\n",
  1787. " <th>328881</th>\n",
  1788. " <td>505006</td>\n",
  1789. " <td>2011242127</td>\n",
  1790. " <td>PAT</td>\n",
  1791. " <td>Clark, Benjamin</td>\n",
  1792. " <td>CLARK, BENJAMIN</td>\n",
  1793. " <td>US</td>\n",
  1794. " <td>NaN</td>\n",
  1795. " <td>NaN</td>\n",
  1796. " <td>839498.0</td>\n",
  1797. " <td>NaN</td>\n",
  1798. " <td>...</td>\n",
  1799. " <td>NaN</td>\n",
  1800. " <td>NaN</td>\n",
  1801. " <td>NaN</td>\n",
  1802. " <td>NaN</td>\n",
  1803. " <td>365754711.0</td>\n",
  1804. " <td>International</td>\n",
  1805. " <td>NaN</td>\n",
  1806. " <td>NaN</td>\n",
  1807. " <td>NaN</td>\n",
  1808. " <td>NaN</td>\n",
  1809. " </tr>\n",
  1810. " <tr>\n",
  1811. " <th>328882</th>\n",
  1812. " <td>505009</td>\n",
  1813. " <td>2011242127</td>\n",
  1814. " <td>PAT</td>\n",
  1815. " <td>Gelfand, Mark</td>\n",
  1816. " <td>GELFAND, MARK</td>\n",
  1817. " <td>US</td>\n",
  1818. " <td>NaN</td>\n",
  1819. " <td>NaN</td>\n",
  1820. " <td>757227.0</td>\n",
  1821. " <td>NaN</td>\n",
  1822. " <td>...</td>\n",
  1823. " <td>NaN</td>\n",
  1824. " <td>NaN</td>\n",
  1825. " <td>NaN</td>\n",
  1826. " <td>NaN</td>\n",
  1827. " <td>365754711.0</td>\n",
  1828. " <td>International</td>\n",
  1829. " <td>NaN</td>\n",
  1830. " <td>NaN</td>\n",
  1831. " <td>NaN</td>\n",
  1832. " <td>NaN</td>\n",
  1833. " </tr>\n",
  1834. " <tr>\n",
  1835. " <th>328883</th>\n",
  1836. " <td>505015</td>\n",
  1837. " <td>2011242127</td>\n",
  1838. " <td>PAT</td>\n",
  1839. " <td>Ledwith, Seamus</td>\n",
  1840. " <td>LEDWITH, SEAMUS</td>\n",
  1841. " <td>IE</td>\n",
  1842. " <td>NaN</td>\n",
  1843. " <td>NaN</td>\n",
  1844. " <td>582278.0</td>\n",
  1845. " <td>NaN</td>\n",
  1846. " <td>...</td>\n",
  1847. " <td>NaN</td>\n",
  1848. " <td>NaN</td>\n",
  1849. " <td>NaN</td>\n",
  1850. " <td>NaN</td>\n",
  1851. " <td>365754711.0</td>\n",
  1852. " <td>International</td>\n",
  1853. " <td>NaN</td>\n",
  1854. " <td>NaN</td>\n",
  1855. " <td>NaN</td>\n",
  1856. " <td>NaN</td>\n",
  1857. " </tr>\n",
  1858. " <tr>\n",
  1859. " <th>328884</th>\n",
  1860. " <td>505021</td>\n",
  1861. " <td>2011242127</td>\n",
  1862. " <td>PAT</td>\n",
  1863. " <td>Piccagli, Francesco</td>\n",
  1864. " <td>PICCAGLI, FRANCESCO</td>\n",
  1865. " <td>IT</td>\n",
  1866. " <td>NaN</td>\n",
  1867. " <td>NaN</td>\n",
  1868. " <td>573395.0</td>\n",
  1869. " <td>NaN</td>\n",
  1870. " <td>...</td>\n",
  1871. " <td>NaN</td>\n",
  1872. " <td>NaN</td>\n",
  1873. " <td>NaN</td>\n",
  1874. " <td>NaN</td>\n",
  1875. " <td>365754711.0</td>\n",
  1876. " <td>International</td>\n",
  1877. " <td>NaN</td>\n",
  1878. " <td>NaN</td>\n",
  1879. " <td>NaN</td>\n",
  1880. " <td>NaN</td>\n",
  1881. " </tr>\n",
  1882. " <tr>\n",
  1883. " <th>328885</th>\n",
  1884. " <td>505025</td>\n",
  1885. " <td>2011242127</td>\n",
  1886. " <td>PAT</td>\n",
  1887. " <td>Turovskiy, Roman</td>\n",
  1888. " <td>TUROVSKIY, ROMAN</td>\n",
  1889. " <td>US</td>\n",
  1890. " <td>NaN</td>\n",
  1891. " <td>NaN</td>\n",
  1892. " <td>815052.0</td>\n",
  1893. " <td>NaN</td>\n",
  1894. " <td>...</td>\n",
  1895. " <td>NaN</td>\n",
  1896. " <td>NaN</td>\n",
  1897. " <td>NaN</td>\n",
  1898. " <td>NaN</td>\n",
  1899. " <td>365754711.0</td>\n",
  1900. " <td>International</td>\n",
  1901. " <td>NaN</td>\n",
  1902. " <td>NaN</td>\n",
  1903. " <td>NaN</td>\n",
  1904. " <td>NaN</td>\n",
  1905. " </tr>\n",
  1906. " <tr>\n",
  1907. " <th>328886</th>\n",
  1908. " <td>505019</td>\n",
  1909. " <td>2011242127</td>\n",
  1910. " <td>PAT</td>\n",
  1911. " <td>Naga, Karun D</td>\n",
  1912. " <td>NAGA, KARUN D</td>\n",
  1913. " <td>US</td>\n",
  1914. " <td>NaN</td>\n",
  1915. " <td>NaN</td>\n",
  1916. " <td>569675.0</td>\n",
  1917. " <td>NaN</td>\n",
  1918. " <td>...</td>\n",
  1919. " <td>NaN</td>\n",
  1920. " <td>NaN</td>\n",
  1921. " <td>NaN</td>\n",
  1922. " <td>NaN</td>\n",
  1923. " <td>365754711.0</td>\n",
  1924. " <td>International</td>\n",
  1925. " <td>NaN</td>\n",
  1926. " <td>NaN</td>\n",
  1927. " <td>NaN</td>\n",
  1928. " <td>NaN</td>\n",
  1929. " </tr>\n",
  1930. " <tr>\n",
  1931. " <th>328887</th>\n",
  1932. " <td>505013</td>\n",
  1933. " <td>2011242127</td>\n",
  1934. " <td>PAT</td>\n",
  1935. " <td>Kelly, Grace</td>\n",
  1936. " <td>KELLY, GRACE</td>\n",
  1937. " <td>IE</td>\n",
  1938. " <td>NaN</td>\n",
  1939. " <td>NaN</td>\n",
  1940. " <td>884565.0</td>\n",
  1941. " <td>NaN</td>\n",
  1942. " <td>...</td>\n",
  1943. " <td>NaN</td>\n",
  1944. " <td>NaN</td>\n",
  1945. " <td>NaN</td>\n",
  1946. " <td>NaN</td>\n",
  1947. " <td>365754711.0</td>\n",
  1948. " <td>International</td>\n",
  1949. " <td>NaN</td>\n",
  1950. " <td>NaN</td>\n",
  1951. " <td>NaN</td>\n",
  1952. " <td>NaN</td>\n",
  1953. " </tr>\n",
  1954. " <tr>\n",
  1955. " <th>328888</th>\n",
  1956. " <td>505026</td>\n",
  1957. " <td>2011242127</td>\n",
  1958. " <td>PAT</td>\n",
  1959. " <td>Woolley, Lana</td>\n",
  1960. " <td>WOOLLEY, LANA</td>\n",
  1961. " <td>IE</td>\n",
  1962. " <td>NaN</td>\n",
  1963. " <td>NaN</td>\n",
  1964. " <td>547325.0</td>\n",
  1965. " <td>NaN</td>\n",
  1966. " <td>...</td>\n",
  1967. " <td>NaN</td>\n",
  1968. " <td>NaN</td>\n",
  1969. " <td>NaN</td>\n",
  1970. " <td>NaN</td>\n",
  1971. " <td>365754711.0</td>\n",
  1972. " <td>International</td>\n",
  1973. " <td>NaN</td>\n",
  1974. " <td>NaN</td>\n",
  1975. " <td>NaN</td>\n",
  1976. " <td>NaN</td>\n",
  1977. " </tr>\n",
  1978. " <tr>\n",
  1979. " <th>328889</th>\n",
  1980. " <td>505020</td>\n",
  1981. " <td>2011242127</td>\n",
  1982. " <td>PAT</td>\n",
  1983. " <td>Nash, Stephen</td>\n",
  1984. " <td>NASH, STEPHEN</td>\n",
  1985. " <td>IE</td>\n",
  1986. " <td>NaN</td>\n",
  1987. " <td>NaN</td>\n",
  1988. " <td>710620.0</td>\n",
  1989. " <td>NaN</td>\n",
  1990. " <td>...</td>\n",
  1991. " <td>NaN</td>\n",
  1992. " <td>NaN</td>\n",
  1993. " <td>NaN</td>\n",
  1994. " <td>NaN</td>\n",
  1995. " <td>365754711.0</td>\n",
  1996. " <td>International</td>\n",
  1997. " <td>NaN</td>\n",
  1998. " <td>NaN</td>\n",
  1999. " <td>NaN</td>\n",
  2000. " <td>NaN</td>\n",
  2001. " </tr>\n",
  2002. " <tr>\n",
  2003. " <th>328890</th>\n",
  2004. " <td>505011</td>\n",
  2005. " <td>2011242127</td>\n",
  2006. " <td>PAT</td>\n",
  2007. " <td>Kelly, Brian</td>\n",
  2008. " <td>KELLY, BRIAN</td>\n",
  2009. " <td>IE</td>\n",
  2010. " <td>NaN</td>\n",
  2011. " <td>NaN</td>\n",
  2012. " <td>830479.0</td>\n",
  2013. " <td>NaN</td>\n",
  2014. " <td>...</td>\n",
  2015. " <td>NaN</td>\n",
  2016. " <td>NaN</td>\n",
  2017. " <td>NaN</td>\n",
  2018. " <td>NaN</td>\n",
  2019. " <td>365754711.0</td>\n",
  2020. " <td>International</td>\n",
  2021. " <td>NaN</td>\n",
  2022. " <td>NaN</td>\n",
  2023. " <td>NaN</td>\n",
  2024. " <td>NaN</td>\n",
  2025. " </tr>\n",
  2026. " </tbody>\n",
  2027. "</table>\n",
  2028. "<p>23 rows × 25 columns</p>\n",
  2029. "</div>"
  2030. ],
  2031. "text/plain": [
  2032. " ipa_applt_id australian_appl_no appln_type name \\\n",
  2033. "328868 505022 2011242127 PAT Ryba, Eric \n",
  2034. "328869 505018 2011242127 PAT Mullins, Barry \n",
  2035. "328870 505010 2011242127 PAT Hughes, Luke \n",
  2036. "328871 505008 2011242127 PAT Donovan, Danny \n",
  2037. "328872 505017 2011242127 PAT McMullin, Gwenda \n",
  2038. "328873 505016 2011242127 PAT Leung, Mark \n",
  2039. "328874 505023 2011242127 PAT Sweeney, Fiachra \n",
  2040. "328875 505012 2011242127 PAT Kelly, Gary \n",
  2041. "328876 505014 2011242127 PAT Kelly, John \n",
  2042. "328877 505005 2011242127 PAT Buckley, Naomi \n",
  2043. "328878 505027 2011242127 PAT Zarins, Denise \n",
  2044. "328879 505007 2011242127 PAT Cummins, Michael \n",
  2045. "328880 505024 2011242127 PAT Tilotta, Vincenzo \n",
  2046. "328881 505006 2011242127 PAT Clark, Benjamin \n",
  2047. "328882 505009 2011242127 PAT Gelfand, Mark \n",
  2048. "328883 505015 2011242127 PAT Ledwith, Seamus \n",
  2049. "328884 505021 2011242127 PAT Piccagli, Francesco \n",
  2050. "328885 505025 2011242127 PAT Turovskiy, Roman \n",
  2051. "328886 505019 2011242127 PAT Naga, Karun D \n",
  2052. "328887 505013 2011242127 PAT Kelly, Grace \n",
  2053. "328888 505026 2011242127 PAT Woolley, Lana \n",
  2054. "328889 505020 2011242127 PAT Nash, Stephen \n",
  2055. "328890 505011 2011242127 PAT Kelly, Brian \n",
  2056. "\n",
  2057. " cleanname country australian entity ipa_id abn \\\n",
  2058. "328868 RYBA, ERIC US NaN NaN 523838.0 NaN \n",
  2059. "328869 MULLINS, BARRY IE NaN NaN 764553.0 NaN \n",
  2060. "328870 HUGHES, LUKE IE NaN NaN 615868.0 NaN \n",
  2061. "328871 DONOVAN, DANNY IE NaN NaN 823533.0 NaN \n",
  2062. "328872 MCMULLIN, GWENDA IE NaN NaN 775148.0 NaN \n",
  2063. "328873 LEUNG, MARK CA NaN NaN 828513.0 NaN \n",
  2064. "328874 SWEENEY, FIACHRA IE NaN NaN 764070.0 NaN \n",
  2065. "328875 KELLY, GARY IE NaN NaN 594158.0 NaN \n",
  2066. "328876 KELLY, JOHN IE NaN NaN 717848.0 NaN \n",
  2067. "328877 BUCKLEY, NAOMI IE NaN NaN 647038.0 NaN \n",
  2068. "328878 ZARINS, DENISE US NaN NaN 544663.0 NaN \n",
  2069. "328879 CUMMINS, MICHAEL IE NaN NaN 627347.0 NaN \n",
  2070. "328880 TILOTTA, VINCENZO IE NaN NaN 731287.0 NaN \n",
  2071. "328881 CLARK, BENJAMIN US NaN NaN 839498.0 NaN \n",
  2072. "328882 GELFAND, MARK US NaN NaN 757227.0 NaN \n",
  2073. "328883 LEDWITH, SEAMUS IE NaN NaN 582278.0 NaN \n",
  2074. "328884 PICCAGLI, FRANCESCO IT NaN NaN 573395.0 NaN \n",
  2075. "328885 TUROVSKIY, ROMAN US NaN NaN 815052.0 NaN \n",
  2076. "328886 NAGA, KARUN D US NaN NaN 569675.0 NaN \n",
  2077. "328887 KELLY, GRACE IE NaN NaN 884565.0 NaN \n",
  2078. "328888 WOOLLEY, LANA IE NaN NaN 547325.0 NaN \n",
  2079. "328889 NASH, STEPHEN IE NaN NaN 710620.0 NaN \n",
  2080. "328890 KELLY, BRIAN IE NaN NaN 830479.0 NaN \n",
  2081. "\n",
  2082. " ... lon qg state postcode patstat_appln_id \\\n",
  2083. "328868 ... NaN NaN NaN NaN 365754711.0 \n",
  2084. "328869 ... NaN NaN NaN NaN 365754711.0 \n",
  2085. "328870 ... NaN NaN NaN NaN 365754711.0 \n",
  2086. "328871 ... NaN NaN NaN NaN 365754711.0 \n",
  2087. "328872 ... NaN NaN NaN NaN 365754711.0 \n",
  2088. "328873 ... NaN NaN NaN NaN 365754711.0 \n",
  2089. "328874 ... NaN NaN NaN NaN 365754711.0 \n",
  2090. "328875 ... NaN NaN NaN NaN 365754711.0 \n",
  2091. "328876 ... NaN NaN NaN NaN 365754711.0 \n",
  2092. "328877 ... NaN NaN NaN NaN 365754711.0 \n",
  2093. "328878 ... NaN NaN NaN NaN 365754711.0 \n",
  2094. "328879 ... NaN NaN NaN NaN 365754711.0 \n",
  2095. "328880 ... NaN NaN NaN NaN 365754711.0 \n",
  2096. "328881 ... NaN NaN NaN NaN 365754711.0 \n",
  2097. "328882 ... NaN NaN NaN NaN 365754711.0 \n",
  2098. "328883 ... NaN NaN NaN NaN 365754711.0 \n",
  2099. "328884 ... NaN NaN NaN NaN 365754711.0 \n",
  2100. "328885 ... NaN NaN NaN NaN 365754711.0 \n",
  2101. "328886 ... NaN NaN NaN NaN 365754711.0 \n",
  2102. "328887 ... NaN NaN NaN NaN 365754711.0 \n",
  2103. "328888 ... NaN NaN NaN NaN 365754711.0 \n",
  2104. "328889 ... NaN NaN NaN NaN 365754711.0 \n",
  2105. "328890 ... NaN NaN NaN NaN 365754711.0 \n",
  2106. "\n",
  2107. " applicant_type sa2_main11 sa2_name11 gcc_code11 gcc_name11 \n",
  2108. "328868 International NaN NaN NaN NaN \n",
  2109. "328869 International NaN NaN NaN NaN \n",
  2110. "328870 International NaN NaN NaN NaN \n",
  2111. "328871 International NaN NaN NaN NaN \n",
  2112. "328872 International NaN NaN NaN NaN \n",
  2113. "328873 International NaN NaN NaN NaN \n",
  2114. "328874 International NaN NaN NaN NaN \n",
  2115. "328875 International NaN NaN NaN NaN \n",
  2116. "328876 International NaN NaN NaN NaN \n",
  2117. "328877 International NaN NaN NaN NaN \n",
  2118. "328878 International NaN NaN NaN NaN \n",
  2119. "328879 International NaN NaN NaN NaN \n",
  2120. "328880 International NaN NaN NaN NaN \n",
  2121. "328881 International NaN NaN NaN NaN \n",
  2122. "328882 International NaN NaN NaN NaN \n",
  2123. "328883 International NaN NaN NaN NaN \n",
  2124. "328884 International NaN NaN NaN NaN \n",
  2125. "328885 International NaN NaN NaN NaN \n",
  2126. "328886 International NaN NaN NaN NaN \n",
  2127. "328887 International NaN NaN NaN NaN \n",
  2128. "328888 International NaN NaN NaN NaN \n",
  2129. "328889 International NaN NaN NaN NaN \n",
  2130. "328890 International NaN NaN NaN NaN \n",
  2131. "\n",
  2132. "[23 rows x 25 columns]"
  2133. ]
  2134. },
  2135. "execution_count": 38,
  2136. "metadata": {},
  2137. "output_type": "execute_result"
  2138. }
  2139. ],
  2140. "source": [
  2141. "data2[data2[\"australian_appl_no\"] == '2011242127']"
  2142. ]
  2143. },
  2144. {
  2145. "cell_type": "code",
  2146. "execution_count": null,
  2147. "metadata": {},
  2148. "outputs": [],
  2149. "source": [
  2150. "data1[\"aust\"]"
  2151. ]
  2152. }
  2153. ],
  2154. "metadata": {
  2155. "kernelspec": {
  2156. "display_name": "Python 3",
  2157. "language": "python",
  2158. "name": "python3"
  2159. },
  2160. "language_info": {
  2161. "codemirror_mode": {
  2162. "name": "ipython",
  2163. "version": 3
  2164. },
  2165. "file_extension": ".py",
  2166. "mimetype": "text/x-python",
  2167. "name": "python",
  2168. "nbconvert_exporter": "python",
  2169. "pygments_lexer": "ipython3",
  2170. "version": "3.6.5"
  2171. }
  2172. },
  2173. "nbformat": 4,
  2174. "nbformat_minor": 1
  2175. }
Add Comment
Please, Sign In to add comment