Advertisement
Guest User

Untitled

a guest
Feb 23rd, 2019
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 21.44 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 3,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "#!/usr/bin/env python\n",
  10. "# -*- coding: utf-8 -*-\n",
  11. "import pandas as pd\n",
  12. "import os\n",
  13. "import codecs\n",
  14. "from sets import Set\n",
  15. "import csv"
  16. ]
  17. },
  18. {
  19. "cell_type": "code",
  20. "execution_count": null,
  21. "metadata": {},
  22. "outputs": [],
  23. "source": [
  24. "# ------ Check whether the combination of fields passed in the list is duplicated: the result is False for a row, or True when it repeats an earlier row --------"
  25. ]
  26. },
  27. {
  28. "cell_type": "code",
  29. "execution_count": null,
  30. "metadata": {},
  31. "outputs": [],
  32. "source": [
  33. "df_cad.duplicated(['NM_ENTIDADE_ENSINO_Capes','SG_ENTIDADE_ENSINO_Capes' ])"
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": null,
  39. "metadata": {},
  40. "outputs": [],
  41. "source": [
  42. "# ---- Checking the difference in counts after the merge, and whether any fields are duplicated, using Set --------"
  43. ]
  44. },
  45. {
  46. "cell_type": "code",
  47. "execution_count": 768,
  48. "metadata": {},
  49. "outputs": [],
  50. "source": [
  51. "s_df_cad = Set(df_cad['SG_ENTIDADE_ENSINO_Capes'])\n",
  52. "s_df_prog = Set(df_prog['SG_ENTIDADE_ENSINO_Capes'])"
  53. ]
  54. },
  55. {
  56. "cell_type": "code",
  57. "execution_count": 769,
  58. "metadata": {},
  59. "outputs": [],
  60. "source": [
  61. "#diferenca = s_df_cad - s_df_merged"
  62. ]
  63. },
  64. {
  65. "cell_type": "code",
  66. "execution_count": 770,
  67. "metadata": {},
  68. "outputs": [],
  69. "source": [
  70. "diferenca = s_df_cad.symmetric_difference(s_df_prog)"
  71. ]
  72. },
  73. {
  74. "cell_type": "code",
  75. "execution_count": 771,
  76. "metadata": {},
  77. "outputs": [
  78. {
  79. "data": {
  80. "text/plain": [
  81. "Set([u'IFCE - SOBRAL', u'UNIFATEA', u'UNESP-ASSIS', u'CESAR-AM', u'IFMG', u'UNESP-SJC', u'IFNMG', u'UNESP-RC', u'EGN', u'UNICARIOCA', u'GHC', u'IFPI-FLORIANO', u'FEBASP', u'UNESP/SV', u'FIPECAFI', u'UNIFACCAMP', u'IESB', u'UNESP-BOT', u'FICSAE', u'IBDT', u'UFSC - BLUMENAU', u'UNEAL', u'IETEC', u'UNIFEMM', u'FIOCRUZ-EGS BRAS\\xcdLIA', u'CENARGEN', u'UFJF-GV', u'USP/EACH', u'UFPB-JP', u'EGS/FIOCRUZ BRAS\\xcdLIA', u'UEMS/DOURADOS', u'UFESBA', u'UNESP-MAR', u'UNESP-SJRP', u'FIAMFAAM', u'FCMMG', u'IPEA', u'FACENE', u'FADIP', u'FADIC', u'UFCA', u'UNICHRISTUS', u'FDC', u'EMBRAPA', u'UNESP-IFT', u'IFS', u'UNESP-ARA\\xc7', u'ENAP', u'FABAPAR', u'IFC', u'IFRS', u'IFRR', u'EMBRAPA-CPAFAP', u'UNILAB - REDEN\\xc7\\xc3O', u'UNESP/TUP\\xc3', u'UNESP-SOR', u'UNIMES', u'UFRGS-LITORAL', u'UNI7', u'UNESP-SV', u'HCPA', u'FIOCRUZ-CPQRR', u'UPE', u'UNIPLI', u'UNIFESP - DIADEMA', u'UNESP-BAURU', u'UNIP\\xca', u'UNIT/ALAGOAS', u'FFIA', u'MPEG', u'UTFPR-MD', u'IVB', u'UNIALFA', u'UNIVBRASIL', u'FUAM', u'UNINTER', u'IFPR', u'CPAFRO / RO', u'IFAL', u'IFAM', u'ITAL', u'UNILA', u'HUJM', u'UNIHORIZONTES', u'UNILAB', u'EMBRAPA/CPATU', u'FG', u'UEAP', u'IFFARROUP', u'FEPECS-ESCS', u'UNIVERITAS UNG', u'MAST', u'UFSC - ARARANGUA', u'UFRRJ/NI', u'EMBRAPA-CPAFAC', u'UNESP-REITORIA', u'SBBQ', u'FMP', u'EMBRAPA-CNPA', u'FUCAPE-RJ', u'FIOCRUZ-CPQGM', u'LACEN/RO', u'UNESP-TUP\\xc3', u'FTC-SSA', u'UFOB', u'UNIARP', u'FACVEST', u'SBF', u'ITV MI', u'EACH', u'FORTEC', u'UNINTA', u'UFPI', u'UNESP-ARAR', u'EDUCATIE', u'IEAPM', u'UNESP-PP', u'CPQLMD/FIOCRUZ', u'UNESP-GUAR', u'USU', u'UNESP-IS', u'IFBAIANO', u'FIOCRUZ/RO', u'UNESP-FR', u'FIOCRUZ-NESC/CPQAM', u'FACEPD', u'UNIPAMPA - CA SUL', u'UNCISAL', u'ANDIFES', u'UFPB-RT', u'SBIBAE', u'FUCAPE-MA', u'INTA', u'FIOCRUZ-CPQLMD', u'IFBA', u'IFSULDEMINAS', u'UERGS', u'FPT', u'FESP', u'UNEMAT-SNP', u'IDOR', u'FCRB', u'FEPAGRO', u'FGV - DIREITO SP', u'UNESP-JAB', u'UDF', u'FIOCRUZ-RO', u'FASATC'])"
  82. ]
  83. },
  84. "execution_count": 771,
  85. "metadata": {},
  86. "output_type": "execute_result"
  87. }
  88. ],
  89. "source": [
  90. "diferenca"
  91. ]
  92. },
  93. {
  94. "cell_type": "code",
  95. "execution_count": 772,
  96. "metadata": {},
  97. "outputs": [
  98. {
  99. "data": {
  100. "text/plain": [
  101. "148"
  102. ]
  103. },
  104. "execution_count": 772,
  105. "metadata": {},
  106. "output_type": "execute_result"
  107. }
  108. ],
  109. "source": [
  110. "len(diferenca)"
  111. ]
  112. },
  113. {
  114. "cell_type": "code",
  115. "execution_count": 773,
  116. "metadata": {},
  117. "outputs": [],
  118. "source": [
  119. "diferenca_A = s_df_cad.difference(s_df_prog)"
  120. ]
  121. },
  122. {
  123. "cell_type": "code",
  124. "execution_count": 774,
  125. "metadata": {},
  126. "outputs": [
  127. {
  128. "data": {
  129. "text/plain": [
  130. "Set([u'IFCE - SOBRAL', u'UNIFATEA', u'UNESP-ASSIS', u'CESAR-AM', u'IFMG', u'UNESP-SJC', u'IFNMG', u'UNESP-RC', u'EGN', u'UNICARIOCA', u'GHC', u'IFPI-FLORIANO', u'FEBASP', u'UNESP/SV', u'FIPECAFI', u'UNIFACCAMP', u'IESB', u'UNESP-BOT', u'FICSAE', u'IBDT', u'UFSC - BLUMENAU', u'UNEAL', u'IETEC', u'UNIFEMM', u'FIOCRUZ-EGS BRAS\\xcdLIA', u'CENARGEN', u'UFJF-GV', u'USP/EACH', u'UFPB-JP', u'EGS/FIOCRUZ BRAS\\xcdLIA', u'UEMS/DOURADOS', u'UFESBA', u'UNESP-MAR', u'UNESP-SJRP', u'FIAMFAAM', u'FCMMG', u'IPEA', u'FACENE', u'FADIP', u'FADIC', u'UFCA', u'UNICHRISTUS', u'FDC', u'EMBRAPA', u'UNESP-IFT', u'IFS', u'UNESP-ARA\\xc7', u'ENAP', u'FABAPAR', u'IFC', u'IFRS', u'IFRR', u'EMBRAPA-CPAFAP', u'UNILAB - REDEN\\xc7\\xc3O', u'UNESP/TUP\\xc3', u'UNESP-SOR', u'UNIMES', u'UFRGS-LITORAL', u'UNI7', u'UNESP-SV', u'HCPA', u'FIOCRUZ-CPQRR', u'UPE', u'UNIPLI', u'UNIFESP - DIADEMA', u'UNESP-BAURU', u'UNIP\\xca', u'UNIT/ALAGOAS', u'FFIA', u'MPEG', u'UTFPR-MD', u'IVB', u'UNIALFA', u'UNIVBRASIL', u'FUAM', u'UNINTER', u'IFPR', u'CPAFRO / RO', u'IFAL', u'IFAM', u'ITAL', u'UNILA', u'HUJM', u'UNIHORIZONTES', u'UNILAB', u'EMBRAPA/CPATU', u'FG', u'UEAP', u'IFFARROUP', u'FEPECS-ESCS', u'UNIVERITAS UNG', u'MAST', u'UFSC - ARARANGUA', u'UFRRJ/NI', u'EMBRAPA-CPAFAC', u'UNESP-REITORIA', u'SBBQ', u'FMP', u'EMBRAPA-CNPA', u'FUCAPE-RJ', u'FIOCRUZ-CPQGM', u'LACEN/RO', u'UNESP-TUP\\xc3', u'FTC-SSA', u'UFOB', u'UNIARP', u'FACVEST', u'SBF', u'ITV MI', u'EACH', u'FORTEC', u'UNINTA', u'UFPI', u'UNESP-ARAR', u'EDUCATIE', u'IEAPM', u'UNESP-PP', u'CPQLMD/FIOCRUZ', u'UNESP-GUAR', u'USU', u'UNESP-IS', u'IFBAIANO', u'FIOCRUZ/RO', u'UNESP-FR', u'FIOCRUZ-NESC/CPQAM', u'FACEPD', u'UNIPAMPA - CA SUL', u'UNCISAL', u'ANDIFES', u'UFPB-RT', u'SBIBAE', u'FUCAPE-MA', u'INTA', u'FIOCRUZ-CPQLMD', u'IFBA', u'IFSULDEMINAS', u'UERGS', u'FPT', u'FESP', u'UNEMAT-SNP', u'IDOR', u'FCRB', u'FEPAGRO', u'FGV - DIREITO SP', u'UNESP-JAB', u'UDF', u'FIOCRUZ-RO', u'FASATC'])"
  131. ]
  132. },
  133. "execution_count": 774,
  134. "metadata": {},
  135. "output_type": "execute_result"
  136. }
  137. ],
  138. "source": [
  139. "diferenca_A"
  140. ]
  141. },
  142. {
  143. "cell_type": "code",
  144. "execution_count": 775,
  145. "metadata": {},
  146. "outputs": [
  147. {
  148. "data": {
  149. "text/plain": [
  150. "148"
  151. ]
  152. },
  153. "execution_count": 775,
  154. "metadata": {},
  155. "output_type": "execute_result"
  156. }
  157. ],
  158. "source": [
  159. "len(diferenca_A)"
  160. ]
  161. },
  162. {
  163. "cell_type": "code",
  164. "execution_count": 776,
  165. "metadata": {},
  166. "outputs": [],
  167. "source": [
  168. "diferenca_B = s_df_prog.difference(s_df_cad)"
  169. ]
  170. },
  171. {
  172. "cell_type": "code",
  173. "execution_count": 777,
  174. "metadata": {},
  175. "outputs": [
  176. {
  177. "data": {
  178. "text/plain": [
  179. "0"
  180. ]
  181. },
  182. "execution_count": 777,
  183. "metadata": {},
  184. "output_type": "execute_result"
  185. }
  186. ],
  187. "source": [
  188. "len(diferenca_B)"
  189. ]
  190. },
  191. {
  192. "cell_type": "code",
  193. "execution_count": 805,
  194. "metadata": {},
  195. "outputs": [],
  196. "source": [
  197. "intersec = s_df_cad | s_df_prog "
  198. ]
  199. },
  200. {
  201. "cell_type": "code",
  202. "execution_count": 807,
  203. "metadata": {},
  204. "outputs": [
  205. {
  206. "data": {
  207. "text/plain": [
  208. "522"
  209. ]
  210. },
  211. "execution_count": 807,
  212. "metadata": {},
  213. "output_type": "execute_result"
  214. }
  215. ],
  216. "source": [
  217. "len(intersec)"
  218. ]
  219. },
  220. {
  221. "cell_type": "code",
  222. "execution_count": 808,
  223. "metadata": {},
  224. "outputs": [],
  225. "source": [
  226. "uniao_com_cadastro = intersec | s_df_cad"
  227. ]
  228. },
  229. {
  230. "cell_type": "code",
  231. "execution_count": 809,
  232. "metadata": {},
  233. "outputs": [
  234. {
  235. "data": {
  236. "text/plain": [
  237. "522"
  238. ]
  239. },
  240. "execution_count": 809,
  241. "metadata": {},
  242. "output_type": "execute_result"
  243. }
  244. ],
  245. "source": [
  246. "len(uniao_com_cadastro)"
  247. ]
  248. },
  249. {
  250. "cell_type": "code",
  251. "execution_count": null,
  252. "metadata": {},
  253. "outputs": [],
  254. "source": [
  255. "df_dif = pd.DataFrame()\n",
  256. "for x in diferenca:\n",
  257. " df_dif['SG_ENTIDADE_ENSINO_Capes'] = df_dif['SG_ENTIDADE_ENSINO_Capes'].fillna(x)\n",
  258. " #df_dif['SG_ENTIDADE_ENSINO_Capes'] = x\n",
  259. " \n",
  260. "print df_dif"
  261. ]
  262. },
  263. {
  264. "cell_type": "code",
  265. "execution_count": null,
  266. "metadata": {},
  267. "outputs": [],
  268. "source": [
  269. "for group in [s_df_cad, s_df_merged]: # doctest: +SKIP\n",
  270. " group.discard(diferenca)\n",
  271. " print group\n"
  272. ]
  273. },
  274. {
  275. "cell_type": "code",
  276. "execution_count": 728,
  277. "metadata": {},
  278. "outputs": [],
  279. "source": [
  280. "s_df_cad = Set(df_cad['NM_ENTIDADE_ENSINO_Capes'])\n",
  281. "s_df_merged = Set(df_merged['NM_ENTIDADE_ENSINO_Capes'])"
  282. ]
  283. },
  284. {
  285. "cell_type": "code",
  286. "execution_count": 729,
  287. "metadata": {},
  288. "outputs": [],
  289. "source": [
  290. "diferenca = s_df_cad - s_df_merged"
  291. ]
  292. },
  293. {
  294. "cell_type": "code",
  295. "execution_count": 853,
  296. "metadata": {},
  297. "outputs": [
  298. {
  299. "data": {
  300. "text/plain": [
  301. "148"
  302. ]
  303. },
  304. "execution_count": 853,
  305. "metadata": {},
  306. "output_type": "execute_result"
  307. }
  308. ],
  309. "source": [
  310. "len(diferenca)\n"
  311. ]
  312. },
  313. {
  314. "cell_type": "code",
  315. "execution_count": 1038,
  316. "metadata": {
  317. "scrolled": true
  318. },
  319. "outputs": [],
  320. "source": [
  321. "var = '/var/tmp/solr_front/collections/capes/programas/download/'\n",
  322. "for root, dirs, files in os.walk(var):\n",
  323. " df_auxiliar = []\n",
  324. " for file in files:\n",
  325. " arquivo = codecs.open(os.path.join(root, file), 'r') # , encoding='latin-1')\n",
  326. " df_auxiliar = pd.read_csv(arquivo, sep=';', low_memory=False, encoding='latin-1')\n",
  327. " \n",
  328. " \n",
  329. " dict_sge = {}\n",
  330. " \n",
  331. "# for index, row in df_auxiliar.iterrows(): \n",
  332. "# if not row['SG_ENTIDADE_ENSINO'] in dict_sge1:\n",
  333. "# dict_sge1[row['SG_ENTIDADE_ENSINO']] = 1\n",
  334. " \n",
  335. " for index, row in df_auxiliar.iterrows(): \n",
  336. " if not row['NM_ENTIDADE_ENSINO'] in dict_sge:\n",
  337. " dict_sge[row['NM_ENTIDADE_ENSINO']] = 1\n",
  338. " \n",
  339. " "
  340. ]
  341. },
  342. {
  343. "cell_type": "code",
  344. "execution_count": 1039,
  345. "metadata": {},
  346. "outputs": [
  347. {
  348. "data": {
  349. "text/plain": [
  350. "376"
  351. ]
  352. },
  353. "execution_count": 1039,
  354. "metadata": {},
  355. "output_type": "execute_result"
  356. }
  357. ],
  358. "source": [
  359. "len(dict_sge)\n"
  360. ]
  361. },
  362. {
  363. "cell_type": "code",
  364. "execution_count": 1040,
  365. "metadata": {},
  366. "outputs": [],
  367. "source": [
  368. "dir = '/var/tmp/solr_front/collections/capes/programas/cadastro/'\n",
  369. "for root, dirs, files in os.walk(dir):\n",
  370. " for file in files:\n",
  371. " arquivo = codecs.open(os.path.join(root, file), 'r') # , encoding='latin-1')\n",
  372. " df_cad_temp = pd.read_csv(arquivo, sep=';', low_memory=False, encoding='latin-1')\n",
  373. " \n",
  374. " dict_cadastro = {}\n",
  375. " \n",
  376. "# for index, row in df_cad_temp.iterrows(): \n",
  377. "# if not row['SG_ENTIDADE_ENSINO_Capes'] in dict_cadastro:\n",
  378. "# dict_cadastro[row['SG_ENTIDADE_ENSINO_Capes']] = 1\n",
  379. "# \n",
  380. "\n",
  381. " for index, row in df_cad_temp.iterrows(): \n",
  382. " if not row['NM_ENTIDADE_ENSINO_Capes'] in dict_cadastro:\n",
  383. " dict_cadastro[row['NM_ENTIDADE_ENSINO_Capes']] = 1\n",
  384. " \n",
  385. " \n",
  386. " #print dict_cadastro.keys()\n",
  387. "\n",
  388. " "
  389. ]
  390. },
  391. {
  392. "cell_type": "code",
  393. "execution_count": 1041,
  394. "metadata": {},
  395. "outputs": [
  396. {
  397. "data": {
  398. "text/plain": [
  399. "553"
  400. ]
  401. },
  402. "execution_count": 1041,
  403. "metadata": {},
  404. "output_type": "execute_result"
  405. }
  406. ],
  407. "source": [
  408. "len(dict_cadastro)"
  409. ]
  410. },
  411. {
  412. "cell_type": "code",
  413. "execution_count": 1043,
  414. "metadata": {},
  415. "outputs": [],
  416. "source": [
  417. "s_dict_sge = Set(dict_sge)\n",
  418. "s_dict_cadastro = Set(dict_cadastro)\n",
  419. "diferenca = s_dict_cadastro.difference(s_dict_sge)"
  420. ]
  421. },
  422. {
  423. "cell_type": "code",
  424. "execution_count": null,
  425. "metadata": {},
  426. "outputs": [],
  427. "source": [
  428. "diferenca"
  429. ]
  430. },
  431. {
  432. "cell_type": "code",
  433. "execution_count": 1045,
  434. "metadata": {},
  435. "outputs": [
  436. {
  437. "data": {
  438. "text/plain": [
  439. "178"
  440. ]
  441. },
  442. "execution_count": 1045,
  443. "metadata": {},
  444. "output_type": "execute_result"
  445. }
  446. ],
  447. "source": [
  448. "len(diferenca)"
  449. ]
  450. },
  451. {
  452. "cell_type": "code",
  453. "execution_count": 1047,
  454. "metadata": {},
  455. "outputs": [],
  456. "source": [
  457. "dif = s_dict_sge.difference(s_dict_cadastro)"
  458. ]
  459. },
  460. {
  461. "cell_type": "code",
  462. "execution_count": 1048,
  463. "metadata": {},
  464. "outputs": [
  465. {
  466. "data": {
  467. "text/plain": [
  468. "Set([u'FUNDACAO OSWALDO CRUZ'])"
  469. ]
  470. },
  471. "execution_count": 1048,
  472. "metadata": {},
  473. "output_type": "execute_result"
  474. }
  475. ],
  476. "source": [
  477. "dif"
  478. ]
  479. },
  480. {
  481. "cell_type": "code",
  482. "execution_count": null,
  483. "metadata": {},
  484. "outputs": [],
  485. "source": []
  486. },
  487. {
  488. "cell_type": "code",
  489. "execution_count": null,
  490. "metadata": {},
  491. "outputs": [],
  492. "source": [
  493. "# --------------- testing again - STARTING FROM HERE --------------------------"
  494. ]
  495. },
  496. {
  497. "cell_type": "code",
  498. "execution_count": null,
  499. "metadata": {},
  500. "outputs": [],
  501. "source": []
  502. },
  503. {
  504. "cell_type": "code",
  505. "execution_count": null,
  506. "metadata": {},
  507. "outputs": [],
  508. "source": [
  509. "dir = '/var/tmp/solr_front/collections/capes/instituicoes/download'\n",
  510. "df_cad_temp = pd.DataFrame()\n",
  511. "for root, dirs, files in os.walk(dir):\n",
  512. " for file in files:\n",
  513. " print file\n",
  514. " arquivo = codecs.open(os.path.join(root, file), 'r') # , encoding='latin-1')\n",
  515. " df_cad_temp = pd.read_csv(arquivo, sep=';', low_memory=False, encoding='latin-1')\n",
  516. " #df_cad2 = df_cad_temp\n",
  517. " \n",
  518. " dict_cadastro = {}\n",
  519. " for index, row in df_cad_temp.iterrows(): \n",
  520. " if not row['NM_ENTIDADE_ENSINO_Capes'] in dict_cadastro:\n",
  521. " dict_cadastro[row['NM_ENTIDADE_ENSINO_Capes']] = 1\n"
  522. ]
  523. },
  524. {
  525. "cell_type": "code",
  526. "execution_count": 1248,
  527. "metadata": {},
  528. "outputs": [
  529. {
  530. "data": {
  531. "text/plain": [
  532. "580"
  533. ]
  534. },
  535. "execution_count": 1248,
  536. "metadata": {},
  537. "output_type": "execute_result"
  538. }
  539. ],
  540. "source": [
  541. "len(dict_cadastro)"
  542. ]
  543. },
  544. {
  545. "cell_type": "code",
  546. "execution_count": 1256,
  547. "metadata": {},
  548. "outputs": [],
  549. "source": [
  550. "df_cad2 = df_cad_temp.dropna(how = 'all',axis = 'columns')"
  551. ]
  552. },
  553. {
  554. "cell_type": "code",
  555. "execution_count": 1257,
  556. "metadata": {},
  557. "outputs": [],
  558. "source": [
  559. "df_cad2 = df_cad.dropna(how = 'all', axis = 'rows')"
  560. ]
  561. },
  562. {
  563. "cell_type": "code",
  564. "execution_count": 1258,
  565. "metadata": {},
  566. "outputs": [
  567. {
  568. "data": {
  569. "text/plain": [
  570. "AN_BASE 580\n",
  571. "SG_ENTIDADE_ENSINO_Capes 580\n",
  572. "NM_ENTIDADE_ENSINO_Capes 580\n",
  573. "CD_INST_GEI 580\n",
  574. "SG_INST_GEI 580\n",
  575. "NM_INST_GEI 580\n",
  576. "Codigo_do_Tipo_de_Instituicao 580\n",
  577. "Tipo_de_Instituicao 580\n",
  578. "CS_STATUS_JURIDICO 580\n",
  579. "DS_DEPENDENCIA_ADMINISTRATIVA 580\n",
  580. "Codigo_Natureza_Juridica-GEI 580\n",
  581. "Nome_Natureza_Juridica-GEI 580\n",
  582. "CD_ORGANIZACAO_ACADEMICA-GEI 580\n",
  583. "DS_ORGANIZACAO_ACADEMICA-GEI 580\n",
  584. "DS_ORGANIZACAO_ACADEMICA_Capes 576\n",
  585. "CD_Mantenedora 580\n",
  586. "NM_Mantenedora 580\n",
  587. "Unnamed: 17 0\n",
  588. "dtype: int64"
  589. ]
  590. },
  591. "execution_count": 1258,
  592. "metadata": {},
  593. "output_type": "execute_result"
  594. }
  595. ],
  596. "source": [
  597. "df_cad_temp.count()"
  598. ]
  599. },
  600. {
  601. "cell_type": "code",
  602. "execution_count": 1202,
  603. "metadata": {},
  604. "outputs": [],
  605. "source": [
  606. "var = '/var/tmp/solr_front/collections/capes/programas/download/'\n",
  607. "\n",
  608. "for root, dirs, files in os.walk(var):\n",
  609. " df_auxiliar = []\n",
  610. " for file in files:\n",
  611. " #print file\n",
  612. " arquivo = codecs.open(os.path.join(root, file), 'r') # , encoding='latin-1')\n",
  613. " df_auxiliar = pd.read_csv(arquivo, sep=';', low_memory=False, encoding='latin-1')\n",
  614. " \n",
  615. " dict_sge = {}\n",
  616. " \n",
  617. " for index, row in df_auxiliar.iterrows(): \n",
  618. " if not row['NM_ENTIDADE_ENSINO'] in dict_sge:\n",
  619. " dict_sge[row['NM_ENTIDADE_ENSINO']] = 1\n"
  620. ]
  621. },
  622. {
  623. "cell_type": "code",
  624. "execution_count": 1203,
  625. "metadata": {},
  626. "outputs": [
  627. {
  628. "data": {
  629. "text/plain": [
  630. "376"
  631. ]
  632. },
  633. "execution_count": 1203,
  634. "metadata": {},
  635. "output_type": "execute_result"
  636. }
  637. ],
  638. "source": [
  639. "len(dict_sge)\n"
  640. ]
  641. },
  642. {
  643. "cell_type": "code",
  644. "execution_count": 1204,
  645. "metadata": {},
  646. "outputs": [],
  647. "source": [
  648. "df_prog['SG_ENTIDADE_ENSINO_Capes'] = df_prog['SG_ENTIDADE_ENSINO']\n",
  649. "df_prog['NM_ENTIDADE_ENSINO_Capes'] = df_prog['NM_ENTIDADE_ENSINO']"
  650. ]
  651. },
  652. {
  653. "cell_type": "code",
  654. "execution_count": 1205,
  655. "metadata": {},
  656. "outputs": [
  657. {
  658. "data": {
  659. "text/plain": [
  660. "ANO_INICIO_PROGRAMA 3568\n",
  661. "AN_BASE 3568\n",
  662. "AN_INICIO_CURSO 3568\n",
  663. "CD_AREA_AVALIACAO 3568\n",
  664. "CD_CONCEITO_PROGRAMA 3568\n",
  665. "CD_PROGRAMA_IES 3568\n",
  666. "CS_STATUS_JURIDICO 3568\n",
  667. "DS_CLIENTELA_QUADRIENAL_2017 3568\n",
  668. "DS_DEPENDENCIA_ADMINISTRATIVA 3568\n",
  669. "DS_ORGANIZACAO_ACADEMICA 3568\n",
  670. "DS_SITUACAO_PROGRAMA 3568\n",
  671. "DT_SITUACAO_PROGRAMA 3568\n",
  672. "ID_ADD_FOTO_PROGRAMA 3568\n",
  673. "ID_ADD_FOTO_PROGRAMA_IES 3568\n",
  674. "IN_REDE 3568\n",
  675. "NM_AREA_AVALIACAO 3568\n",
  676. "NM_AREA_CONHECIMENTO 3568\n",
  677. "NM_ENTIDADE_ENSINO 3568\n",
  678. "NM_ESPECIALIDADE 3568\n",
  679. "NM_GRANDE_AREA_CONHECIMENTO 3568\n",
  680. "NM_GRAU_PROGRAMA 3568\n",
  681. "NM_MODALIDADE_PROGRAMA 3568\n",
  682. "NM_MUNICIPIO_PROGRAMA_IES 3568\n",
  683. "NM_PROGRAMA_IDIOMA 3568\n",
  684. "NM_PROGRAMA_IES 3568\n",
  685. "NM_REGIAO 3568\n",
  686. "NM_SUBAREA_CONHECIMENTO 3568\n",
  687. "SG_ENTIDADE_ENSINO 3568\n",
  688. "SG_ENTIDADE_ENSINO_REDE 69\n",
  689. "SG_UF_PROGRAMA 3568\n",
  690. "SG_ENTIDADE_ENSINO_Capes 3568\n",
  691. "NM_ENTIDADE_ENSINO_Capes 3568\n",
  692. "dtype: int64"
  693. ]
  694. },
  695. "execution_count": 1205,
  696. "metadata": {},
  697. "output_type": "execute_result"
  698. }
  699. ],
  700. "source": [
  701. "df_prog.count()\n"
  702. ]
  703. },
  704. {
  705. "cell_type": "code",
  706. "execution_count": null,
  707. "metadata": {},
  708. "outputs": [],
  709. "source": [
  710. "df_merged = df_prog.merge(df_cad, on=['SG_ENTIDADE_ENSINO_Capes', 'NM_ENTIDADE_ENSINO_Capes'])"
  711. ]
  712. },
  713. {
  714. "cell_type": "code",
  715. "execution_count": null,
  716. "metadata": {},
  717. "outputs": [],
  718. "source": []
  719. },
  720. {
  721. "cell_type": "code",
  722. "execution_count": 1206,
  723. "metadata": {},
  724. "outputs": [],
  725. "source": [
  726. "cad = Set(dict_cadastro)\n",
  727. "arq = Set(dict_sge)\n",
  728. "\n",
  729. "diferenca = arq.difference(cad)"
  730. ]
  731. },
  732. {
  733. "cell_type": "code",
  734. "execution_count": 1207,
  735. "metadata": {},
  736. "outputs": [
  737. {
  738. "data": {
  739. "text/plain": [
  740. "Set([])"
  741. ]
  742. },
  743. "execution_count": 1207,
  744. "metadata": {},
  745. "output_type": "execute_result"
  746. }
  747. ],
  748. "source": [
  749. "diferenca"
  750. ]
  751. },
  752. {
  753. "cell_type": "code",
  754. "execution_count": 1208,
  755. "metadata": {},
  756. "outputs": [
  757. {
  758. "data": {
  759. "text/plain": [
  760. "376"
  761. ]
  762. },
  763. "execution_count": 1208,
  764. "metadata": {},
  765. "output_type": "execute_result"
  766. }
  767. ],
  768. "source": [
  769. "len(cad)"
  770. ]
  771. },
  772. {
  773. "cell_type": "code",
  774. "execution_count": 1209,
  775. "metadata": {},
  776. "outputs": [
  777. {
  778. "data": {
  779. "text/plain": [
  780. "376"
  781. ]
  782. },
  783. "execution_count": 1209,
  784. "metadata": {},
  785. "output_type": "execute_result"
  786. }
  787. ],
  788. "source": [
  789. "len(arq)"
  790. ]
  791. }
  792. ],
  793. "metadata": {
  794. "kernelspec": {
  795. "display_name": "Python 2",
  796. "language": "python",
  797. "name": "jupyter2_python_2"
  798. },
  799. "language_info": {
  800. "codemirror_mode": {
  801. "name": "ipython",
  802. "version": 2
  803. },
  804. "file_extension": ".py",
  805. "mimetype": "text/x-python",
  806. "name": "python",
  807. "nbconvert_exporter": "python",
  808. "pygments_lexer": "ipython2",
  809. "version": "2.7.11"
  810. }
  811. },
  812. "nbformat": 4,
  813. "nbformat_minor": 2
  814. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement