Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "#!/usr/bin/env python\n",
- "# -*- coding: utf-8 -*-\n",
- "import pandas as pd\n",
- "import os\n",
- "import codecs\n",
- "from sets import Set\n",
- "import csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# ------ checks the listed fields for duplicates - each row is True if it repeats an earlier row, False otherwise --------"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Flag registry rows whose (name, code) pair repeats an earlier row.\n",
- "df_cad.duplicated(subset=['NM_ENTIDADE_ENSINO_Capes', 'SG_ENTIDADE_ENSINO_Capes'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# ---- Checking the difference in counts after the merge, and whether any fields are duplicated, using Set --------"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 768,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Distinct institution codes found in the registry and in the programs table.\n",
- "s_df_cad, s_df_prog = (Set(frame['SG_ENTIDADE_ENSINO_Capes']) for frame in (df_cad, df_prog))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 769,
- "metadata": {},
- "outputs": [],
- "source": [
- "#diferenca = s_df_cad - s_df_merged"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 770,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Codes that appear in exactly one of the two sets.\n",
- "diferenca = s_df_cad ^ s_df_prog"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 771,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Set([u'IFCE - SOBRAL', u'UNIFATEA', u'UNESP-ASSIS', u'CESAR-AM', u'IFMG', u'UNESP-SJC', u'IFNMG', u'UNESP-RC', u'EGN', u'UNICARIOCA', u'GHC', u'IFPI-FLORIANO', u'FEBASP', u'UNESP/SV', u'FIPECAFI', u'UNIFACCAMP', u'IESB', u'UNESP-BOT', u'FICSAE', u'IBDT', u'UFSC - BLUMENAU', u'UNEAL', u'IETEC', u'UNIFEMM', u'FIOCRUZ-EGS BRAS\\xcdLIA', u'CENARGEN', u'UFJF-GV', u'USP/EACH', u'UFPB-JP', u'EGS/FIOCRUZ BRAS\\xcdLIA', u'UEMS/DOURADOS', u'UFESBA', u'UNESP-MAR', u'UNESP-SJRP', u'FIAMFAAM', u'FCMMG', u'IPEA', u'FACENE', u'FADIP', u'FADIC', u'UFCA', u'UNICHRISTUS', u'FDC', u'EMBRAPA', u'UNESP-IFT', u'IFS', u'UNESP-ARA\\xc7', u'ENAP', u'FABAPAR', u'IFC', u'IFRS', u'IFRR', u'EMBRAPA-CPAFAP', u'UNILAB - REDEN\\xc7\\xc3O', u'UNESP/TUP\\xc3', u'UNESP-SOR', u'UNIMES', u'UFRGS-LITORAL', u'UNI7', u'UNESP-SV', u'HCPA', u'FIOCRUZ-CPQRR', u'UPE', u'UNIPLI', u'UNIFESP - DIADEMA', u'UNESP-BAURU', u'UNIP\\xca', u'UNIT/ALAGOAS', u'FFIA', u'MPEG', u'UTFPR-MD', u'IVB', u'UNIALFA', u'UNIVBRASIL', u'FUAM', u'UNINTER', u'IFPR', u'CPAFRO / RO', u'IFAL', u'IFAM', u'ITAL', u'UNILA', u'HUJM', u'UNIHORIZONTES', u'UNILAB', u'EMBRAPA/CPATU', u'FG', u'UEAP', u'IFFARROUP', u'FEPECS-ESCS', u'UNIVERITAS UNG', u'MAST', u'UFSC - ARARANGUA', u'UFRRJ/NI', u'EMBRAPA-CPAFAC', u'UNESP-REITORIA', u'SBBQ', u'FMP', u'EMBRAPA-CNPA', u'FUCAPE-RJ', u'FIOCRUZ-CPQGM', u'LACEN/RO', u'UNESP-TUP\\xc3', u'FTC-SSA', u'UFOB', u'UNIARP', u'FACVEST', u'SBF', u'ITV MI', u'EACH', u'FORTEC', u'UNINTA', u'UFPI', u'UNESP-ARAR', u'EDUCATIE', u'IEAPM', u'UNESP-PP', u'CPQLMD/FIOCRUZ', u'UNESP-GUAR', u'USU', u'UNESP-IS', u'IFBAIANO', u'FIOCRUZ/RO', u'UNESP-FR', u'FIOCRUZ-NESC/CPQAM', u'FACEPD', u'UNIPAMPA - CA SUL', u'UNCISAL', u'ANDIFES', u'UFPB-RT', u'SBIBAE', u'FUCAPE-MA', u'INTA', u'FIOCRUZ-CPQLMD', u'IFBA', u'IFSULDEMINAS', u'UERGS', u'FPT', u'FESP', u'UNEMAT-SNP', u'IDOR', u'FCRB', u'FEPAGRO', u'FGV - DIREITO SP', u'UNESP-JAB', u'UDF', u'FIOCRUZ-RO', u'FASATC'])"
- ]
- },
- "execution_count": 771,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "diferenca"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 772,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "148"
- ]
- },
- "execution_count": 772,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(diferenca)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 773,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Codes in the registry that are missing from the programs table.\n",
- "diferenca_A = s_df_cad - s_df_prog"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 774,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Set([u'IFCE - SOBRAL', u'UNIFATEA', u'UNESP-ASSIS', u'CESAR-AM', u'IFMG', u'UNESP-SJC', u'IFNMG', u'UNESP-RC', u'EGN', u'UNICARIOCA', u'GHC', u'IFPI-FLORIANO', u'FEBASP', u'UNESP/SV', u'FIPECAFI', u'UNIFACCAMP', u'IESB', u'UNESP-BOT', u'FICSAE', u'IBDT', u'UFSC - BLUMENAU', u'UNEAL', u'IETEC', u'UNIFEMM', u'FIOCRUZ-EGS BRAS\\xcdLIA', u'CENARGEN', u'UFJF-GV', u'USP/EACH', u'UFPB-JP', u'EGS/FIOCRUZ BRAS\\xcdLIA', u'UEMS/DOURADOS', u'UFESBA', u'UNESP-MAR', u'UNESP-SJRP', u'FIAMFAAM', u'FCMMG', u'IPEA', u'FACENE', u'FADIP', u'FADIC', u'UFCA', u'UNICHRISTUS', u'FDC', u'EMBRAPA', u'UNESP-IFT', u'IFS', u'UNESP-ARA\\xc7', u'ENAP', u'FABAPAR', u'IFC', u'IFRS', u'IFRR', u'EMBRAPA-CPAFAP', u'UNILAB - REDEN\\xc7\\xc3O', u'UNESP/TUP\\xc3', u'UNESP-SOR', u'UNIMES', u'UFRGS-LITORAL', u'UNI7', u'UNESP-SV', u'HCPA', u'FIOCRUZ-CPQRR', u'UPE', u'UNIPLI', u'UNIFESP - DIADEMA', u'UNESP-BAURU', u'UNIP\\xca', u'UNIT/ALAGOAS', u'FFIA', u'MPEG', u'UTFPR-MD', u'IVB', u'UNIALFA', u'UNIVBRASIL', u'FUAM', u'UNINTER', u'IFPR', u'CPAFRO / RO', u'IFAL', u'IFAM', u'ITAL', u'UNILA', u'HUJM', u'UNIHORIZONTES', u'UNILAB', u'EMBRAPA/CPATU', u'FG', u'UEAP', u'IFFARROUP', u'FEPECS-ESCS', u'UNIVERITAS UNG', u'MAST', u'UFSC - ARARANGUA', u'UFRRJ/NI', u'EMBRAPA-CPAFAC', u'UNESP-REITORIA', u'SBBQ', u'FMP', u'EMBRAPA-CNPA', u'FUCAPE-RJ', u'FIOCRUZ-CPQGM', u'LACEN/RO', u'UNESP-TUP\\xc3', u'FTC-SSA', u'UFOB', u'UNIARP', u'FACVEST', u'SBF', u'ITV MI', u'EACH', u'FORTEC', u'UNINTA', u'UFPI', u'UNESP-ARAR', u'EDUCATIE', u'IEAPM', u'UNESP-PP', u'CPQLMD/FIOCRUZ', u'UNESP-GUAR', u'USU', u'UNESP-IS', u'IFBAIANO', u'FIOCRUZ/RO', u'UNESP-FR', u'FIOCRUZ-NESC/CPQAM', u'FACEPD', u'UNIPAMPA - CA SUL', u'UNCISAL', u'ANDIFES', u'UFPB-RT', u'SBIBAE', u'FUCAPE-MA', u'INTA', u'FIOCRUZ-CPQLMD', u'IFBA', u'IFSULDEMINAS', u'UERGS', u'FPT', u'FESP', u'UNEMAT-SNP', u'IDOR', u'FCRB', u'FEPAGRO', u'FGV - DIREITO SP', u'UNESP-JAB', u'UDF', u'FIOCRUZ-RO', u'FASATC'])"
- ]
- },
- "execution_count": 774,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "diferenca_A"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 775,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "148"
- ]
- },
- "execution_count": 775,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(diferenca_A)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 776,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Codes in the programs table that are missing from the registry.\n",
- "diferenca_B = s_df_prog - s_df_cad"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 777,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 777,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(diferenca_B)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 805,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Codes present in BOTH sets. `|` is a union, which made this identical to the\n",
- "# union computed in the following cell; `&` gives the intersection the name promises.\n",
- "intersec = s_df_cad & s_df_prog"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 807,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "522"
- ]
- },
- "execution_count": 807,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(intersec)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 808,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Every code that is in `intersec` or in the registry.\n",
- "uniao_com_cadastro = intersec.union(s_df_cad)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 809,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "522"
- ]
- },
- "execution_count": 809,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(uniao_com_cadastro)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# One row per institution code in `diferenca`.\n",
- "# Indexing a column on an empty DataFrame raises KeyError, and fillna cannot\n",
- "# create rows, so the frame is built from the set in a single step instead of\n",
- "# being filled inside a loop. Sorting makes the row order reproducible.\n",
- "df_dif = pd.DataFrame({'SG_ENTIDADE_ENSINO_Capes': sorted(diferenca)})\n",
- "df_dif"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Show each set without the entries listed in `diferenca`.\n",
- "# `discard` removes one element, so passing the whole `diferenca` set never\n",
- "# removed its members. `difference` drops every member and returns a new set,\n",
- "# leaving the originals untouched so this cell can be re-run safely.\n",
- "for group in [s_df_cad, s_df_merged]:\n",
- "    print(group.difference(diferenca))\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 728,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Distinct institution names found in the registry and in the merged table.\n",
- "s_df_cad, s_df_merged = (Set(frame['NM_ENTIDADE_ENSINO_Capes']) for frame in (df_cad, df_merged))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 729,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Registry names that did not survive the merge.\n",
- "diferenca = s_df_cad.difference(s_df_merged)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 853,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "148"
- ]
- },
- "execution_count": 853,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(diferenca)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1038,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "var = '/var/tmp/solr_front/collections/capes/programas/download/'\n",
- "\n",
- "# Read every CSV found under `var` and stack them, so that no file is\n",
- "# silently replaced by the next one read. Handing the path to pandas lets it\n",
- "# open and close the file itself (the earlier handle was never closed).\n",
- "frames = []\n",
- "for root, dirs, files in os.walk(var):\n",
- "    for name in sorted(files):\n",
- "        frames.append(pd.read_csv(os.path.join(root, name), sep=';',\n",
- "                                  low_memory=False, encoding='latin-1'))\n",
- "\n",
- "# With no files, fall back to an empty frame that still has the expected\n",
- "# column, so the cell keeps running instead of failing on a plain list.\n",
- "if frames:\n",
- "    df_auxiliar = pd.concat(frames, ignore_index=True)\n",
- "else:\n",
- "    df_auxiliar = pd.DataFrame(columns=['NM_ENTIDADE_ENSINO'])\n",
- "\n",
- "# Distinct institution names, kept as a {name: 1} dict because other cells\n",
- "# call len() and Set() on it. Missing names are skipped.\n",
- "dict_sge = dict.fromkeys(df_auxiliar['NM_ENTIDADE_ENSINO'].dropna().unique(), 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1039,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "376"
- ]
- },
- "execution_count": 1039,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(dict_sge)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1040,
- "metadata": {},
- "outputs": [],
- "source": [
- "# NOTE: `dir` shadows the builtin of the same name; the name is kept only so\n",
- "# nothing that may rely on it breaks.\n",
- "dir = '/var/tmp/solr_front/collections/capes/programas/cadastro/'\n",
- "\n",
- "# Read every CSV found under `dir` and stack them, so that no file is\n",
- "# silently replaced by the next one read. Handing the path to pandas lets it\n",
- "# open and close the file itself (the earlier handle was never closed).\n",
- "frames = []\n",
- "for root, dirs, files in os.walk(dir):\n",
- "    for name in sorted(files):\n",
- "        frames.append(pd.read_csv(os.path.join(root, name), sep=';',\n",
- "                                  low_memory=False, encoding='latin-1'))\n",
- "\n",
- "# With no files, fall back to an empty frame that still has the expected\n",
- "# column, so the cell keeps running.\n",
- "if frames:\n",
- "    df_cad_temp = pd.concat(frames, ignore_index=True)\n",
- "else:\n",
- "    df_cad_temp = pd.DataFrame(columns=['NM_ENTIDADE_ENSINO_Capes'])\n",
- "\n",
- "# Distinct registry names, kept as a {name: 1} dict because other cells call\n",
- "# len() and Set() on it. Missing names are skipped.\n",
- "dict_cadastro = dict.fromkeys(df_cad_temp['NM_ENTIDADE_ENSINO_Capes'].dropna().unique(), 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1041,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "553"
- ]
- },
- "execution_count": 1041,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(dict_cadastro)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1043,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Names in the registry dict that never show up in the programs dict.\n",
- "s_dict_sge, s_dict_cadastro = Set(dict_sge), Set(dict_cadastro)\n",
- "diferenca = s_dict_cadastro - s_dict_sge"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "diferenca"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1045,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "178"
- ]
- },
- "execution_count": 1045,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(diferenca)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1047,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Names in the programs dict that are absent from the registry dict.\n",
- "dif = s_dict_sge - s_dict_cadastro"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1048,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Set([u'FUNDACAO OSWALDO CRUZ'])"
- ]
- },
- "execution_count": 1048,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dif"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# --------------- testing again - FROM HERE ON --------------------------"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# NOTE: `dir` shadows the builtin of the same name; the name is kept only so\n",
- "# nothing that may rely on it breaks.\n",
- "dir = '/var/tmp/solr_front/collections/capes/instituicoes/download'\n",
- "\n",
- "# Read every CSV found under `dir` and stack them, so that no file is\n",
- "# silently replaced by the next one read. Handing the path to pandas lets it\n",
- "# open and close the file itself (the earlier handle was never closed).\n",
- "frames = []\n",
- "for root, dirs, files in os.walk(dir):\n",
- "    for name in sorted(files):\n",
- "        print(name)\n",
- "        frames.append(pd.read_csv(os.path.join(root, name), sep=';',\n",
- "                                  low_memory=False, encoding='latin-1'))\n",
- "\n",
- "# With no files, fall back to an empty frame that still has the expected\n",
- "# column, so the cell keeps running.\n",
- "if frames:\n",
- "    df_cad_temp = pd.concat(frames, ignore_index=True)\n",
- "else:\n",
- "    df_cad_temp = pd.DataFrame(columns=['NM_ENTIDADE_ENSINO_Capes'])\n",
- "\n",
- "# Distinct registry names, kept as a {name: 1} dict because other cells call\n",
- "# len() and Set() on it. Missing names are skipped.\n",
- "dict_cadastro = dict.fromkeys(df_cad_temp['NM_ENTIDADE_ENSINO_Capes'].dropna().unique(), 1)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1248,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "580"
- ]
- },
- "execution_count": 1248,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(dict_cadastro)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1256,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_cad2 = df_cad_temp.dropna(how = 'all',axis = 'columns')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1257,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Drop all-empty rows from the frame whose all-empty columns were just removed.\n",
- "# Reading from `df_cad` here replaced `df_cad2` with an unrelated frame and\n",
- "# threw that column cleanup away.\n",
- "df_cad2 = df_cad2.dropna(how = 'all', axis = 'rows')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1258,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AN_BASE 580\n",
- "SG_ENTIDADE_ENSINO_Capes 580\n",
- "NM_ENTIDADE_ENSINO_Capes 580\n",
- "CD_INST_GEI 580\n",
- "SG_INST_GEI 580\n",
- "NM_INST_GEI 580\n",
- "Codigo_do_Tipo_de_Instituicao 580\n",
- "Tipo_de_Instituicao 580\n",
- "CS_STATUS_JURIDICO 580\n",
- "DS_DEPENDENCIA_ADMINISTRATIVA 580\n",
- "Codigo_Natureza_Juridica-GEI 580\n",
- "Nome_Natureza_Juridica-GEI 580\n",
- "CD_ORGANIZACAO_ACADEMICA-GEI 580\n",
- "DS_ORGANIZACAO_ACADEMICA-GEI 580\n",
- "DS_ORGANIZACAO_ACADEMICA_Capes 576\n",
- "CD_Mantenedora 580\n",
- "NM_Mantenedora 580\n",
- "Unnamed: 17 0\n",
- "dtype: int64"
- ]
- },
- "execution_count": 1258,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_cad_temp.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1202,
- "metadata": {},
- "outputs": [],
- "source": [
- "var = '/var/tmp/solr_front/collections/capes/programas/download/'\n",
- "\n",
- "# Read every CSV found under `var` and stack them, so that no file is\n",
- "# silently replaced by the next one read. Handing the path to pandas lets it\n",
- "# open and close the file itself (the earlier handle was never closed).\n",
- "frames = []\n",
- "for root, dirs, files in os.walk(var):\n",
- "    for name in sorted(files):\n",
- "        frames.append(pd.read_csv(os.path.join(root, name), sep=';',\n",
- "                                  low_memory=False, encoding='latin-1'))\n",
- "\n",
- "# With no files, fall back to an empty frame that still has the expected\n",
- "# column, so the cell keeps running instead of failing on a plain list.\n",
- "if frames:\n",
- "    df_auxiliar = pd.concat(frames, ignore_index=True)\n",
- "else:\n",
- "    df_auxiliar = pd.DataFrame(columns=['NM_ENTIDADE_ENSINO'])\n",
- "\n",
- "# Distinct institution names, kept as a {name: 1} dict because other cells\n",
- "# call len() and Set() on it. Missing names are skipped.\n",
- "dict_sge = dict.fromkeys(df_auxiliar['NM_ENTIDADE_ENSINO'].dropna().unique(), 1)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1203,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "376"
- ]
- },
- "execution_count": 1203,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(dict_sge)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1204,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Mirror the code and name columns under the `_Capes` names used by the registry,\n",
- "# so both tables share the same join keys.\n",
- "for base_col in ('SG_ENTIDADE_ENSINO', 'NM_ENTIDADE_ENSINO'):\n",
- "    df_prog[base_col + '_Capes'] = df_prog[base_col]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1205,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "ANO_INICIO_PROGRAMA 3568\n",
- "AN_BASE 3568\n",
- "AN_INICIO_CURSO 3568\n",
- "CD_AREA_AVALIACAO 3568\n",
- "CD_CONCEITO_PROGRAMA 3568\n",
- "CD_PROGRAMA_IES 3568\n",
- "CS_STATUS_JURIDICO 3568\n",
- "DS_CLIENTELA_QUADRIENAL_2017 3568\n",
- "DS_DEPENDENCIA_ADMINISTRATIVA 3568\n",
- "DS_ORGANIZACAO_ACADEMICA 3568\n",
- "DS_SITUACAO_PROGRAMA 3568\n",
- "DT_SITUACAO_PROGRAMA 3568\n",
- "ID_ADD_FOTO_PROGRAMA 3568\n",
- "ID_ADD_FOTO_PROGRAMA_IES 3568\n",
- "IN_REDE 3568\n",
- "NM_AREA_AVALIACAO 3568\n",
- "NM_AREA_CONHECIMENTO 3568\n",
- "NM_ENTIDADE_ENSINO 3568\n",
- "NM_ESPECIALIDADE 3568\n",
- "NM_GRANDE_AREA_CONHECIMENTO 3568\n",
- "NM_GRAU_PROGRAMA 3568\n",
- "NM_MODALIDADE_PROGRAMA 3568\n",
- "NM_MUNICIPIO_PROGRAMA_IES 3568\n",
- "NM_PROGRAMA_IDIOMA 3568\n",
- "NM_PROGRAMA_IES 3568\n",
- "NM_REGIAO 3568\n",
- "NM_SUBAREA_CONHECIMENTO 3568\n",
- "SG_ENTIDADE_ENSINO 3568\n",
- "SG_ENTIDADE_ENSINO_REDE 69\n",
- "SG_UF_PROGRAMA 3568\n",
- "SG_ENTIDADE_ENSINO_Capes 3568\n",
- "NM_ENTIDADE_ENSINO_Capes 3568\n",
- "dtype: int64"
- ]
- },
- "execution_count": 1205,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_prog.count()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Join programs to the registry on the (code, name) pair, using the default join type.\n",
- "merge_keys = ['SG_ENTIDADE_ENSINO_Capes', 'NM_ENTIDADE_ENSINO_Capes']\n",
- "df_merged = pd.merge(df_prog, df_cad, on=merge_keys)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1206,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Names in the programs dict (`arq`) that are absent from the registry dict (`cad`).\n",
- "cad, arq = Set(dict_cadastro), Set(dict_sge)\n",
- "diferenca = arq - cad"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1207,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Set([])"
- ]
- },
- "execution_count": 1207,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "diferenca"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1208,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "376"
- ]
- },
- "execution_count": 1208,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(cad)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1209,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "376"
- ]
- },
- "execution_count": 1209,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(arq)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "jupyter2_python_2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement