Guest User

Untitled

a guest
Apr 9th, 2018
312
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.85 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Usando Pandas e subprocess para separar dados de um CSV"
  8. ]
  9. },
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. "## Objetivo: Ler os dados de um CSV e separar os dados por empresa em CSVs separados"
  15. ]
  16. },
  17. {
  18. "cell_type": "code",
  19. "execution_count": 1,
  20. "metadata": {},
  21. "outputs": [],
  22. "source": [
  23. "import csv\n",
  24. "import names\n",
  25. "import pandas as pd\n",
  26. "import subprocess"
  27. ]
  28. },
  29. {
  30. "cell_type": "code",
  31. "execution_count": 2,
  32. "metadata": {},
  33. "outputs": [],
  34. "source": [
  35. "# Gerando um CSV\n",
  36. "with open('/tmp/names.csv', 'w') as f:\n",
  37. " csv_writer = csv.writer(f)\n",
  38. " csv_writer.writerow(('nome', 'email', 'empresa'))\n",
  39. " for _ in range(1000):\n",
  40. " first_name = names.get_first_name()\n",
  41. " last_name = names.get_last_name()\n",
  42. " full_name = '{} {}'.format(first_name, last_name)\n",
  43. " email = '{}.{}@email.com'.format(first_name.lower(), last_name.lower())\n",
  44. " company = names.get_last_name()\n",
  45. " csv_writer.writerow((full_name, email, company))"
  46. ]
  47. },
  48. {
  49. "cell_type": "code",
  50. "execution_count": 3,
  51. "metadata": {},
  52. "outputs": [],
  53. "source": [
  54. "# Lendo o CSV\n",
  55. "filename = '/tmp/names.csv'"
  56. ]
  57. },
  58. {
  59. "cell_type": "code",
  60. "execution_count": 4,
  61. "metadata": {},
  62. "outputs": [],
  63. "source": [
  64. "# Criando um DataFrame\n",
  65. "df = pd.read_csv(filename)"
  66. ]
  67. },
  68. {
  69. "cell_type": "code",
  70. "execution_count": 5,
  71. "metadata": {},
  72. "outputs": [
  73. {
  74. "data": {
  75. "text/html": [
  76. "<div>\n",
  77. "<style scoped>\n",
  78. " .dataframe tbody tr th:only-of-type {\n",
  79. " vertical-align: middle;\n",
  80. " }\n",
  81. "\n",
  82. " .dataframe tbody tr th {\n",
  83. " vertical-align: top;\n",
  84. " }\n",
  85. "\n",
  86. " .dataframe thead th {\n",
  87. " text-align: right;\n",
  88. " }\n",
  89. "</style>\n",
  90. "<table border=\"1\" class=\"dataframe\">\n",
  91. " <thead>\n",
  92. " <tr style=\"text-align: right;\">\n",
  93. " <th></th>\n",
  94. " <th>nome</th>\n",
  95. " <th>email</th>\n",
  96. " <th>empresa</th>\n",
  97. " </tr>\n",
  98. " </thead>\n",
  99. " <tbody>\n",
  100. " <tr>\n",
  101. " <th>2</th>\n",
  102. " <td>Mabel Brund</td>\n",
  103. " <td>mabel.brund@email.com</td>\n",
  104. " <td>Barrett</td>\n",
  105. " </tr>\n",
  106. " <tr>\n",
  107. " <th>0</th>\n",
  108. " <td>Diann Driskell</td>\n",
  109. " <td>diann.driskell@email.com</td>\n",
  110. " <td>Brown</td>\n",
  111. " </tr>\n",
  112. " <tr>\n",
  113. " <th>3</th>\n",
  114. " <td>Donna Ivy</td>\n",
  115. " <td>donna.ivy@email.com</td>\n",
  116. " <td>Cheatom</td>\n",
  117. " </tr>\n",
  118. " <tr>\n",
  119. " <th>4</th>\n",
  120. " <td>Sandy Sutton</td>\n",
  121. " <td>sandy.sutton@email.com</td>\n",
  122. " <td>Shannon</td>\n",
  123. " </tr>\n",
  124. " <tr>\n",
  125. " <th>1</th>\n",
  126. " <td>Patricia Watkins</td>\n",
  127. " <td>patricia.watkins@email.com</td>\n",
  128. " <td>Yarbrough</td>\n",
  129. " </tr>\n",
  130. " </tbody>\n",
  131. "</table>\n",
  132. "</div>"
  133. ],
  134. "text/plain": [
  135. " nome email empresa\n",
  136. "2 Mabel Brund mabel.brund@email.com Barrett\n",
  137. "0 Diann Driskell diann.driskell@email.com Brown\n",
  138. "3 Donna Ivy donna.ivy@email.com Cheatom\n",
  139. "4 Sandy Sutton sandy.sutton@email.com Shannon\n",
  140. "1 Patricia Watkins patricia.watkins@email.com Yarbrough"
  141. ]
  142. },
  143. "execution_count": 5,
  144. "metadata": {},
  145. "output_type": "execute_result"
  146. }
  147. ],
  148. "source": [
  149. "# Mostrando os primeiros registros\n",
  150. "df.head().sort_values(by=['empresa'])"
  151. ]
  152. },
  153. {
  154. "cell_type": "code",
  155. "execution_count": 6,
  156. "metadata": {},
  157. "outputs": [
  158. {
  159. "data": {
  160. "text/html": [
  161. "<div>\n",
  162. "<style scoped>\n",
  163. " .dataframe tbody tr th:only-of-type {\n",
  164. " vertical-align: middle;\n",
  165. " }\n",
  166. "\n",
  167. " .dataframe tbody tr th {\n",
  168. " vertical-align: top;\n",
  169. " }\n",
  170. "\n",
  171. " .dataframe thead tr th {\n",
  172. " text-align: left;\n",
  173. " }\n",
  174. "\n",
  175. " .dataframe thead tr:last-of-type th {\n",
  176. " text-align: right;\n",
  177. " }\n",
  178. "</style>\n",
  179. "<table border=\"1\" class=\"dataframe\">\n",
  180. " <thead>\n",
  181. " <tr>\n",
  182. " <th></th>\n",
  183. " <th>nome</th>\n",
  184. " <th>email</th>\n",
  185. " </tr>\n",
  186. " <tr>\n",
  187. " <th></th>\n",
  188. " <th>count</th>\n",
  189. " <th>count</th>\n",
  190. " </tr>\n",
  191. " <tr>\n",
  192. " <th>empresa</th>\n",
  193. " <th></th>\n",
  194. " <th></th>\n",
  195. " </tr>\n",
  196. " </thead>\n",
  197. " <tbody>\n",
  198. " <tr>\n",
  199. " <th>Abrego</th>\n",
  200. " <td>1</td>\n",
  201. " <td>1</td>\n",
  202. " </tr>\n",
  203. " <tr>\n",
  204. " <th>Acosta</th>\n",
  205. " <td>1</td>\n",
  206. " <td>1</td>\n",
  207. " </tr>\n",
  208. " <tr>\n",
  209. " <th>Adams</th>\n",
  210. " <td>1</td>\n",
  211. " <td>1</td>\n",
  212. " </tr>\n",
  213. " <tr>\n",
  214. " <th>Akins</th>\n",
  215. " <td>1</td>\n",
  216. " <td>1</td>\n",
  217. " </tr>\n",
  218. " <tr>\n",
  219. " <th>Alarcon</th>\n",
  220. " <td>1</td>\n",
  221. " <td>1</td>\n",
  222. " </tr>\n",
  223. " </tbody>\n",
  224. "</table>\n",
  225. "</div>"
  226. ],
  227. "text/plain": [
  228. " nome email\n",
  229. " count count\n",
  230. "empresa \n",
  231. "Abrego 1 1\n",
  232. "Acosta 1 1\n",
  233. "Adams 1 1\n",
  234. "Akins 1 1\n",
  235. "Alarcon 1 1"
  236. ]
  237. },
  238. "execution_count": 6,
  239. "metadata": {},
  240. "output_type": "execute_result"
  241. }
  242. ],
  243. "source": [
  244. "# Contando quantos dados nós temos por empresa\n",
  245. "df.groupby(['empresa']).agg(['count']).head()"
  246. ]
  247. },
  248. {
  249. "cell_type": "code",
  250. "execution_count": 7,
  251. "metadata": {},
  252. "outputs": [
  253. {
  254. "data": {
  255. "text/plain": [
  256. "Johnson 13\n",
  257. "Brown 12\n",
  258. "Davis 12\n",
  259. "Smith 12\n",
  260. "Williams 6\n",
  261. "Name: empresa, dtype: int64"
  262. ]
  263. },
  264. "execution_count": 7,
  265. "metadata": {},
  266. "output_type": "execute_result"
  267. }
  268. ],
  269. "source": [
  270. "# Uma contagem mais simples, quantas vezes a empresa se repete?\n",
  271. "df['empresa'].value_counts().head()"
  272. ]
  273. },
  274. {
  275. "cell_type": "code",
  276. "execution_count": 8,
  277. "metadata": {},
  278. "outputs": [],
  279. "source": [
  280. "# Lendo os dados do DataFrame\n",
  281. "for i, row in df.iterrows():\n",
  282. " # Atribuição múltipla\n",
  283. " nome, email, empresa = row['nome'], row['email'], row['empresa']\n",
  284. " # Rodando um comando do shell script\n",
  285. " subprocess.call(\"mkdir -p /tmp/subpastas/{}\".format(empresa), shell=True)\n",
  286. " # Nome dos arquivos de saida /tmp/subpastas/empresa/empresa.txt'\n",
  287. " filename = '/tmp/subpastas/{0}/{0}.txt'.format(empresa)\n",
  288. " # Abrindo o arquivo para escrever os dados\n",
  289. " with open(filename, 'a') as f:\n",
  290. " texto = '{},{},{}\\n'.format(nome, email, empresa)\n",
  291. " # Escrevendo os dados no arquivo\n",
  292. " f.write(texto)"
  293. ]
  294. },
  295. {
  296. "cell_type": "code",
  297. "execution_count": 9,
  298. "metadata": {},
  299. "outputs": [],
  300. "source": [
  301. "for i, row in df.iterrows():\n",
  302. " nome, email, empresa = row['nome'], row['email'], row['empresa']\n",
  303. " pasta = '/tmp/subpastas2'\n",
  304. " subprocess.call(\"mkdir -p {}/{}\".format(pasta, empresa), shell=True)\n",
  305. " filename = '{0}/{1}/{1}.txt'.format(pasta, empresa)\n",
  306. " texto = '{},{},{}'.format(nome, email, empresa)\n",
  307. " # Salvando direto no arquivo com shell script\n",
  308. " subprocess.call(\"echo '{1}' >> {0}\".format(filename, texto), shell=True)"
  309. ]
  310. }
  311. ],
  312. "metadata": {
  313. "kernelspec": {
  314. "display_name": "Python 3",
  315. "language": "python",
  316. "name": "python3"
  317. },
  318. "language_info": {
  319. "codemirror_mode": {
  320. "name": "ipython",
  321. "version": 3
  322. },
  323. "file_extension": ".py",
  324. "mimetype": "text/x-python",
  325. "name": "python",
  326. "nbconvert_exporter": "python",
  327. "pygments_lexer": "ipython3",
  328. "version": "3.5.2"
  329. }
  330. },
  331. "nbformat": 4,
  332. "nbformat_minor": 2
  333. }
Add Comment
Please, Sign In to add comment