Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "metadata": {
- "toc": true
- },
- "cell_type": "markdown",
- "source": "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2018-06-12T17:21:56.354446Z",
- "start_time": "2018-06-12T17:21:56.333079Z"
- },
- "extensions": {
- "jupyter_dashboards": {
- "version": 1,
- "views": {
- "grid_default": {
- "hidden": true
- },
- "report_default": {}
- }
- }
- },
- "hide_input": false,
- "trusted": true
- },
- "cell_type": "code",
- "source": "import xlrd\nimport os\nimport sys\nimport csv\nimport hashlib\nmd5 = lambda s: hashlib.md5(s).hexdigest()",
- "execution_count": 2,
- "outputs": []
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2018-06-12T17:22:02.924670Z",
- "start_time": "2018-06-12T17:22:02.762354Z"
- },
- "extensions": {
- "jupyter_dashboards": {
- "version": 1,
- "views": {
- "grid_default": {
- "hidden": true
- },
- "report_default": {}
- }
- }
- },
- "hide_input": false,
- "trusted": true
- },
- "cell_type": "code",
- "source": "def clean_idfa(idfa):\n \"\"\"\n 清洗 idfa 数据格式\n \"\"\"\n try:\n if isinstance(idfa, float): # 检查是否 float 型数据\n return None\n idfa = idfa.upper()\n if len(idfa) == 32:\n idfa = '-'.join([idfa[:8], idfa[8:12], idfa[12:16], idfa[16:20], idfa[20:]])\n except: # 有异常直接 pass 掉\n return None \n return idfa\n\ndef clean_imei(imei):\n \"\"\"\n 清洗 imei 的数据格式\n \"\"\"\n try:\n if len(imei) != 32: # 如果不是 32 位则是异常数据\n return None\n return imei.lower() # 统一转为小写\n except:\n return None\n \ndef clean_excel_file(file_name):\n \"\"\"\n 清洗 Excel 文件的数据\n \"\"\"\n \n idfa_set = set()\n imei_md5_set = set()\n \n wb = xlrd.open_workbook(file_name) # 使用 xlrd 库读取 excel 文件\n sheet = wb.sheets()[0]\n nrows = sheet.nrows #获取第一张表格的行数赋值给nrows\n for i in xrange (1, nrows): #用一个for循环遍历所有的行数\n row = sheet.row_values(i) #打印所有遍历到的行数的内容\n idfa = row[0]\n idfa_ = row[1]\n imei_md5 = row[4]\n\n if idfa != u'':\n idfa = clean_idfa(idfa)\n if idfa: # 有数据则为正常数据\n idfa_set.add(idfa)\n if idfa_ != u'':\n idfa_ = clean_idfa(idfa_)\n if idfa:\n idfa_set.add(idfa_) \n \n if imei_md5 != u'':\n imei_md5 = clean_imei(imei_md5)\n if imei_md5: # 有数据则为正常数据\n imei_md5_set.add(imei_md5)\n \n return idfa_set, imei_md5_set\n\ndef clean_csv_file(file_name):\n \"\"\"\n 清洗 csv 格式的数据\n \"\"\"\n idfa_set = set()\n imei_md5_set = set()\n \n with open(file_name, 'rb') as f:\n reader = csv.reader(f) \n # rows = list(reader) # todo: 可能存在内存问题\n # 直接迭代速度相对会比全部加进内存满\n for row in reader: #用一个for循环遍历所有的行数\n idfa = row[0]\n idfa_ = row[1]\n imei_md5 = row[4]\n\n if idfa != u'':\n idfa = clean_idfa(idfa)\n if idfa:\n idfa_set.add(idfa)\n if idfa_ != u'':\n idfa_ = clean_idfa(idfa_)\n if idfa:\n idfa_set.add(idfa_) \n\n if imei_md5 != u'':\n imei_md5 = clean_imei(imei_md5)\n if imei_md5:\n imei_md5_set.add(imei_md5)\n return idfa_set, imei_md5_set\n \n\ndef clean_file(file_name, output_dir):\n \"\"\"\n 清洗文件\n 目前支持两种格式的特定数据(csv|xlsx)\n \"\"\"\n _, file_extension = os.path.splitext(file_name) # 获取文件后缀\n base_name = os.path.splitext(os.path.basename(file_name))[0] # 获取文件名\n des_dir = os.path.join(output_dir, base_name) # 输出目录\n if not os.path.exists(des_dir):\n os.makedirs(des_dir)\n if file_extension == '.xlsx': # 如果是 xlsx 后缀的文件\n idfa_set, imei_md5_set = clean_excel_file(file_name)\n else:\n idfa_set, imei_md5_set = clean_csv_file(file_name)\n\n idfa_file_path = os.path.join(des_dir, 'idfa.txt')\n imei_file_path = os.path.join(des_dir, 'imei.txt')\n idfa_file = open(idfa_file_path, 'w')\n imei_file = open(imei_file_path, 'w')\n \n for imei in imei_md5_set:\n imei_file.write(imei)\n imei_file.write('\\n')\n imei_file.close() # 养成 close 文件的好习惯\n \n for idfa in idfa_set:\n try:\n idfa_file.write(idfa)\n idfa_file.write('\\n')\n except:\n print idfa\n idfa_file.close()\n \ndef clean_dir(source_dir):\n \"\"\"\n 清洗目录下的所有文件,目前支持一级目录\n 生成路径为: ./source_dir+cleaned/{file_name}/imei.txt 和 ./source_dir+cleaned/{file_name}/idfa.txt\n \"\"\"\n output_dir = os.path.abspath(os.path.join(os.path.join(source_dir, os.pardir), os.path.basename(os.path.normpath(source_dir)) + '-cleaned'))\n if not os.path.exists(output_dir):\n os.mkdir(output_dir)\n file_list = os.listdir(source_dir)\n full_file_path_list = [os.path.join(source_dir, file_path) for file_path in file_list]\n for file_path in full_file_path_list:\n clean_file(file_path, output_dir)",
- "execution_count": 3,
- "outputs": []
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2018-06-12T17:22:25.640025Z",
- "start_time": "2018-06-12T17:22:14.803565Z"
- },
- "trusted": true
- },
- "cell_type": "code",
- "source": "data_dir = '../data/'\n\ndir_ = os.path.join(data_dir, u'东风日产官网-人群包原文件-导出excel')\n\n# 清洗指定目录文件\nclean_dir(dir_)",
- "execution_count": 4,
- "outputs": []
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2018-06-12T17:22:33.010756Z",
- "start_time": "2018-06-12T17:22:27.475955Z"
- },
- "extensions": {
- "jupyter_dashboards": {
- "version": 1,
- "views": {
- "grid_default": {
- "hidden": true
- },
- "report_default": {}
- }
- }
- },
- "trusted": true
- },
- "cell_type": "code",
- "source": "\nfile_name = os.path.join(data_dir, u'奇骏、逍客、劲客、楼兰、天籁外的其他车型0308-0607.csv')\nout_put_dir = '../data/clean/'\n# 清洗指定文件\nclean_file(file_name, out_put_dir)",
- "execution_count": 5,
- "outputs": []
- },
- {
- "metadata": {
- "extensions": {
- "jupyter_dashboards": {
- "version": 1,
- "views": {
- "grid_default": {
- "hidden": true
- },
- "report_default": {}
- }
- }
- },
- "trusted": true
- },
- "cell_type": "code",
- "source": "",
- "execution_count": null,
- "outputs": []
- }
- ],
- "metadata": {
- "extensions": {
- "jupyter_dashboards": {
- "activeView": "grid_default",
- "version": 1,
- "views": {
- "grid_default": {
- "cellMargin": 10,
- "defaultCellHeight": 20,
- "maxColumns": 12,
- "name": "grid",
- "type": "grid"
- },
- "report_default": {
- "name": "report",
- "type": "report"
- }
- }
- }
- },
- "kernelspec": {
- "name": "python2",
- "display_name": "Python2",
- "language": "python"
- },
- "language_info": {
- "mimetype": "text/x-python",
- "nbconvert_exporter": "python",
- "name": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.15",
- "file_extension": ".py",
- "codemirror_mode": {
- "version": 2,
- "name": "ipython"
- }
- },
- "toc": {
- "nav_menu": {},
- "number_sections": true,
- "sideBar": true,
- "skip_h1_title": true,
- "base_numbering": 1,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": true,
- "toc_position": {
- "height": "calc(100% - 180px)",
- "left": "10px",
- "top": "150px",
- "width": "251px"
- },
- "toc_section_display": true,
- "toc_window_display": true
- },
- "gist": {
- "id": "",
- "data": {
- "description": "notebook/device_clean.ipynb",
- "public": true
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment