Guest User

Untitled

a guest
Apr 10th, 2018
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.95 KB | None | 0 0
  1. #TODO Реализовать обвноление данных с использованием Ваниного скрипта автоматом.
  2. #!/usr/bin/python
  3. # -*- coding: utf-8 -*-
  4. import os
  5. import pandas as pd
  6. import psycopg2
  7. import requests
  8. import time
  9. import pickle
  10. import csv
  11. import logging
  12. from zipfile import ZipFile
  13. from urllib.request import urlopen
  14. from datetime import datetime, timedelta
  15. from binascii import b2a_base64
  16. from hashlib import sha1
  17. from hmac import new
  18. from io import BytesIO
  19. from urllib.parse import quote
  20. from sqlalchemy import create_engine
  21. from time import gmtime, strftime
  22.  
  23.  
  24. #SETTINGS
  25. path_to_paid_users_folder = r'/Users/a.eryomin/PycharmProjects/DataDownloader/onelink_paid_users'
  26. onelink_pure_data = r'/Users/a.eryomin/PycharmProjects/DataDownloader/onelink_pure_data'
  27.  
  28. # #Variables for Postgresql settings
  29. PG = {
  30. 'host': 'youlahdp-dev-statdb1.devmail.ru',
  31. 'user': 'stat',
  32. 'password': 'stat',
  33. 'database': 'statdb',
  34. }
  35.  
  36. db_connection = psycopg2.connect(dbname=PG['database'], user=PG['user'], password=PG['password'], host=PG['host'])
  37.  
  38.  
  39.  
  40. # #Query for gettinbg data from Postgresql
  41. # pg_query = '''
  42. # SELECT
  43. # y_users.user_id,
  44. # lower(y_users.user_adv_id) AS adv_user_id,
  45. # y_users.date_registered,
  46. # y_users.date_authorized,
  47. # y_users.push_token_current
  48. # FROM aeryomin.y_users AS y_users
  49. #
  50. # JOIN
  51. # (SELECT sender_id, press_chat_button, press_call_button, product_id FROM tmp_tables.y_user_user_counter)
  52. # AS y_usr_usr
  53. # ON y_users.user_id = y_usr_usr.sender_id
  54. # WHERE
  55. # date_trunc('day', TO_TIMESTAMP(y_users.date_registered)) >= date_trunc('day', now()) - INTERVAL '7 day'
  56. # AND (y_usr_usr.press_call_button IS NOT NULL
  57. # OR y_usr_usr.press_chat_button IS NOT NULL
  58. # OR y_usr_usr.product_id IS NOT NULL)
  59. # AND (y_users.platform = 'android'
  60. # OR y_users.platform = 'ios')
  61. # ;
  62. # '''
  63. #
  64. # updated_query = """`
  65. # SELECT
  66. # y_users.user_id,
  67. # lower(y_users.user_adv_id) AS adv_user_id,
  68. # y_users.date_registered,
  69. # y_users.date_authorized,
  70. # y_users.push_token_current
  71. #
  72. # FROM aeryomin.y_users AS y_users
  73. # WHERE
  74. # date_trunc('day', TO_TIMESTAMP(y_users.date_registered)) >= date_trunc('day', now()) - INTERVAL '7 day'
  75. # AND y_users.user_id NOT IN (SELECT sender_id FROM aeryomin.y_user_user_counter)
  76. # AND (y_users.platform = 'android'
  77. # OR y_users.platform = 'ios')
  78. # ;
  79. # """
  80. #
  81. # paid_user_id.to_sql('aeryomin.from_1link', engine)
  82. # cur = db_connection.cursor()
  83. # cur.execute(updated_query)
  84. # y_users_df = pd.DataFrame(cur.fetchall())
  85.  
  86.  
  87. def one_link_downloader():
  88. logging.info("One Link downloader has been started.")
  89. projects = {
  90. '100260': 'ios',
  91. '101318': 'android'
  92. }
  93. now = str(datetime.now().date())
  94. dates = [str((datetime.now() - timedelta(days=i)).date()) for i in range(1, 10)]
  95. result_projects = []
  96. getliststart_idfa_idAd_customEvent_1_8_9 = "http://1l-api.mail.ru/v1/profile/getliststart.json?" + \
  97. "idProject={idProject}&" + \
  98. "fields%5B%5D=idfa&" + \
  99. "fields%5B%5D=tsLastCustomEventRepeatable2&" + \
  100. "fields%5B%5D=tsLastCustomEventRepeatable13&" + \
  101. "fields%5B%5D=tsLastCustomEventRepeatable15&" + \
  102. "fields%5B%5D=idAd&" + \
  103. "fields%5B%5D=idUser&" + \
  104. "fields%5B%5D=tsReg&" + \
  105. "filters%5BtsReg%5D%5Bbetween%5D%5Bstart%5D={date1}&" + \
  106. "filters%5BtsReg%5D%5Bbetween%5D%5Bend%5D={date2}"
  107.  
  108. getliststart = 'http://1l-api.mail.ru/v1/profile/getliststart.json?idProject={idProject}&fields=*'
  109. getliststatus = "http://1l-api.mail.ru/v1/profile/getliststatus.json?uidReport={uidReport}"
  110. get_campaign_name = 'http://1l-api.mail.ru/v1/campaign/gettitle.json?id={id}'
  111.  
  112. def get_info(url, params={}):
  113. def get_signature(access_id, private_key, url, method='GET', post_data=None):
  114. method = method.upper()
  115. # post_data must be in URL-encoded query string format
  116. data = post_data if post_data else ''
  117. string = '%s&%s&%s' % (
  118. method,
  119. quote(url, safe='~'),
  120. quote(data, safe='~')
  121. )
  122. key_bytes = bytes(private_key, 'latin-1')
  123.  
  124. signature = b2a_base64(
  125. new(key_bytes, string.encode(), sha1).digest()
  126. ).decode().rstrip('\n')
  127. return 'AuthHMAC %s:%s' % (access_id, signature)
  128.  
  129. url = url.format(**params)
  130. sign = get_signature('a.eryomin', 'y;+DmDRwf64Yb}2bnYZ', url)
  131. return requests.get(url, headers={'Authorization': sign}).json()
  132.  
  133. def get_report(link):
  134. url = urlopen(link)
  135. print(type(url))
  136. print(url)
  137. zipfile = ZipFile(BytesIO(url.read()))
  138. f = open(list(zipfile.NameToInfo.keys())[0], 'wb')
  139. f.write(zipfile.open(list(zipfile.NameToInfo.keys())[0]).read())
  140. f.close()
  141. return pd.read_csv(list(zipfile.NameToInfo.keys())[0], sep=';')
  142.  
  143. now = str(datetime.now().date())
  144. dates = [str((datetime.now() - timedelta(days=i)).date()) for i in range(1, 8)]
  145.  
  146. result_projects = []
  147.  
  148. for project_id in projects:
  149. print(project_id, dates)
  150. print(get_info(getliststart_idfa_idAd_customEvent_1_8_9,
  151. {'date1': min(dates), 'date2': max(dates), 'idProject': project_id}))
  152. result_projects.append(
  153. get_info(getliststart_idfa_idAd_customEvent_1_8_9,
  154. {'date1': min(dates), 'date2': max(dates), 'idProject': project_id})['data']['uidReport']
  155. )
  156.  
  157. links = []
  158. for project_id in result_projects:
  159. link = get_info(getliststatus, {'uidReport': project_id})['data'].get('link')
  160. if link is not None:
  161. links.append(link)
  162.  
  163. while True:
  164. links = []
  165. for project_id in result_projects:
  166. link = get_info(getliststatus, {'uidReport': project_id})['data'].get('link')
  167. if link is not None:
  168. links.append(link)
  169. if len(links) == len(result_projects):
  170. break
  171. time.sleep(6 * 10)
  172. print('-')
  173.  
  174. dfs = []
  175. for link, params in zip(links, list(projects.values())):
  176. df = get_report(link)
  177. df.columns = ['idfa', 'goal1', 'goal8', 'goal9', 'campaign', 'id_user', 'date_reg']
  178. df['platform'] = [params] * df.shape[0]
  179. df['date_check'] = [now] * df.shape[0]
  180. dfs.append(df)
  181.  
  182. result = pd.concat(dfs)
  183. result['campaign'] = result['campaign'].astype(str)
  184. need_campaign = set(map(lambda x: x, result['campaign']))
  185.  
  186. campaign = pickle.load(open('campaign.pckl', 'rb'))
  187. for i in need_campaign:
  188. if i not in set(campaign.keys()):
  189. resp = get_info(get_campaign_name, {'id': i})['data']
  190. if type(resp) == list and len(resp) == 1:
  191. campaign = dict({i: resp[0]}, **campaign)
  192. else:
  193. campaign = dict(resp, **campaign)
  194. pickle.dump(campaign, open('campaign.pckl', 'wb'), protocol=0)
  195.  
  196. df_campaign = pd.DataFrame(list(campaign.items()))
  197. df_campaign.columns = ['campaign_id', 'campaign_name']
  198. result = pd.merge(result, df_campaign, how='outer', left_on='campaign', right_on='campaign_id')
  199. result['campaign_name'] = result['campaign_name'].map(lambda x: x.replace('\t', ' '))
  200. result['goal1'] = pd.to_datetime(result['goal1'] * 1000000000)
  201. result['goal8'] = pd.to_datetime(result['goal8'] * 1000000000)
  202. result['goal9'] = pd.to_datetime(result['goal9'] * 1000000000)
  203. result['date_reg'] = pd.to_datetime(result['date_reg'] * 1000000000)
  204. result['date_reg'] = result['date_reg'].map(lambda x: str(x.date()))
  205. result = result[
  206. ['idfa', 'goal1', 'goal8', 'goal9', 'campaign', 'id_user',
  207. 'platform', 'date_reg', 'date_check', 'campaign_id',
  208. 'campaign_name']
  209. ]
  210. result.to_csv(os.path.join(onelink_pure_data, 'onelink_pure_data_{}.csv'.format(now)), encoding='utf-8', sep='\t', index=False, header=False)
  211.  
  212.  
  213. def get_only_paid_traffic():
  214. path_to_csv = os.path.join(onelink_pure_data, 'onelink_pure_data_{}.csv'.format(str(datetime.now().date())))
  215. date_from_file = pd.read_csv(path_to_csv, sep='\t')
  216. df = pd.DataFrame(date_from_file)
  217. paid_df = df[df.iloc[:, 10] != 'Organic: Empty tracking id']
  218. paid_user_id = paid_df.iloc[:, 0]
  219. #Rename old csv file
  220. os.rename(os.path.join(path_to_paid_users_folder, 'paid_users.csv'),
  221. os.path.join(path_to_paid_users_folder, 'paid_users_{0}.csv'.format(strftime("%Y-%m-%d", gmtime()))))
  222. #Store data into paid_users.csv
  223. paid_user_id.to_csv(os.path.join(path_to_paid_users_folder, 'paid_users.csv'), header=False, index=False)
  224.  
  225.  
  226. def add_one_link_to_pg():
  227. #Drop table in PG
  228. drop_query = '''
  229. DROP TABLE aeryomin.onelink_paid_users;
  230. '''
  231. create_query = '''
  232. CREATE TABLE aeryomin.onelink_paid_users (device_id VARCHAR)
  233. '''
  234. db_connection = psycopg2.connect(dbname=PG['database'], user=PG['user'], password=PG['password'], host=PG['host'])
  235. drop_cursor = db_connection.cursor()
  236. drop_cursor.execute(drop_query)
  237. db_connection.commit()
  238. create_cursor = db_connection.cursor()
  239. create_cursor.execute(create_query)
  240. db_connection.commit()
  241. insert_cursor = db_connection.cursor()
  242. with open(os.path.join(path_to_paid_users_folder, 'paid_users.csv'), 'r') as f:
  243. next(f)
  244. insert_cursor.copy_from(f, 'aeryomin.onelink_paid_users')
  245. db_connection.commit()
  246. db_connection.close()
  247.  
  248.  
  249. def create_tmp_users_table():
  250. #Getting users from y_users by condition.
  251. pg_select_query = '''
  252. SELECT
  253. users.user_id
  254. date_trunc('day', TO_TIMESTAMP(users.date_registered)),
  255. FROM aeryomin.y_users as users
  256. WHERE
  257. upper(user_adv_id) IN (SELECT * FROM aeryomin.onelink_paid_users)
  258. AND date_trunc('day', TO_TIMESTAMP(users.date_registered)) >= date_trunc('day', now()) - INTERVAL '7 day'
  259. AND
  260. GROUP BY
  261. date_trunc('day', TO_TIMESTAMP(users.date_registered)),
  262. users.user_id
  263. ;
  264. '''
  265. cur = db_connection.cursor()
  266. cur.execute(pg_select_query)
  267. y_users_df = pd.DataFrame(cur.fetchall())
  268. y_users_df.to_csv('final_users_.csv', header=False, index=False)
  269.  
  270.  
  271. if __name__ == '__main__':
  272. #Выкачивание данных из 1линка. По результату работы будет сохранён файл .csv с выгрузкой.
  273. one_link_downloader()
  274. add_one_link_to_pg()
  275. get_only_paid_traffic()
  276. create_tmp_users_table()
Add Comment
Please, Sign In to add comment