Advertisement
Guest User

Untitled

a guest
May 20th, 2019
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.15 KB | None | 0 0
  1. allowed_str = "0,1,2,3,4,5,6,7,8,9,10,17,20,21,27,56,57,58,59,60,61,62,63,64,65,66,69,70,71,72,73,74,75,79,80,81,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,149,150,151,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,223,224,225,226,227,228,229,230,231,232,233,237,238,239,240,243,244,249,250,251,252,256,257,268,269,271,272,273,274,275,276,277,278,279,280,281,282,317,318,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,409,412,413,468,469,528,529,530,531,532,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,557,558,559,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,639,640,641,642,643,652,653,654,655,657,658,659,660,661,662,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,690,691,692,693,694,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,836,837,838,839,840,841,853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,873,874,875,880,881,882,883,884,885,886,887,894,895,896,897,898,899,900,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,1005,1006,1007,1008,1009,1010,1026,1027,1030,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042,1043,1044,1047,1048,1049,1050,1051,1052,1053,1054,1055,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112"
  2. allowed_common_str = "0,1,2,3,4,5,6,7,8,9,63,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,337,338,339,340,341,342,343,344,345,346,369,395,397,398,399,540,541,542,543,544,545,546,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,664,666,667,670,672,674,675,678,735,736,737,738,739,740,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,961,962,1030,1031,1032,1033"
  3.  
  4. allowed = allowed_str.split(',')
  5. common = allowed_common_str.split(',')
  6.  
  7. all_f = []
  8. for i in range(1114):
  9.     all_f.append('{}'.format(i))
  10.  
  11.  
  12. WHOLE_POOL = True
  13. filter_fact = common
  14. N = 310_000
  15. in_name = 'tv_pool'
  16. out_name = './Common/' + in_name + '_parsed310k.tsv'
  17.  
  18. def pars(s: str):
  19.     res = {}
  20.     ss = s.split('\t')
  21.     for cur in ss:
  22.         cur_split = cur.split('=')
  23.         arg = cur_split[0]
  24.         val = cur_split[1]
  25.         if arg == 'factors':
  26.             res[arg] = val.split(' ')
  27.         if arg == 'reqid':
  28.             res[arg] = int(val)
  29.         if arg == 'query':
  30.             res[arg] = val
  31.         if arg == 'clicked':
  32.             if val[0] == 'f':
  33.                 res[arg] = 0
  34.             else:
  35.                 res[arg] = 1
  36.     return res
  37.  
  38.  
  39. def pars_to_csv(params: dict, factors):
  40.     res = params['query'] + '\t'
  41.     for f in factors:
  42.         res += params['factors'][int(f)] + '\t'
  43.     #for factor in params['factors']:
  44.     #    res += factor + '\t'
  45.     return res + str(params['clicked']) + '\n'
  46.  
  47.  
  48.  
  49.  
  50. file = open(in_name, 'r')
  51. out = open(out_name, 'w')
  52.  
  53.  
  54.  
  55.  
  56.  
  57. first = file.readline()
  58. data = pars(first)
  59. out.write('\"Query\"\t')
  60. for i in range(len(filter_fact)):
  61.     out.write('\"{0}\"\t'.format(i))
  62. out.write('\"Class\"\n')
  63. out.write(pars_to_csv(data, filter_fact))
  64.  
  65.  
  66. cnt = 0
  67. for line in file:
  68.     data = pars(line)
  69.  
  70.  
  71.     if cnt < N or WHOLE_POOL:
  72.         out.write(pars_to_csv(data, filter_fact))
  73.     #elif cnt < 11000:
  74.     #    out1.write(pars_to_csv(data, filter_fact))
  75.     #else:
  76.     #    break
  77.     else:
  78.         break
  79.  
  80. file.close()
  81. out.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement