Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Comma-separated lists of factor indices. The literals are written as
# adjacent string pieces (joined by the parser into one string) so that no
# number is ever split across a physical line.
# NOTE(review): `allowed` is not used anywhere in the visible script;
# presumably it is an alternative value for `filter_fact` -- confirm.
allowed_str = (
    "0,1,2,3,4,5,6,7,8,9,10,17,20,21,27,56,57,58,59,60,61,62,63,64,65,66,"
    "69,70,71,72,73,74,75,79,80,81,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,"
    "100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,"
    "120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,"
    "140,141,142,143,149,150,151,153,154,155,156,157,158,159,"
    "160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,"
    "186,187,188,189,190,191,192,193,194,195,196,197,198,199,"
    "200,201,202,203,204,205,206,207,223,224,225,226,227,228,229,230,231,232,233,"
    "237,238,239,240,243,244,249,250,251,252,256,257,268,269,"
    "271,272,273,274,275,276,277,278,279,280,281,282,317,318,"
    "337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,"
    "355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,"
    "375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,"
    "395,396,397,398,399,400,401,409,412,413,468,469,528,529,530,531,532,"
    "538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,557,558,559,"
    "567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,"
    "587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,"
    "607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,"
    "639,640,641,642,643,652,653,654,655,657,658,659,660,661,662,"
    "664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,"
    "684,685,686,687,688,689,690,691,692,693,694,"
    "705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,"
    "725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,"
    "745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,"
    "765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,"
    "785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,"
    "805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,"
    "825,826,827,828,829,830,831,832,833,834,836,837,838,839,840,841,"
    "853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,"
    "873,874,875,880,881,882,883,884,885,886,887,894,895,896,897,898,899,900,"
    "902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,"
    "921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,"
    "951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,"
    "971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,"
    "1005,1006,1007,1008,1009,1010,1026,1027,"
    "1030,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042,1043,1044,"
    "1047,1048,1049,1050,1051,1052,1053,1054,1055,"
    "1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,"
    "1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,"
    "1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112"
)
allowed_common_str = (
    "0,1,2,3,4,5,6,7,8,9,63,"
    "154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,"
    "174,175,176,177,"
    "187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,"
    "337,338,339,340,341,342,343,344,345,346,369,395,397,398,399,"
    "540,541,542,543,544,545,546,"
    "567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,"
    "587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,"
    "607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,"
    "664,666,667,670,672,674,675,678,"
    "735,736,737,738,739,740,"
    "784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,"
    "804,805,806,807,808,809,810,811,812,813,814,815,"
    "961,962,1030,1031,1032,1033"
)

# The same lists as lists of decimal strings.
allowed = allowed_str.split(',')
common = allowed_common_str.split(',')

# Every index 0..1113 as a decimal string.
# NOTE(review): presumably "select all factors" for a 1114-factor pool -- confirm.
all_f = [str(i) for i in range(1114)]

# When True the whole input is converted and the row limit N is bypassed.
WHOLE_POOL = True

# Factor indices (as strings) written to the output, in column order.
filter_fact = common

# Row limit; only consulted when WHOLE_POOL is False.
N = 310_000

# Input file path and the TSV path derived from it.
in_name = 'tv_pool'
out_name = './Common/' + in_name + '_parsed310k.tsv'
def pars(s: str):
    """Parse one tab-separated record of ``key=value`` fields into a dict.

    Only four keys are recognised; every other field is ignored:

    * ``factors`` -> list of strings (the value split on single spaces)
    * ``reqid``   -> int
    * ``query``   -> str, kept verbatim
    * ``clicked`` -> 0 if the value starts with ``'f'``, otherwise 1

    A trailing line terminator is removed first so it cannot leak into the
    last field's value. Fields without ``=`` are skipped, so a blank line
    yields an empty dict.

    Raises:
        ValueError: if ``reqid`` is not an integer or ``clicked`` is empty.
    """
    res = {}
    for field in s.rstrip('\r\n').split('\t'):
        # Split on the FIRST '=' only: a value (e.g. a query) may contain '='.
        arg, sep, val = field.partition('=')
        if not sep:
            # No '=' at all -- it cannot be one of the recognised keys.
            continue
        if arg == 'factors':
            res[arg] = val.split(' ')
        elif arg == 'reqid':
            res[arg] = int(val)
        elif arg == 'query':
            res[arg] = val
        elif arg == 'clicked':
            if not val:
                raise ValueError("empty 'clicked' value in record")
            res[arg] = 0 if val[0] == 'f' else 1
    return res
def pars_to_csv(params: dict, factors):
    """Format one parsed record as a tab-separated output line.

    Args:
        params: dict with ``'query'`` (str), ``'factors'`` (sequence of str)
            and ``'clicked'`` (converted with ``str``).
        factors: iterable of indices into ``params['factors']``; each item is
            passed through ``int``, so decimal strings and ints both work.

    Returns:
        ``query``, the selected factor values in the order given by
        ``factors``, then ``clicked``, joined by tabs and ending in a newline.

    Raises:
        KeyError: if a required key is missing from ``params``.
        IndexError: if an index is out of range for ``params['factors']``.
    """
    row = [params['query']]
    row.extend(params['factors'][int(f)] for f in factors)
    row.append(str(params['clicked']))
    # One join instead of repeated `+=` on a growing string.
    return '\t'.join(row) + '\n'
# Convert the pool file `in_name` into the TSV `out_name`: one header line,
# then one line per input record holding the query, the factors selected by
# `filter_fact`, and the click label.
# The `with` block closes both files even if parsing a record raises.
with open(in_name, 'r') as file, open(out_name, 'w') as out:
    # Header: the factor columns are labelled by their position in
    # `filter_fact` (0, 1, 2, ...), not by the factor index itself.
    header = ['"Query"']
    header.extend('"{0}"'.format(i) for i in range(len(filter_fact)))
    header.append('"Class"')
    out.write('\t'.join(header) + '\n')

    cnt = 0  # number of data rows written so far
    for line in file:
        # Unless the whole pool is requested, stop after N rows in total.
        if not WHOLE_POOL and cnt >= N:
            break
        if not line.strip():
            # A blank line (e.g. at the end of the file) carries no record.
            continue
        data = pars(line)
        out.write(pars_to_csv(data, filter_fact))
        cnt += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement