Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def load_pool(file, stop_size=None, *,
              data_dir=r'C:\Users\Anastasiya\Desktop\диплом', nrows=300000):
    """Load one of the tab-separated pool files into feature rows and labels.

    Each input line is read as five tab-separated columns named
    ``query``, ``factors``, ``urls``, ``target`` and ``clicks``.  A fixed
    number of leading characters is stripped from the first, second and
    fourth column (6, 8 and 7 characters respectively) before parsing.
    NOTE(review): presumably those are ``query=``, ``factors=`` and
    ``target=`` prefixes -- confirm against the data files.

    Rows are skipped when the stripped query is empty or when the number of
    whitespace-separated factors differs from that of the first row in the
    file.

    Args:
        file: Which data set to read; one of ``"pool"``, ``"common"``,
            ``"allowed"`` or ``"shuffled"``.
        stop_size: If not ``None``, stop after this many rows were accepted.
        data_dir: Directory that contains the data files (keyword-only).
        nrows: Maximum number of lines read from the file (keyword-only).

    Returns:
        A ``(data, target)`` tuple.  ``data`` is a list of rows of the form
        ``[query, factor_1, ..., factor_n]`` with factors converted to
        ``float``.  ``target`` is a parallel list holding ``0`` where the
        parsed target equals zero and ``1`` otherwise.

    Raises:
        ValueError: If ``file`` is not one of the known data-set names, or
            if an accepted row has a target/factor that cannot be parsed.
    """
    # Function-scope import so the block does not depend on the file's
    # (unseen) import section.
    import os

    filenames = {
        "pool": "sport_pool_20190305_20190307",
        "common": "out_common",
        "allowed": "out_allowed",
        "shuffled": "shuf",
    }
    if file not in filenames:
        # Previously this only printed a message and then crashed on an
        # unbound local variable; fail with an explicit error instead.
        raise ValueError(
            "filename error: unknown pool {!r}, expected one of {}".format(
                file, sorted(filenames)))

    df = pd.read_csv(os.path.join(data_dir, filenames[file]), delimiter='\t',
                     encoding='utf-8', nrows=nrows, low_memory=False,
                     names=['query', 'factors', 'urls', 'target', 'clicks'])

    rows = df.values  # materialise the array once instead of per use
    # The first row of the file defines how many factors a valid row has.
    len_of_facts = len(str(rows[0][1])[8:].split()) if len(rows) else 0

    cnt = 0
    cnt0 = 0
    cnt1 = 0
    data = []
    target = []
    for ex in rows:
        query = str(ex[0])[6:]
        if query == "":
            continue
        facts = str(ex[1])[8:].split()
        if len(facts) != len_of_facts:
            continue
        # Parse the target only for rows that passed the checks above, so a
        # truncated line (missing columns) is skipped rather than raising.
        targ = int(str(ex[3])[7:])
        data.append([query] + [float(value) for value in facts])
        if targ == 0:
            target.append(0)
            cnt0 += 1
        else:
            # Any non-zero target is collapsed into the positive class.
            target.append(1)
            cnt1 += 1
        cnt += 1
        if cnt % 5000 == 0:
            print("total: {}, 0: {}, 1: {}".format(cnt, cnt0, cnt1))
        if stop_size is not None and cnt >= stop_size:
            break
    print("data is loaded, cnt0 = {}, cnt1 = {}".format(cnt0, cnt1))
    return data, target
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement