Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import re, string
- import numpy as np
- from scipy.spatial.distance import pdist, squareform, euclidean
def firmmeans(data, year, key):
    """Return pairwise Euclidean distances between firms' mean feature vectors.

    Rows of ``data`` are kept when their ``bvdid`` is listed in ``key`` for
    ``year`` and their own ``year`` is not later than ``year``.  Every
    remaining numeric column except ``year`` and ``appln_id`` is averaged per
    ``bvdid``, and the distance between each pair of firm averages is
    computed.

    Args:
        data: DataFrame with ``bvdid`` and ``year`` columns plus the feature
            columns to average.  ``year`` must be castable to ``int``.
        year: Inclusive cut-off year; also selects the firms from ``key``.
        key: DataFrame with ``bvdid`` and ``year`` columns.

    Returns:
        Square DataFrame of distances whose index (named ``bvdid``) and
        columns are both the firm ids, in the order produced by the group-by.
        An empty DataFrame (index named ``bvdid``) when no firm qualifies.
    """
    wanted = key.loc[key['year'] == year, 'bvdid']

    # Copy so the cast below writes to our own frame rather than to a
    # filtered slice of the caller's ``data``.
    d = data[data['bvdid'].isin(wanted)].copy()
    d['year'] = d['year'].astype(int)
    d = d[d['year'] <= year]

    feature_cols = [c for c in d.columns if c not in ('year', 'appln_id')]
    # numeric_only=True skips non-numeric columns instead of raising on them.
    means = d[feature_cols].groupby('bvdid', as_index=False).mean(
        numeric_only=True)

    firms = means['bvdid'].tolist()
    index = pd.Index(firms, name='bvdid')
    if not firms:
        # squareform() of an empty distance vector is a 1x1 matrix, which
        # cannot be labelled with zero firms; return an empty frame instead.
        return pd.DataFrame(index=index)

    # ``.values`` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    vectors = means.drop(columns='bvdid').values
    distances = squareform(pdist(vectors, metric='euclidean'))
    return pd.DataFrame(distances, index=index, columns=firms)
if __name__ == '__main__':
    # Build one firm-to-firm distance table per year found in bvd_small.csv
    # and write each to 'firmtechdist<year>.csv'.

    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # The pattern removes every run of non-alphanumeric characters and
    # underscores so firm ids from different files compare equal.
    pattern = re.compile(r'[\W_]+')

    # rename() returns a new frame, so later column assignments do not write
    # into a slice of the frame returned by read_csv.
    df = pd.read_csv('bvd_small.csv')
    df = df[['bvd_id', 'year']].rename(columns={'bvd_id': 'bvdid'})

    patents = pd.read_csv('labelledpredictions.txt', header=None)
    link = pd.read_csv('bvdid_patents_link.csv')
    date = pd.read_csv('appln_date.csv')

    # The files are relabelled by position: the first column of the patents
    # file becomes 'appln_id' and the first column of the link file becomes
    # 'bvdid'.  rename() is used instead of editing ``columns.values`` in
    # place, which mutates the Index's backing array.
    patents = patents.rename(columns={patents.columns[0]: 'appln_id'})
    link = link.rename(columns={link.columns[0]: 'bvdid'})

    link['bvdid'] = [pattern.sub('', x) for x in link['bvdid']]
    df['bvdid'] = [pattern.sub('', x) for x in df['bvdid']]

    # Keep only the last four characters of each application date, then label
    # the second column 'year'.
    # NOTE(review): presumably the second column is 'appln_date' and the
    # dates end in a four-digit year -- confirm against appln_date.csv.
    date['appln_date'] = [x[-4:] for x in date['appln_date']]
    date = date.rename(columns={date.columns[1]: 'year'})

    # Restrict the link table to firms present in df, and the patents to
    # applications that are linked to one of those firms.
    link = link[link['bvdid'].isin(df['bvdid'].unique())]
    patents = patents.dropna()
    patents['appln_id'] = patents['appln_id'].astype(int)
    patents = patents[patents['appln_id'].isin(link['appln_id'])]

    data = patents.merge(date, on='appln_id', how='left')
    data = data.merge(link, on='appln_id', how='left')

    for i in df['year'].unique().tolist():
        a = firmmeans(data, i, df)
        name = 'firmtechdist' + str(i) + '.csv'
        # Pass the path positionally: DataFrame.to_csv names this parameter
        # ``path_or_buf``, so the keyword ``path=`` is not accepted.
        a.to_csv(name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement