Advertisement
Guest User

Untitled

a guest
Jan 31st, 2015
203
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.94 KB | None | 0 0
  1. import pandas as pd
  2. import re, string
  3. import numpy as np
  4. from scipy.spatial.distance import pdist, squareform, euclidean
  5.  
  6. def firmmeans(data,year,key):
  7. d = data[data['bvdid'].isin(key['bvdid'][key['year']==year].tolist())]
  8. d[['year']] = d[['year']].astype(int)
  9. d = d[d['year']<=year]
  10. cols = [col for col in d.columns.values if col not in ['year','appln_id']]
  11. d = d[cols].groupby('bvdid',as_index=False)
  12. d = d.aggregate(np.mean)
  13. cols = [col for col in d.columns.values if col not in ['bvdid']]
  14. firms = d['bvdid'].tolist()
  15. d = d[cols].as_matrix()
  16. d = pdist(d,metric='euclidean')
  17. d = squareform(d)
  18. d = pd.DataFrame(d,columns=firms)
  19. d['bvdid'] = firms
  20. d = d.set_index('bvdid')
  21. return d
  22.  
  23. if __name__ == '__main__':
  24. pattern = re.compile('[\W_]+')
  25. df = pd.read_csv('bvd_small.csv')
  26. df = df[['bvd_id','year']]
  27. patents = pd.read_csv('labelledpredictions.txt',header=None)
  28. link = pd.read_csv('bvdid_patents_link.csv')
  29. date = pd.read_csv('appln_date.csv')
  30. new_columns = df.columns.values
  31. new_columns[0] = 'bvdid'
  32. df.columns = new_columns
  33. new_columns = patents.columns.values.tolist()
  34. new_columns[0] = 'appln_id'
  35. patents.columns = new_columns
  36. new_columns = link.columns.values
  37. new_columns[0] = 'bvdid'
  38. link.columns = new_columns
  39. link['bvdid'] = [pattern.sub('', x) for x in link['bvdid']]
  40. df['bvdid'] = [pattern.sub('', x) for x in df['bvdid']]
  41. date['appln_date'] = [x[-4:] for x in date['appln_date']]
  42. new_columns = date.columns.values
  43. new_columns[1] = 'year'
  44. date.columns = new_columns
  45. link = link[link['bvdid'].isin(df['bvdid'].unique().tolist())]
  46. patents = patents.dropna()
  47. patents['appln_id'] = patents[['appln_id']].astype(int)
  48. patents = patents[patents['appln_id'].isin(link['appln_id'].tolist())]
  49. data = patents.merge(date,on='appln_id',how='left')
  50. data = data.merge(link,on='appln_id',how='left')
  51. for i in df['year'].unique().tolist():
  52. a = firmmeans(data,i,df)
  53. year = str(i)
  54. name = 'firmtechdist'+year+'.csv'
  55. a.to_csv(path=str(name))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement