Advertisement
Guest User

Untitled

a guest
Jul 23rd, 2019
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.60 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3.  
  4. '''
  5. edgelistとlabellistを0始まりのIDに変換する
  6. '''
  7.  
  8. import os
  9. import sys
  10. #from snlocest.largedict import LargeDict
  11.  
  12.  
  13. def parse_args():
  14. import argparse
  15. parser = argparse.ArgumentParser()
  16. parser.add_argument('mode', help='処理のモード')
  17. parser.add_argument('--tablepath', required=True)
  18. parser.add_argument('inputfiles', nargs='+')
  19. return parser.parse_args()
  20.  
  21.  
  22. def load_table(filepath):
  23. user2id = dict() #LargeDict()
  24. with open(filepath, 'r') as fd:
  25. for line in fd:
  26. user_id, idx = line.rstrip().split('\t')
  27. user2id[user_id] = int(idx)
  28. idcnt = max(user2id.values()) + 1
  29. return user2id, idcnt
  30.  
  31.  
  32. if __name__ == '__main__':
  33. args = parse_args()
  34.  
  35. if args.mode == 'table':
  36. # Generate convert table from inputfiles and save to tablepath
  37. if os.path.exists(args.tablepath):
  38. print('Table path already exists', file=sys.stderr)
  39. sys.exit(1)
  40. # create table
  41. user2id = dict() #LargeDict()
  42. idcnt = 0
  43. for filepath in args.inputfiles:
  44. with open(filepath, 'r') as inputfile:
  45. for line in inputfile:
  46. tokens = line.rstrip().split('\t')
  47. user_id = tokens[0]
  48. if user_id not in user2id:
  49. user2id[user_id] = idcnt
  50. idcnt += 1
  51. user_id = tokens[1]
  52. if user_id not in user2id:
  53. user2id[user_id] = idcnt
  54. idcnt += 1
  55. # save table
  56. with open(args.tablepath, 'w') as fd:
  57. for k, v in user2id.items():
  58. print(k, v, sep='\t', file=fd)
  59.  
  60. elif args.mode == 'edgelist':
  61. # Convert edgelist using table
  62. user2id, idcnt = load_table(args.tablepath)
  63.  
  64. for filepath in args.inputfiles:
  65. with open(filepath, 'r') as fd:
  66. for line in fd:
  67. row = line.rstrip().split('\t')
  68. src = row[0]
  69. dst = row[1]
  70. print(user2id[src], user2id[dst], *row[2:], sep='\t')
  71.  
  72. elif args.mode == 'label':
  73. # Convert label file
  74. user2id, idcnt = load_table(args.tablepath)
  75.  
  76. for filepath in args.inputfiles:
  77. with open(filepath, 'r') as fd:
  78. for line in fd:
  79. row = line.rstrip().split('\t')
  80. label = row[0]
  81. print(user2id[label], *row[1:], sep='\t')
  82.  
  83. else:
  84. print('Invalid mode. Choose "table" or "edgelist" or "label"', file=sys.stderr)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement