Advertisement
Guest User

Untitled

a guest
Sep 17th, 2015
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.99 KB | None | 0 0
  1. #encoding:utf8
  2. import os
  3. from ast import literal_eval
  4. import re
  5. import MeCab
  6. import unicodedata
  7. import sys
  8. import ngram
  9.  
  10.  
  11. argvs = sys.argv # コマンドライン引数を格納したリストの取得
  12. argc = len(argvs) # 引数の個数
  13. # デバッグプリント
  14. print argvs[1]
  15. #print argc
  16. #ID = '0002'
  17. ID = str(argvs[1])
  18. files = os.listdir('../tcserv.nii.ac.jp/access/tomokiitoupcfax@gmail.com/832c5b059b15f647/nicocomm/data/thread/' + ID)
  19. thread = {}
  20. thread[ID] = {}
  21. kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"
  22. index = ngram.NGram(N=2)
  23. for nfile in files:
  24. filepass = ('../tcserv.nii.ac.jp/access/tomokiitoupcfax@gmail.com/832c5b059b15f647/nicocomm/data/thread/' + ID +'/' + str(nfile))
  25. f = open(filepass)
  26. lines2 = f.readlines() # 1行毎にファイル終端まで全て読む(改行文字も含まれる)
  27. #data1 = f.read() # ファイル終端まで全て読んだデータを返す
  28. f.close()
  29. Lines2 = {}
  30. count = 0
  31. for line in lines2:
  32. try:
  33. Lines2[count] = literal_eval(line)
  34. except:
  35. print line
  36. print count
  37. print nfile
  38. line = line.replace('null', '"null"')
  39. print line
  40. try:
  41. Lines2[count] = literal_eval(line)
  42. except:
  43. continue
  44.  
  45. try:
  46. Lines2[count]['comment'] = Lines2[count]['comment'].decode('unicode_escape')
  47. except:
  48. try:
  49. #print ("Eroor1" + Lines2[count]['comment'])
  50. Lines2[count]['comment'] = Lines2[count]['comment'][0:-1]
  51. except:
  52. print ("Eroor2" + line)
  53. #print Lines2[count]['comment']
  54. count += 1
  55. thread[ID][nfile] = Lines2
  56. #tagger = MeCab.Tagger( '-Owakati -u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
  57. #commentfiles = os.listdir('comment')
  58. for j in thread[ID].keys():
  59. filename = ("comment2_bigram" + ID + "/" + j[0:-3] +"txt")
  60. fo = file(filename,'w')
  61. print filename
  62. commenttext = ''
  63. for i in range(0,len(thread[ID][j])):
  64. if i > 20000:
  65. print i,j
  66. break
  67. commenttext += thread[ID][j][i]["comment"]
  68. try:
  69. thread[ID][j][i]["comment"] = unicodedata.normalize('NFKC', thread[ID][j][i]["comment"])
  70. except:
  71. print "normalize Eroor"
  72. pluscomment = str(thread[ID][j][i]["comment"].encode('utf-8'))
  73. pluscomment = pluscomment.replace("█", "")
  74. pluscomment = pluscomment.replace("□", "")
  75. pluscomment = pluscomment.replace("※", "")
  76. pluscomment = pluscomment.replace("∴", "")
  77. pluscomment = pluscomment.replace("*", "")
  78. pluscomment = pluscomment.replace("+", "")
  79. pluscomment = pluscomment.replace("・", "")
  80. pluscomment = pluscomment.replace("°", "")
  81. pluscomment = pluscomment.replace("w", "")
  82. pluscomment = pluscomment.replace("null", "")
  83. pluscomment = ((((pluscomment.replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")
  84. pluscomment = pluscomment.replace("\n", "")
  85. pluscomment = pluscomment.replace("\t", "")
  86. pluscomment = pluscomment.replace(" ", "")
  87. pluscomment = pluscomment.replace(" ", "")
  88. pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment)
  89. if pluscomment != '':
  90. #pluscomment = tagger.parse(pluscomment)
  91. for text in list(index.ngrams(index.pad(pluscomment.decode("utf-8")))):
  92. fo.write(text.encode("utf-8") + " ")
  93. #thread[ID][j]["comment"] =
  94. fo.write("\n")
  95. fo.close()
  96.  
  97. files = os.listdir('../tcserv.nii.ac.jp/access/tomokiitoupcfax@gmail.com/832c5b059b15f647/nicocomm/data/video')
  98. for nfile in files[1:2]:
  99. #print file
  100. nfile = (ID + ".dat")
  101. filepass = ('../tcserv.nii.ac.jp/access/tomokiitoupcfax@gmail.com/832c5b059b15f647/nicocomm/data/video/' + str(nfile))
  102. f = open(filepass)
  103. lines2 = f.readlines() # 1行毎にファイル終端まで全て読む(改行文字も含まれる)
  104. f.close()
  105. Lines2 = {}
  106. count = 0
  107.  
  108. for line in lines2:
  109. try:
  110. Lines2[count] = literal_eval(line)
  111. print Lines2[count]["video_id"], Lines2[count]["title"].decode('unicode_escape')
  112. thread[ID][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
  113. count += 1
  114. except:
  115. print line
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement