Advertisement
Guest User

Untitled

a guest
Nov 26th, 2014
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.91 KB | None | 0 0
  1. from os import listdir
  2. from math import ceil
  3.  
  4. parts = listdir("wikiarticles/")
  5. partt = []
  6. for part in parts:
  7. partt.append(part)
  8. partt.sort()
  9. parts = []
  10.  
  11. article_id = 0
  12.  
  13. for part in partt:
  14. mkdir("chunks/"+part)
  15. folders = listdir("wikiarticles/" + part)
  16. folderr = []
  17. for folder in folders:
  18. folderr.append(folder)
  19. folderr.sort()
  20. folders = []
  21.  
  22. for folder in folderr:
  23. fout = open("chunks/"+part+"/"+folder+".json")
  24. files = listdir("wikiarticles/"+part+"/"+folder)
  25. filee = []
  26. for filename in files:
  27. filee.append(filename)
  28. filee.sort()
  29. files = []
  30.  
  31. for filename in filee:
  32. article_id += 1
  33. data = open("wikiarticles/"+part+"/"+folder+"/"+filename)
  34. size = len(data)
  35. chunks = ceil(len(data)/2500)
  36. for i in range(chunks):
  37. chunk['_id'] = float(str(article_id)+'.'+str(i+1))
  38. chunk['c'] = data[i*2500:(i+1)*2500]
  39. fout.write(str(chunk) + "\n")
  40.  
  41. fout.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement