Spark wholeTextFiles API issue

Jul 5th, 2017
2017-07-05 12:27:56,372 [task-result-getter-0] WARN TaskSetManager - Lost task 0.0 in stage 0.0 (TID 0, 172.18.26.77, executor 0): java.lang.OutOfMemoryError: Requested array size exceeds VM limit
    at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:300)
    at java.lang.StringCoding.encode(StringCoding.java:344)
    at java.lang.String.getBytes(String.java:918)
    at org.sfsu.spark.CluewebReader$.getWarcRecordsFromString(CluewebReader.scala:315)
    at org.sfsu.spark.CluewebReader$$anonfun$getWarcRecordsFromDirectory$1.apply(CluewebReader.scala:305)
    at org.sfsu.spark.CluewebReader$$anonfun$getWarcRecordsFromDirectory$1.apply(CluewebReader.scala:305)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:461)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215)
    at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1005)
    at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:996)
    at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:936)
    at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:996)
    at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:700)
    at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:99)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

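The failure above is not ordinary heap exhaustion: "Requested array size exceeds VM limit" means a single allocation request exceeded the JVM's Int-indexed array ceiling (just under 2^31 elements). The trace shows getWarcRecordsFromString calling String.getBytes on a whole file held as one String; before encoding anything, the JDK pre-allocates chars * maxBytesPerChar bytes (a factor of 3 for UTF-8), so a file of roughly 700M+ characters already requests an illegal array no matter how large the executor heap is. Below is a minimal Scala sketch of that arithmetic (the 800M-char figure is illustrative, and UTF-8 as the platform default charset is an assumption):

import java.nio.charset.StandardCharsets

// Rough check of why String.getBytes can exceed the VM's array limit.
object GetBytesLimitSketch {
  // JVM arrays are Int-indexed; HotSpot's practical ceiling is slightly
  // under Integer.MAX_VALUE elements.
  val MaxArrayBytes: Long = Int.MaxValue - 8L

  def main(args: Array[String]): Unit = {
    // getBytes pre-allocates chars * maxBytesPerChar bytes before encoding
    // a single byte; for UTF-8 the encoder reports 3.0 bytes per char.
    val bytesPerChar = StandardCharsets.UTF_8.newEncoder().maxBytesPerChar()
    val chars = 800L * 1000 * 1000 // one ~800M-char file as a single String
    val requested = math.ceil(chars.toDouble * bytesPerChar).toLong
    println(s"getBytes would request $requested bytes; " +
      s"array ceiling is ~$MaxArrayBytes, so overflow = ${requested > MaxArrayBytes}")
  }
}

Because the request itself is illegal, adding executor memory cannot help, which is why every retry below dies the same way.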
2017-07-05 12:27:57,144 [dispatcher-event-loop-1] ERROR TaskSchedulerImpl - Lost executor 0 on 172.18.26.77: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:27:57,150 [dispatcher-event-loop-1] WARN TaskSetManager - Lost task 0.1 in stage 0.0 (TID 1, 172.18.26.77, executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:28:09,245 [task-result-getter-1] WARN TaskSetManager - Lost task 0.2 in stage 0.0 (TID 2, 172.18.26.77, executor 1): java.lang.OutOfMemoryError: Requested array size exceeds VM limit
    [stack trace identical to the OutOfMemoryError trace for TID 0 above]

2017-07-05 12:28:10,199 [dispatcher-event-loop-3] ERROR TaskSchedulerImpl - Lost executor 1 on 172.18.26.77: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:28:10,200 [dispatcher-event-loop-3] WARN TaskSetManager - Lost task 0.3 in stage 0.0 (TID 3, 172.18.26.77, executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:28:28,962 [task-result-getter-2] WARN TaskSetManager - Lost task 0.4 in stage 0.0 (TID 4, 172.18.26.77, executor 2): java.lang.OutOfMemoryError: Requested array size exceeds VM limit
    [stack trace identical to the OutOfMemoryError trace for TID 0 above]

2017-07-05 12:28:30,366 [dispatcher-event-loop-1] ERROR TaskSchedulerImpl - Lost executor 2 on 172.18.26.77: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:28:30,366 [dispatcher-event-loop-1] WARN TaskSetManager - Lost task 0.5 in stage 0.0 (TID 5, 172.18.26.77, executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:28:46,695 [task-result-getter-3] WARN TaskSetManager - Lost task 0.6 in stage 0.0 (TID 6, 172.18.26.77, executor 3): java.lang.OutOfMemoryError: Requested array size exceeds VM limit
    [stack trace identical to the OutOfMemoryError trace for TID 0 above]

2017-07-05 12:28:47,510 [dispatcher-event-loop-2] ERROR TaskSchedulerImpl - Lost executor 3 on 172.18.26.77: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:28:47,510 [dispatcher-event-loop-2] WARN TaskSetManager - Lost task 0.7 in stage 0.0 (TID 7, 172.18.26.77, executor 3): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:29:01,034 [task-result-getter-0] WARN TaskSetManager - Lost task 0.8 in stage 0.0 (TID 8, 172.18.26.77, executor 4): java.lang.OutOfMemoryError: Requested array size exceeds VM limit
    [stack trace identical to the OutOfMemoryError trace for TID 0 above]

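Every retry lands on a fresh executor, hits the same deterministic OutOfMemoryError at CluewebReader.scala:315, and takes that executor down with it. The task is attempted ten times before the scheduler aborts below, which suggests spark.task.maxFailures was raised from its default of 4; for a deterministic failure like this, a lower budget fails fast instead of burning five executors. A sketch, assuming the job constructs its own SparkConf (the app name is a placeholder):

import org.apache.spark.{SparkConf, SparkContext}

// Deterministic per-partition failures gain nothing from extra retries;
// keep the default budget so the root cause surfaces quickly.
val conf = new SparkConf()
  .setAppName("clueweb-kmeans-sketch") // placeholder name
  .set("spark.task.maxFailures", "4")  // Spark's default; the log implies 10
val sc = new SparkContext(conf)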
2017-07-05 12:29:01,849 [dispatcher-event-loop-6] ERROR TaskSchedulerImpl - Lost executor 4 on 172.18.26.77: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:29:01,849 [dispatcher-event-loop-6] WARN TaskSetManager - Lost task 0.9 in stage 0.0 (TID 9, 172.18.26.77, executor 4): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2017-07-05 12:29:01,850 [dispatcher-event-loop-6] ERROR TaskSetManager - Task 0 in stage 0.0 failed 10 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 10 times, most recent failure: Lost task 0.9 in stage 0.0 (TID 9, 172.18.26.77, executor 4): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
    at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
    at org.sfsu.spark.CluewebReader$.kmeanForClueWeb09WarcFiles(CluewebReader.scala:98)
    at org.sfsu.spark.SClusterDocsIndexToSolr.run(SClusterDocsIndexToSolr.scala:175)
    at com.lucidworks.spark.SparkApp.main(SparkApp.java:83)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:743)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
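
Per the paste title, the job evidently reads whole ClueWeb09 WARC files with SparkContext.wholeTextFiles, which returns each file as a single (path, content) String and therefore cannot survive files whose encoded size approaches the array ceiling. One workaround is sc.binaryFiles, which hands each task a lazy PortableDataStream so records can be parsed incrementally. A minimal sketch, assuming uncompressed UTF-8 WARC text and record-at-a-time parsing (parseWarcRecords, the input path, and the header-line filter are placeholders, not the CluewebReader code from the trace; gzipped .warc.gz inputs would additionally need a GZIPInputStream wrapper):

import java.io.{BufferedReader, InputStreamReader}
import java.nio.charset.StandardCharsets
import org.apache.spark.{SparkConf, SparkContext}

object WarcStreamingSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("warc-streaming-sketch"))

    // binaryFiles gives each task a lazy stream per file instead of one
    // fully materialized String, so no single array must hold a whole file.
    val records = sc.binaryFiles("hdfs:///data/clueweb09/*.warc")
      .flatMap { case (_, stream) =>
        val reader = new BufferedReader(
          new InputStreamReader(stream.open(), StandardCharsets.UTF_8))
        parseWarcRecords(reader)
      }

    println(s"records: ${records.count()}")
    sc.stop()
  }

  // Placeholder parser: emits WARC version lines one at a time, so memory
  // is bounded by the largest record rather than the largest file.
  def parseWarcRecords(reader: BufferedReader): Iterator[String] =
    Iterator.continually(reader.readLine())
      .takeWhile(_ != null)
      .filter(_.startsWith("WARC/"))
}

Another common route is sc.newAPIHadoopFile with a WARC-aware InputFormat; either way, the fix is to stop forcing an entire archive through one String and one getBytes call.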