Advertisement
Guest User

Untitled

a guest
Oct 24th, 2016
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.79 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "metadata": {},
  5. "cell_type": "markdown",
  6. "source": "## PDF preparation (create \"sub\" PDF with required information/pages, here ListOfContents.pdf)\nFor copyright reasons, only the table of contents is included."
  7. },
  8. {
  9. "metadata": {
  10. "collapsed": false,
  11. "trusted": true
  12. },
  13. "cell_type": "code",
  14. "source": "#pdftk Indonesien_Motive.pdf cat 5-12 output ListOfContents.pdf\nclass PDF(object):\n def __init__(self, pdf, size=(200,200)):\n self.pdf = pdf\n self.size = size\n\n def _repr_html_(self):\n return '<iframe src={0} width={1[0]} height={1[1]}></iframe>'.format(self.pdf, self.size)\n\n def _repr_latex_(self):\n return r'\\includegraphics[width=1.0\\textwidth]{{{0}}}'.format(self.pdf)\nPDF('ListOfContents.pdf',size=(900,550))",
  15. "execution_count": 6,
  16. "outputs": [
  17. {
  18. "execution_count": 6,
  19. "metadata": {},
  20. "data": {
  21. "text/latex": "\\includegraphics[width=1.0\\textwidth]{ListOfContents.pdf}",
  22. "text/plain": "<__main__.PDF at 0x7f1fa4ede908>",
  23. "text/html": "<iframe src=ListOfContents.pdf width=900 height=550></iframe>"
  24. },
  25. "output_type": "execute_result"
  26. }
  27. ]
  28. },
  29. {
  30. "metadata": {},
  31. "cell_type": "markdown",
  32. "source": "## PDF2JSON\n-f: first page \n-l: last page \n-i: ignore images \n-split: split into subfiles (pages) \n-compress: use compressed mode "
  33. },
  34. {
  35. "metadata": {
  36. "collapsed": false,
  37. "trusted": true
  38. },
  39. "cell_type": "code",
  40. "source": "!pdf2json -f 2 -l 2 -i -xml -split 1 ListOfContents.pdf Motive.json",
  41. "execution_count": 8,
  42. "outputs": [
  43. {
  44. "name": "stdout",
  45. "output_type": "stream",
  46. "text": "Page-2\r\n"
  47. }
  48. ]
  49. },
  50. {
  51. "metadata": {},
  52. "cell_type": "markdown",
  53. "source": "## Continue with the JSON \"Motive.json\"\nfor each row: \nfirst column: text (motif, numbering, page number) \nsecond column: horizontal position of the text \ne.g. 'Paddler' is at position 376 and 'Paar' is also at position 376 --> same hierarchy level \n'Menschen' is at position 322 --> upper level in the hierarchy \n"
  54. },
  55. {
  56. "metadata": {
  57. "collapsed": false,
  58. "trusted": true
  59. },
  60. "cell_type": "code",
  61. "source": "import json\nimport codecs\nwith open('Motive.json', encoding=\"iso-8859-15\") as json_data:\n d = json.load(json_data)\n\nlist = []\nd=d[0][\"text\"];\n#print(len(d))\nfor i in range(len(d)):\n if d[i].get('data') != None:\n list.append((d[i][\"data\"],d[i][\"left\"]))\nlist",
  62. "execution_count": 45,
  63. "outputs": [
  64. {
  65. "execution_count": 45,
  66. "metadata": {},
  67. "data": {
  68. "text/plain": "[('6', 210),\n ('Inhalt', 580),\n ('1.1.1.1.4 ', 316),\n ('Paddler', 376),\n (' ', 432),\n ('53', 875),\n ('1.1.1.1.5 ', 316),\n ('ausguck', 376),\n (' ', 433),\n ('56', 875),\n ('1.1.1.1.6 ', 316),\n ('Paar', 376),\n (' ', 409),\n ('58', 875),\n ('1.1.1.1.7 ', 316),\n ('Paar, ', 376),\n ('erotisches', 418),\n (' ', 488),\n ('59', 875),\n ('1.1.1.1.8 ', 316),\n ('züchtiger', 376),\n (' ', 442),\n ('60', 875),\n ('1.1.1.1.9 ', 316),\n ('geBärende?', 376),\n (' ', 456),\n ('61', 875),\n ('1.1.1.1.10 ', 316),\n ('drachenlenker', 384),\n (' ', 486),\n ('62', 875),\n ('1.1.1.1.11 ', 316),\n ('unBestiMMBarer', 383),\n (' ', 494),\n ('63', 875),\n ('1.1.1.2 ', 274),\n ('Menschen, ', 322),\n ('teil', 399),\n (' ', 425),\n ('des', 429),\n (' ', 452),\n ('81', 875),\n ('1.1.1.2.1 ', 316),\n ('handsilhouette, ', 378),\n ('gesPrühte', 492),\n (' ', 559),\n ('81', 875),\n ('6-fingerige', 338),\n (' ', 398),\n ('82', 877),\n ('5-fingerige', 338),\n (' ', 398),\n ('(norMalhand)', 401),\n (' ', 480),\n ('82', 877),\n ('4-fingerige', 338),\n (' ', 398),\n ('88', 877),\n ('3-fingerige', 338),\n (' ', 398),\n ('91', 877),\n ('2-fingerige', 338),\n (' ', 398),\n ('92', 877),\n ('üBergrösse', 338),\n (' ', 403),\n ('93', 877),\n ('Mit', 338),\n (' ', 358),\n ('Binnenzeichnung', 362),\n (' ', 454),\n ('/ ', 458),\n ('angeschnitteneM', 465),\n (' ', 557),\n ('Motiv', 560),\n (' ', 594),\n ('94', 877),\n ('verstüMMelte?', 338),\n (' ', 420),\n ('95', 877),\n ('Beschädigte', 338),\n (' ', 405),\n ('97', 877),\n ('1.1.1.2.2 ', 316),\n ('hand, ', 378),\n ('flächige', 424),\n (' ', 478),\n ('103', 869),\n ('5-fingerige', 338),\n (' ', 398),\n ('(norMalhand)', 401),\n (' ', 480),\n ('103', 871),\n ('4-fingerige', 338),\n (' ', 398),\n ('103', 871),\n ('3-fingerige', 338),\n (' ', 398),\n ('103', 871),\n ('1.1.1.2.3 ', 316),\n ('hand, ', 378),\n ('lineare', 424),\n (' ', 471),\n ('104', 869),\n ('üBergrösse', 338),\n (' ', 403),\n ('104', 871),\n ('üBergrösse', 338),\n (' 
', 403),\n ('104', 871),\n ('1.1.1.2.4 ', 316),\n ('fusssilhouette, ', 378),\n ('gesPrühte', 488),\n (' ', 554),\n ('105', 869),\n ('5-zehige', 338),\n (' ', 382),\n ('(norMalfuss)', 386),\n (' ', 461),\n ('105', 871),\n ('4-zehige', 338),\n (' ', 382),\n ('105', 871),\n ('1.1.1.2.5 ', 316),\n ('koPf', 378),\n (' ', 411),\n ('106', 869),\n ('1.1.1.2.6 ', 316),\n ('haartracht', 378),\n (' ', 456),\n ('109', 869),\n ('1.1.1.2.7 ', 316),\n ('vulva?', 377),\n (' ', 422),\n ('113', 870),\n ('1.1.2 ', 253),\n ('tiere', 287),\n (' ', 321),\n ('114', 870),\n ('1.1.2.1 ', 274),\n ('landtiere', 324),\n (' ', 389),\n ('114', 870),\n ('1.1.2.1.1 ', 316),\n ('schWein', 378),\n (' ', 435),\n ('115', 870),\n ('1.1.2.1.2 ', 316),\n ('Bovide', 378),\n (' ', 423),\n ('117', 870),\n ('1.1.2.1.3 ', 316),\n ('canide', 378),\n (' ', 424),\n ('119', 870),\n ('1.1.2.1.4 ', 316),\n ('caPride', 378),\n (' ', 430),\n ('122', 869),\n ('1.1.2.1.5 ', 316),\n ('cervide', 378),\n (' ', 429),\n ('123', 869),\n ('1.1.2.1.6 ', 316),\n ('equide', 378),\n (' ', 424),\n ('126', 869),\n ('1.1.2.1.7 ', 316),\n ('affe', 378),\n (' ', 409),\n ('129', 869),\n ('1.1.2.1.8 ', 316),\n ('echse', 378),\n (' ', 418),\n ('129', 869),\n ('1.1.2.1.9 ', 316),\n ('fuchs?', 378),\n (' ', 426),\n ('132', 869),\n ('1.1.2.1.10 ', 316),\n ('tausendfüssler?', 384),\n (' ', 497),\n ('133', 869),\n ('1.1.2.1.11 ', 316),\n ('känguruh', 386),\n (' ', 452),\n ('134', 869),\n ('1.1.2.1.12 ', 316),\n ('schlange', 387),\n (' ', 451),\n ('135', 869),\n ('1.1.2.1.13 ', 316),\n ('sPinne', 387),\n (' ', 431),\n ('135', 869),\n ('1.1.2.1.14 ', 316),\n ('reittier, ', 387),\n ('unBestiMMBares', 448),\n (' ', 554),\n ('136', 869),\n ('1.1.2.1.15 ', 316),\n ('unBestiMMBare', 387),\n (' ', 490),\n ('138', 869)]"
  69. },
  70. "output_type": "execute_result"
  71. }
  72. ]
  73. },
  74. {
  75. "metadata": {
  76. "collapsed": true,
  77. "trusted": true
  78. },
  79. "cell_type": "code",
  80. "source": "",
  81. "execution_count": null,
  82. "outputs": []
  83. }
  84. ],
  85. "metadata": {
  86. "language_info": {
  87. "mimetype": "text/x-python",
  88. "file_extension": ".py",
  89. "version": "3.5.2",
  90. "nbconvert_exporter": "python",
  91. "codemirror_mode": {
  92. "name": "ipython",
  93. "version": 3
  94. },
  95. "pygments_lexer": "ipython3",
  96. "name": "python"
  97. },
  98. "latex_envs": {
  99. "current_citInitial": 1,
  100. "bibliofile": "biblio.bib",
  101. "cite_by": "apalike",
  102. "eqLabelWithNumbers": true,
  103. "eqNumInitial": 0
  104. },
  105. "toc": {
  106. "threshold": 6,
  107. "number_sections": true,
  108. "toc_cell": false,
  109. "toc_window_display": false,
  110. "toc_section_display": "block",
  111. "sideBar": true,
  112. "navigate_menu": true
  113. },
  114. "anaconda-cloud": {},
  115. "kernelspec": {
  116. "name": "Python [Root]",
  117. "display_name": "Python [Root]",
  118. "language": "python"
  119. },
  120. "nav_menu": {},
  121. "gist": {
  122. "id": "",
  123. "data": {
  124. "description": "rockpaintings/ParsePDF.ipynb",
  125. "public": true
  126. }
  127. }
  128. },
  129. "nbformat": 4,
  130. "nbformat_minor": 0
  131. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement