Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "metadata": {},
- "cell_type": "markdown",
- "source": "## PDF preparation (create \"sub\" PDF with required information/pages, here ListOfContents.pdf)\nAus Copyright-Gründen nur das Inhaltsverzeichnis (for copyright reasons, only the table of contents)."
- },
- {
- "metadata": {
- "collapsed": false,
- "trusted": true
- },
- "cell_type": "code",
- "source": "#pdftk Indonesien_Motive.pdf cat 5-12 output ListOfContents.pdf\nclass PDF(object):\n def __init__(self, pdf, size=(200,200)):\n self.pdf = pdf\n self.size = size\n\n def _repr_html_(self):\n return '<iframe src={0} width={1[0]} height={1[1]}></iframe>'.format(self.pdf, self.size)\n\n def _repr_latex_(self):\n return r'\\includegraphics[width=1.0\\textwidth]{{{0}}}'.format(self.pdf)\nPDF('ListOfContents.pdf',size=(900,550))",
- "execution_count": 6,
- "outputs": [
- {
- "execution_count": 6,
- "metadata": {},
- "data": {
- "text/latex": "\\includegraphics[width=1.0\\textwidth]{ListOfContents.pdf}",
- "text/plain": "<__main__.PDF at 0x7f1fa4ede908>",
- "text/html": "<iframe src=ListOfContents.pdf width=900 height=550></iframe>"
- },
- "output_type": "execute_result"
- }
- ]
- },
- {
- "metadata": {},
- "cell_type": "markdown",
- "source": "## PDF2JSON\nOptions used below:\n\n- `-f`: first page\n- `-l`: last page\n- `-i`: ignore images\n- `-split`: split into subfiles (pages)\n- `-compress`: use compressed mode"
- },
- {
- "metadata": {
- "collapsed": false,
- "trusted": true
- },
- "cell_type": "code",
- "source": "!pdf2json -f 2 -l 2 -i -xml -split 1 ListOfContents.pdf Motive.json",
- "execution_count": 8,
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": "Page-2\r\n"
- }
- ]
- },
- {
- "metadata": {},
- "cell_type": "markdown",
- "source": "## Continue with the JSON \"Motive.json\"\nFor each row:\n\n- first column: text (motif, numbering, page number)\n- second column: horizontal position of the text\n\nE.g. 'Paddler' is at position 376 and 'Paar' is also at position 376 --> same hierarchy level, etc.  \n'Menschen' is at position 322 --> upper level in the hierarchy.\n"
- },
- {
- "metadata": {
- "collapsed": false,
- "trusted": true
- },
- "cell_type": "code",
- "source": "import json\nimport codecs\nwith open('Motive.json', encoding=\"iso-8859-15\") as json_data:\n d = json.load(json_data)\n\nlist = []\nd=d[0][\"text\"];\n#print(len(d))\nfor i in range(len(d)):\n if d[i].get('data') != None:\n list.append((d[i][\"data\"],d[i][\"left\"]))\nlist",
- "execution_count": 45,
- "outputs": [
- {
- "execution_count": 45,
- "metadata": {},
- "data": {
- "text/plain": "[('6', 210),\n ('Inhalt', 580),\n ('1.1.1.1.4 ', 316),\n ('Paddler', 376),\n (' ', 432),\n ('53', 875),\n ('1.1.1.1.5 ', 316),\n ('ausguck', 376),\n (' ', 433),\n ('56', 875),\n ('1.1.1.1.6 ', 316),\n ('Paar', 376),\n (' ', 409),\n ('58', 875),\n ('1.1.1.1.7 ', 316),\n ('Paar, ', 376),\n ('erotisches', 418),\n (' ', 488),\n ('59', 875),\n ('1.1.1.1.8 ', 316),\n ('züchtiger', 376),\n (' ', 442),\n ('60', 875),\n ('1.1.1.1.9 ', 316),\n ('geBärende?', 376),\n (' ', 456),\n ('61', 875),\n ('1.1.1.1.10 ', 316),\n ('drachenlenker', 384),\n (' ', 486),\n ('62', 875),\n ('1.1.1.1.11 ', 316),\n ('unBestiMMBarer', 383),\n (' ', 494),\n ('63', 875),\n ('1.1.1.2 ', 274),\n ('Menschen, ', 322),\n ('teil', 399),\n (' ', 425),\n ('des', 429),\n (' ', 452),\n ('81', 875),\n ('1.1.1.2.1 ', 316),\n ('handsilhouette, ', 378),\n ('gesPrühte', 492),\n (' ', 559),\n ('81', 875),\n ('6-fingerige', 338),\n (' ', 398),\n ('82', 877),\n ('5-fingerige', 338),\n (' ', 398),\n ('(norMalhand)', 401),\n (' ', 480),\n ('82', 877),\n ('4-fingerige', 338),\n (' ', 398),\n ('88', 877),\n ('3-fingerige', 338),\n (' ', 398),\n ('91', 877),\n ('2-fingerige', 338),\n (' ', 398),\n ('92', 877),\n ('üBergrösse', 338),\n (' ', 403),\n ('93', 877),\n ('Mit', 338),\n (' ', 358),\n ('Binnenzeichnung', 362),\n (' ', 454),\n ('/ ', 458),\n ('angeschnitteneM', 465),\n (' ', 557),\n ('Motiv', 560),\n (' ', 594),\n ('94', 877),\n ('verstüMMelte?', 338),\n (' ', 420),\n ('95', 877),\n ('Beschädigte', 338),\n (' ', 405),\n ('97', 877),\n ('1.1.1.2.2 ', 316),\n ('hand, ', 378),\n ('flächige', 424),\n (' ', 478),\n ('103', 869),\n ('5-fingerige', 338),\n (' ', 398),\n ('(norMalhand)', 401),\n (' ', 480),\n ('103', 871),\n ('4-fingerige', 338),\n (' ', 398),\n ('103', 871),\n ('3-fingerige', 338),\n (' ', 398),\n ('103', 871),\n ('1.1.1.2.3 ', 316),\n ('hand, ', 378),\n ('lineare', 424),\n (' ', 471),\n ('104', 869),\n ('üBergrösse', 338),\n (' ', 403),\n ('104', 871),\n ('üBergrösse', 338),\n (' ', 
403),\n ('104', 871),\n ('1.1.1.2.4 ', 316),\n ('fusssilhouette, ', 378),\n ('gesPrühte', 488),\n (' ', 554),\n ('105', 869),\n ('5-zehige', 338),\n (' ', 382),\n ('(norMalfuss)', 386),\n (' ', 461),\n ('105', 871),\n ('4-zehige', 338),\n (' ', 382),\n ('105', 871),\n ('1.1.1.2.5 ', 316),\n ('koPf', 378),\n (' ', 411),\n ('106', 869),\n ('1.1.1.2.6 ', 316),\n ('haartracht', 378),\n (' ', 456),\n ('109', 869),\n ('1.1.1.2.7 ', 316),\n ('vulva?', 377),\n (' ', 422),\n ('113', 870),\n ('1.1.2 ', 253),\n ('tiere', 287),\n (' ', 321),\n ('114', 870),\n ('1.1.2.1 ', 274),\n ('landtiere', 324),\n (' ', 389),\n ('114', 870),\n ('1.1.2.1.1 ', 316),\n ('schWein', 378),\n (' ', 435),\n ('115', 870),\n ('1.1.2.1.2 ', 316),\n ('Bovide', 378),\n (' ', 423),\n ('117', 870),\n ('1.1.2.1.3 ', 316),\n ('canide', 378),\n (' ', 424),\n ('119', 870),\n ('1.1.2.1.4 ', 316),\n ('caPride', 378),\n (' ', 430),\n ('122', 869),\n ('1.1.2.1.5 ', 316),\n ('cervide', 378),\n (' ', 429),\n ('123', 869),\n ('1.1.2.1.6 ', 316),\n ('equide', 378),\n (' ', 424),\n ('126', 869),\n ('1.1.2.1.7 ', 316),\n ('affe', 378),\n (' ', 409),\n ('129', 869),\n ('1.1.2.1.8 ', 316),\n ('echse', 378),\n (' ', 418),\n ('129', 869),\n ('1.1.2.1.9 ', 316),\n ('fuchs?', 378),\n (' ', 426),\n ('132', 869),\n ('1.1.2.1.10 ', 316),\n ('tausendfüssler?', 384),\n (' ', 497),\n ('133', 869),\n ('1.1.2.1.11 ', 316),\n ('känguruh', 386),\n (' ', 452),\n ('134', 869),\n ('1.1.2.1.12 ', 316),\n ('schlange', 387),\n (' ', 451),\n ('135', 869),\n ('1.1.2.1.13 ', 316),\n ('sPinne', 387),\n (' ', 431),\n ('135', 869),\n ('1.1.2.1.14 ', 316),\n ('reittier, ', 387),\n ('unBestiMMBares', 448),\n (' ', 554),\n ('136', 869),\n ('1.1.2.1.15 ', 316),\n ('unBestiMMBare', 387),\n (' ', 490),\n ('138', 869)]"
- },
- "output_type": "execute_result"
- }
- ]
- },
- {
- "metadata": {
- "collapsed": true,
- "trusted": true
- },
- "cell_type": "code",
- "source": "",
- "execution_count": null,
- "outputs": []
- }
- ],
- "metadata": {
- "language_info": {
- "mimetype": "text/x-python",
- "file_extension": ".py",
- "version": "3.5.2",
- "nbconvert_exporter": "python",
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "pygments_lexer": "ipython3",
- "name": "python"
- },
- "latex_envs": {
- "current_citInitial": 1,
- "bibliofile": "biblio.bib",
- "cite_by": "apalike",
- "eqLabelWithNumbers": true,
- "eqNumInitial": 0
- },
- "toc": {
- "threshold": 6,
- "number_sections": true,
- "toc_cell": false,
- "toc_window_display": false,
- "toc_section_display": "block",
- "sideBar": true,
- "navigate_menu": true
- },
- "anaconda-cloud": {},
- "kernelspec": {
- "name": "Python [Root]",
- "display_name": "Python [Root]",
- "language": "python"
- },
- "nav_menu": {},
- "gist": {
- "id": "",
- "data": {
- "description": "rockpaintings/ParsePDF.ipynb",
- "public": true
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement