Advertisement
Roman_Sarnov

Untitled

Nov 7th, 2019
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.98 KB | None | 0 0
  1. from os.path import exists
  2. from tempfile import mkdtemp, mkstemp
  3. from shutil import rmtree
  4. from binascii import b2a_hex
  5. from os import write, close
  6. from threading import Thread
  7.  
  8. from pdfminer.pdfpage import PDFPage
  9. from pdfminer.pdfparser import PDFParser
  10. from pdfminer.converter import PDFPageAggregator
  11. from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
  12. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  13. from pdfminer.layout import (
  14. LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar, LTCurve,
  15. LTLine, LTRect,
  16. )
  17.  
  18. from kivy.lang import Builder
  19. from kivy.clock import Clock
  20. import kivy.graphics
  21. from kivy.graphics import Mesh, Color
  22. from kivy.graphics.tesselator import Tesselator
  23.  
  24. from kivy.uix.widget import Widget
  25. from kivy.uix.recycleview import RecycleView
  26. from kivy.uix.label import Label
  27. from kivy.uix.image import Image
  28. from kivy.uix.relativelayout import RelativeLayout
  29. from kivy.uix.boxlayout import BoxLayout
  30.  
  31. from kivy.properties import (
  32. StringProperty, ListProperty, NumericProperty, AliasProperty,
  33. DictProperty, ObjectProperty, BooleanProperty, ColorProperty,
  34. )
  35.  
  36. Builder.load_string('''
  37. #:import RGBA kivy.utils.rgba
  38.  
  39. <PDFDocumentWidget>:
  40. viewclass: 'PDFPageWidget'
  41. key_size: 'size'
  42. # async load is buggy at the moment
  43. # async_load: True
  44.  
  45. RecycleGridLayout:
  46. spacing: 5
  47. cols: root.cols
  48. rows: root.rows
  49. size_hint: None, None
  50. size: self.minimum_size
  51. default_size_hint: None, None
  52.  
  53. <PDFPageWidget>:
  54. size_hint: None, None
  55.  
  56. canvas.before:
  57. Color:
  58. rgba: RGBA('FFFFFF')
  59. Rectangle:
  60. size: self.size
  61.  
  62. <PDFLabelWidget,PDFImageWidget>:
  63. size_hint: None, None
  64.  
  65. <PDFImageWidget>:
  66. pos: self.bbox[:2]
  67. size: self.bbox[2] - self.x, self.bbox[3] - self.y
  68.  
  69. <PDFLabelWidget>:
  70. text_size: self.width, None
  71. height: self.texture_size[1]
  72. color: RGBA('000000')
  73. font_size: 8
  74.  
  75. <PDFCurveWidget>:
  76. ''')
  77.  
  78.  
  79. class PDFDocumentWidget(RecycleView):
  80. source = StringProperty()
  81. password = StringProperty()
  82. cols = NumericProperty(None)
  83. rows = NumericProperty(None)
  84. _toc = ListProperty()
  85. async_load = BooleanProperty(False)
  86.  
  87. def __init__(self, **kwargs):
  88. super(PDFDocumentWidget, self).__init__(**kwargs)
  89. self._fp = None
  90. self._document = None
  91. self._tmpdir = None
  92. self.bind(source=self.load)
  93. if self.source:
  94. self.load()
  95.  
  96. def load(self, *args):
  97. if self._fp:
  98. # close the previous pdf file
  99. self._fp.close()
  100.  
  101. pdf_doc = self.source
  102. data = []
  103. if not pdf_doc or not exists(pdf_doc):
  104. self.pages = []
  105. self._doc = []
  106. self._document = None
  107. if self._tmpdir:
  108. rmtree(self._tmpdir)
  109. self._tmpdir = None
  110.  
  111. try:
  112. # open the pdf file
  113. self._fp = fp = open(pdf_doc, 'rb')
  114. # create a parser object associated with the file object
  115. parser = PDFParser(fp)
  116. # create a PDFDocument object that stores the document structure
  117. doc = PDFDocument(parser)
  118. # connect the parser and document objects
  119. parser.set_document(doc)
  120. # supply the password for initialization
  121. # doc.initialize(self.password)
  122.  
  123. # if doc.is_extractable:
  124. # apply the function and return the result
  125. self._document = doc
  126. self._parse_toc()
  127. self._create_tmpdir()
  128. self._parse_pages()
  129. except IOError as e:
  130. # the file doesn't exist or similar problem
  131. print(e)
  132.  
  133. def _create_tmpdir(self):
  134. if not self._tmpdir:
  135. self._tmpdir = mkdtemp()
  136. return self._tmpdir
  137.  
  138. def _parse_toc(self):
  139. """With an open PDFDocument object, get the table of contents (toc) data
  140. [this is a higher-order function to be passed to with_pdf()]"""
  141. toc = []
  142. doc = self._document
  143. try:
  144. outlines = doc.get_outlines()
  145. for (level, title, dest, a, se) in outlines:
  146. toc.append((level, title))
  147. except:
  148. pass
  149. finally:
  150. self._toc = toc
  151.  
  152. def _parse_pages(self):
  153. doc = self._document
  154. if not doc:
  155. self.data = []
  156. return
  157.  
  158. data = []
  159.  
  160. rsrcmgr = PDFResourceManager()
  161. laparams = LAParams()
  162. self.device = device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  163. self.interpreter = PDFPageInterpreter(rsrcmgr, device)
  164.  
  165. for i, page in enumerate(PDFPage.create_pages(doc)):
  166. p = {
  167. 'manager': self,
  168. 'page': page,
  169. 'size': page.attrs.get('MediaBox', [0, 0, 0, 0])[2:],
  170. }
  171. data.append(p)
  172. self.data = data
  173.  
  174.  
  175. class PDFImageWidget(Image):
  176. bbox = ListProperty([0, 0, 100, 100])
  177.  
  178.  
  179. class PDFLabelWidget(Label):
  180. bbox = ListProperty([0, 0, 100, 100])
  181.  
  182.  
  183. class PDFCurveWidget(Widget):
  184. points = ListProperty()
  185. line_width = NumericProperty()
  186. stroke = BooleanProperty(False)
  187. fill = BooleanProperty(False)
  188. even_odd = BooleanProperty()
  189. color = ColorProperty()
  190. fill_color = ColorProperty()
  191.  
  192. def __init__(self, **kwargs):
  193. super(PDFCurveWidget, self).__init__(**kwargs)
  194. build = Clock.create_trigger(self.build, 0)
  195.  
  196. self.bind(
  197. points=build,
  198. line_width=build,
  199. stroke=build,
  200. fill=build,
  201. even_odd=build,
  202. color=build,
  203. fill_color=build
  204. )
  205.  
  206. def build(self, *args):
  207. self.canvas.clear()
  208. if not self.points:
  209. return
  210.  
  211. with self.canvas:
  212. if self.fill:
  213. Color(rgba=self.fill_color)
  214. t = Tesselator()
  215. t.add_contour(self.points)
  216. if tess.tesselate:
  217. for vertices, indices in tess.meshes:
  218. Mesh(
  219. vertices=vertices,
  220. indices=indices,
  221. mode='triangle fan'
  222. )
  223. else:
  224. print("mesh didn't tesselate!")
  225.  
  226. if self.stroke:
  227. Color(rgba=self.color)
  228. Line(
  229. points=self.points,
  230. width=self.line_width
  231. )
  232.  
  233.  
  234. class PDFPageWidget(RelativeLayout):
  235. labels = DictProperty()
  236. attributes = DictProperty()
  237. manager = ObjectProperty()
  238. page = ObjectProperty()
  239. items = ListProperty()
  240.  
  241. def on_page(self, *args):
  242. if self.manager.async_load:
  243. Thread(target=self._load_page).start()
  244. else:
  245. self._load_page()
  246.  
  247. def _load_page(self):
  248. self.manager.interpreter.process_page(self.page)
  249. self.items = self.manager.device.get_result()
  250.  
  251. def on_items(self, *args):
  252. self.clear_widgets()
  253. self._render_content(self.items)
  254.  
  255. def _render_content(self, lt_objs):
  256. """Iterate through the list of LT* objects and capture the text
  257. or image data contained in each
  258. """
  259. for lt_obj in lt_objs:
  260. print(lt_obj)
  261. if isinstance(lt_obj, LTChar):
  262. self.add_text(
  263. text=lt_obj.get_text(),
  264. box_pos=(lt_obj.x0, lt_obj.y0),
  265. box_size=(lt_obj.width, lt_obj.height),
  266. # font_size=lt_obj.fontsize,
  267. # font_name=lt_obj.fontname,
  268. )
  269.  
  270. elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
  271. # text, so arrange is logically based on its column width
  272. # this way is very limited style wise, and doesn't allow
  273. # support for font, color, style, etc management, as
  274. # pdfminer doesn't provide these information at text box
  275. # level, by using the following nested loop, it's
  276. # possible to have font family info, but for individual
  277. # character, which is impractical to create direct
  278. # labels for.
  279. # for obj in lt_obj:
  280. # print(obj)
  281. # for o in obj:
  282. # print(o)
  283.  
  284. self.add_text(
  285. text=lt_obj.get_text(),
  286. box_pos=(lt_obj.x0, lt_obj.y0),
  287. box_size=(lt_obj.width, lt_obj.height),
  288. )
  289.  
  290. elif isinstance(lt_obj, LTImage):
  291. saved_file = self.save_image(lt_obj)
  292. if saved_file:
  293. self.add_widget(
  294. PDFImageWidget(
  295. source=saved_file,
  296. bbox=lt_obj.bbox
  297. )
  298. )
  299.  
  300. elif isinstance(lt_obj, LTFigure):
  301. self._render_content(lt_obj)
  302.  
  303. # all of these are actually LTCurves, but all types here for
  304. # clarity
  305. elif isinstance(lt_obj, (LTLine, LTRect, LTCurve)):
  306. self.add_widget(
  307. PDFCurveWidget(
  308. points=lt_obj.pts or [],
  309. line_width=lt_obj.linewidth or 1.0,
  310. stroke=lt_obj.stroke,
  311. fill=lt_obj.fill,
  312. even_odd=lt_obj.evenodd,
  313. # colors seem to be indices, to some dict i
  314. # can't find in what pdfminer exposes
  315. color='#FFFFFFFF', # lt_obj.stroking_color or
  316. fill_color='#00000000' # lt_obj.non_stroking_color or
  317. )
  318. )
  319.  
  320. def save_image(self, lt_image):
  321. """Try to save the image data from this LTImage object, and
  322. return the file name, if successful
  323. """
  324. if lt_image.stream:
  325. file_stream = lt_image.stream.get_rawdata()
  326. if file_stream:
  327. file_ext = self.determine_image_type(file_stream[0:4])
  328. if file_ext:
  329. fd, fn = mkstemp(dir=self.manager._tmpdir, suffix='.{}'.format(file_ext))
  330. write(fd, file_stream)
  331. close(fd)
  332. return fn
  333.  
  334. @staticmethod
  335. def determine_image_type(stream_first_4_bytes):
  336. """Find out the image file type based on the magic number comparison of the first 4 (or 2) bytes"""
  337. file_type = None
  338. bytes_as_hex = b2a_hex(stream_first_4_bytes)
  339. if bytes_as_hex.startswith(b'ffd8'):
  340. file_type = '.jpeg'
  341. elif bytes_as_hex == b'89504e47':
  342. file_type = '.png'
  343. elif bytes_as_hex == b'47494638':
  344. file_type = '.gif'
  345. elif bytes_as_hex.startswith(b'424d'):
  346. file_type = '.bmp'
  347. return file_type
  348.  
  349. def add_text(self, text, box_pos, box_size, **kwargs):
  350. label = self.labels.get((box_pos, box_pos))
  351. if not label:
  352. label = PDFLabelWidget(text=text, pos=box_pos, size=box_size, **kwargs)
  353. self.labels[(box_pos, box_size)] = label
  354. self.add_widget(label)
  355. else:
  356. label.text += text
  357.  
  358. def add_image(self, lt_image):
  359. source = self.save_image(lt_image)
  360. if source:
  361. image = PDFImageWidget(
  362. source=source,
  363. pos=(lt_image.x0, lt_image.y0),
  364. size=(lt_image.widt, lt_image.height)
  365. )
  366. self.add_widget(image)
  367. self.images.append(image)
  368.  
  369.  
  370. if __name__ == '__main__':
  371. from sys import argv
  372. from kivy.base import runTouchApp
  373. from kivy.uix.scrollview import ScrollView
  374. fn = 'c'
  375. root = PDFDocumentWidget(source=fn, cols=1)
  376.  
  377.  
  378.  
  379.  
  380.  
  381.  
  382. from kivy.app import App
  383. from kivy.lang import Builder
  384. from kivy.uix.recycleview import RecycleView
  385. from kivy.uix.screenmanager import ScreenManager, Screen
  386. from kivy.uix.boxlayout import BoxLayout
  387. from kivy.uix.gridlayout import GridLayout
  388. from kivy.config import Config
  389. from knopki import Knopki
  390.  
  391. Config.set("graphics", 'resizable', 0)
  392. Config.set("graphics", 'height', 960)
  393. Config.set("graphics", 'width', 640)
  394. Builder.load_string("""
  395. #:import Knopki knopki
  396. <Button1@Button>:
  397. background_color:255, 255, 255, 1
  398. size:300,150
  399. size_hint:None, None
  400.  
  401. <Image>:
  402. source:'фон.jpg'
  403. allow_stretch: True
  404.  
  405. <Label>:
  406. color:0,0,0,1
  407. font_size:30
  408.  
  409. <BoxLayout>:
  410. orientation:'vertical'
  411. spacing:200
  412. padding:170,200
  413. background_color:1,1,1,1
  414.  
  415. <GridLayout10@GridLayout>:
  416. cols:2
  417. spacing:10,40
  418. padding:15
  419.  
  420. <MainScreen>:
  421. name:"Menu"
  422. Image
  423. BoxLayout:
  424. Button1:
  425. on_press:root.manager.current="Subjects"
  426. text:"Subjects"
  427. Button1:
  428. text:"Special"
  429. on_press:root.manager.current="Special"
  430.  
  431.  
  432. <SubjectsScreen>:
  433. name:"Subjects"
  434. Image
  435. BoxLayout:
  436. Button1:
  437. text:"10 class"
  438. on_press:root.manager.current="SubjectsScreen10"
  439. Button1:
  440. text:"11 class"
  441. on_press:root.manager.current="SubjectsScreen11"
  442.  
  443. <SubjectsScreen10>:
  444. name:"SubjectsScreen10"
  445. Image
  446. GridLayout10:
  447. Button1:
  448. text:"Геометрия"
  449. Button1:
  450. text:"Алгебра"
  451. Button1:
  452. text:"Химия"
  453. Button1:
  454. text:"Физика"
  455. Button1:
  456. text:"Русская литература"
  457. Button1:
  458. text:"Русский язык"
  459. on_press:root.russ_yaz()
  460. Button1:
  461. text:"География"
  462. Button1:
  463. text:"Биология"
  464. Button1:
  465. text:"Белорусский язык"
  466.  
  467. <SubjectsScreen11>:
  468. name:"SubjectsScreen11"
  469. Image
  470. GridLayout10:
  471. Button1:
  472. text:"Алгебра"
  473. Button1:
  474. text:"Геометрия"
  475. Button1:
  476. text:"Физика"
  477. Button1:
  478. text:"Химия"
  479. Button1:
  480. text:"Русская литература"
  481. Button1:
  482. text:"Русский язык"
  483. on_press:self.russ_yaz
  484.  
  485. Button1:
  486. text:"География"
  487. Button1:
  488. text:"Биология"
  489. Button1:
  490. text:"Белорусский язык"
  491. Button1:
  492. text:"Астрономия"
  493.  
  494. <SpecialSubjectsScreen>:
  495. name:"Special"
  496. Image
  497. AnchorLayout:
  498. Button1:
  499. text:"No material(Back)"
  500. on_press: root.manager.current='Menu'
  501. """)
  502.  
  503.  
  504. class MainScreen(Screen):
  505. pass
  506.  
  507.  
  508. class SpecialSubjectsScreen(Screen):
  509. pass
  510.  
  511.  
  512. class SubjectsScreen(Screen):
  513. pass
  514.  
  515.  
  516. class SubjectsScreen10(Screen):
  517. def russ_yaz(self):
  518. fn = 'Дудников, А.В Русский язык.pdf'
  519. root = PDFDocumentWidget(source=fn, cols=1)
  520. runTouchApp(root)
  521. PDFDocumentWidget()
  522.  
  523.  
  524.  
  525. class SubjectsScreen11(Screen):
  526. pass
  527.  
  528.  
  529. sm = ScreenManager()
  530. sm.add_widget(MainScreen(name='Menu'))
  531. sm.add_widget(SpecialSubjectsScreen(name='Special'))
  532. sm.add_widget(SubjectsScreen(name="Subjects"))
  533. sm.add_widget(SubjectsScreen10(name="SubjectsScreen10"))
  534. sm.add_widget(SubjectsScreen11(name="SubjectsScreen11"))
  535.  
  536.  
  537. class LibraryApp(App):
  538. def russ_yaz(self):
  539. fn = 'Дудников, А.В Русский язык.pdf'
  540. root = PDFDocumentWidget(source=fn, cols=1)
  541. runTouchApp(root)
  542. def build(self):
  543. return sm
  544.  
  545.  
  546.  
  547. if __name__ == '__main__':
  548. LibraryApp().run()
  549.  
  550.  
  551. runTouchApp(root)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement