Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import pyPdf
- import re
- import isbn
- import sys
def get_pdf_content(path, pages_count=10):
    """Return the whitespace-normalised text of the leading pages of a PDF.

    Args:
        path: Filesystem path of the PDF file to read.
        pages_count: Maximum number of leading pages to extract. Fewer pages
            are read when the document is shorter than this.

    Returns:
        The extracted text with every run of whitespace (non-breaking spaces
        included) collapsed to a single space, encoded as ASCII with all
        non-ASCII characters dropped.
    """
    # A context manager guarantees the handle is closed, even when the PDF
    # turns out to be unreadable and the parser raises.
    with open(path, "rb") as pdf_file:
        pdf = pyPdf.PdfFileReader(pdf_file)
        pages = min(pdf.getNumPages(), pages_count)
        # Extract while the file is still open: the reader may pull page data
        # from the underlying stream on demand.
        page_texts = [pdf.getPage(i).extractText() for i in range(pages)]

    content = "\n".join(page_texts)
    # Turn non-breaking spaces into plain ones first so that split() treats
    # them like any other whitespace when collapsing runs.
    content = " ".join(content.replace(u"\xa0", " ").split())
    return content.encode("ascii", "ignore")
# Script body: scan the leading pages of the PDF named on the command line
# for ISBN-13 and ISBN-10 numbers and print the ones that validate.

if len(sys.argv) < 2:
    # Fail with a readable message rather than an IndexError traceback.
    sys.exit("usage: %s FILE.pdf" % sys.argv[0])

content = get_pdf_content(sys.argv[1])

# Both patterns look for the literal "ISBN", allow up to 15 arbitrary
# characters (e.g. "-13: "), then capture the digits, each of which may be
# followed by a single hyphen.
ISBN_13_PATTERN = r"ISBN.{0,15}((?:[0-9]-?){12}[0-9])"
# The final character of an ISBN-10 is a check digit that may be "X"
# (standing for the value 10), so it has to be accepted in the last position.
ISBN_10_PATTERN = r"ISBN.{0,15}((?:[0-9]-?){9}[0-9X])"

# De-duplicate the candidates and sort them so the output order is stable.
isbn_13 = sorted(set(re.findall(ISBN_13_PATTERN, content)))
isbn_10 = sorted(set(re.findall(ISBN_10_PATTERN, content)))

# The regexes only check the shape; the isbn module decides actual validity.
isbn_13_valid = [candidate for candidate in isbn_13 if isbn.isValid(candidate)]
isbn_10_valid = [candidate for candidate in isbn_10 if isbn.isValid(candidate)]

# A single pre-formatted argument prints the same text whether print is a
# statement or a function.
print("ISBN-13: %s" % (isbn_13_valid,))
print("ISBN-10: %s" % (isbn_10_valid,))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement