nzb_utils.py

#!/usr/bin/env python
# -*- coding: latin-1 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>

# http://docs.newzbin.com/index.php/Newzbin:NZB_Specs

import pynzb # http://pypi.python.org/pypi/pynzb/
import os
import io
import re
import time
import datetime
from xml.dom import minidom

def read_nzb(nzb_file):
    """Parse an NZB file and return the result of ``pynzb.nzb_parser.parse``.

    nzb_file: path of an NZB file on disk, or an already opened file-like
        object (anything that has a ``read`` method).

    A "Reading ..." message is printed first.  If the first parse attempt
    fails, unescaped ampersands are escaped, ``&ouml;`` is replaced by the
    literal character, and parsing is retried once.  An error raised by the
    second attempt, or by opening/reading the file, propagates to the caller.
    """
    is_stream = hasattr(nzb_file, "read")
    # File objects cannot be handed to os.path.basename; use their name
    # attribute for the message when they have one.
    label = str(getattr(nzb_file, "name", "<stream>")) if is_stream else nzb_file
    print("Reading %s." % os.path.basename(label))

    # Read the whole text once so the repair step below can reuse it and the
    # file handle is closed deterministically.
    if is_stream:
        xml_text = nzb_file.read()
    else:
        with open(nzb_file) as handle:
            xml_text = handle.read()

    try:
        return pynzb.nzb_parser.parse(xml_text)
    except Exception:
        # The concrete error type depends on the XML backend pynzb selected,
        # so catch broadly here, but let KeyboardInterrupt/SystemExit through.
        print("Parsing the nzb file failed. Trying to fix invalid XML.")

    # Problem with the ampersand.
    # newsmangler doesn't properly escape the & in the NZB
    # http://www.powergrep.com/manual/xmpxmlfixentities.html
    XML_AMP_FIX = r"&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)"
    repaired = re.sub(XML_AMP_FIX, "&amp;", xml_text)
    # &ouml; is not a predefined XML entity, so substitute the character.
    # NOTE(review): the literal's encoding follows this source file's
    # encoding, which may differ from the NZB's -- confirm.
    repaired = re.sub("&ouml;", "ö", repaired)
    return pynzb.nzb_parser.parse(repaired)

def parse_name(subject):
    """Grab the file name from the subject of a Usenet posting.

    subject: subject line of the posting.  Any ``&quot;`` entities must
        already have been replaced by real double quotes for the quoted
        form to be recognised.

    Returns the text between the first and the last double quote when the
    subject contains a quoted part.  Otherwise a looser pattern is tried
    that takes the word preceding a trailing part counter such as
    ``(1/10)``.  If neither pattern matches, the whole subject is returned.
    """
    quoted = re.search(r'"(.*)"', subject)
    if quoted:
        # The greedy match can leave doubled quotes at either end.
        return quoted.group(1).strip('"')

    # "Because the poster used a non-standard subject line, the system was
    # unable to determine the filename with certainty."
    loose = re.search(r".*(\]-| )(?P<filename>.*) [\d/\(\)]+", subject)
    if loose:
        return loose.group("filename")
    return subject

"""
NZBFile Objects
===============

All of the parsers return ``NZBFile`` objects, which are objects with the
following properties:

``poster``:
    The name of the user who posted the file to the newsgroup.

``date``:
    A ``datetime.date`` representation of when the server first saw the file
    (the ``_parse_date`` replacement defined in this module produces a
    ``datetime.datetime`` instead).

``subject``:
    The subject used when the user posted the file to the newsgroup.

``groups``:
    A list of strings representing the newsgroups in which this file may be
    found.

``segments``:
    A list of ``NZBSegment`` objects describing where to get the contents
    of this file.


NZBSegment Objects
==================

Each ``NZBFile`` has a list of ``NZBSegment`` objects, which include
information on how to retrieve a part of a file.  Here's what you can find
on an ``NZBSegment`` object:

``number``:
    The number of the segment in the list of files.

``bytes``:
    The size of the segment, in bytes.

``message_id``:
    The Message-ID of the segment (useful for retrieving the full contents)
"""

# fix pynzb library
def _parse_date(date):
    """Convert a POSIX timestamp into a ``datetime.datetime`` (local time).

    date: seconds since the epoch, given either as a number or as a string
        of digits; strings are converted with ``int`` first.
    """
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = (str, bytes)  # Python 3 has no basestring
    if isinstance(date, text_types):
        date = int(date)
    return datetime.datetime.fromtimestamp(date)
# Rebind pynzb's module-level parse_date to the replacement defined above.
pynzb.base.parse_date = _parse_date

# add compare functionality
def _equality_test(self, other):
    """Value-based equality for segment objects.

    Two segments are equal when their ``bytes`` and ``number`` match and,
    if both carry a ``message_id``, those match as well.  When either side
    has no ``message_id`` attribute it is left out of the comparison.

    Returns ``NotImplemented`` when ``other`` has no ``bytes``/``number``
    attributes, so comparing against an unrelated object yields the
    interpreter's default result instead of raising AttributeError.
    """
    try:
        other_bytes = other.bytes
        other_number = other.number
    except AttributeError:
        # Not segment-like: defer to Python's default comparison handling.
        return NotImplemented

    if not (self.bytes == other_bytes and self.number == other_number):
        return False

    # Private sentinel so that a real message_id of None is still compared.
    missing = object()
    own_id = getattr(self, "message_id", missing)
    other_id = getattr(other, "message_id", missing)
    if own_id is missing or other_id is missing:
        return True
    return own_id == other_id
# Install the comparison defined above as NZBSegment's == operator.
pynzb.base.NZBSegment.__eq__ = _equality_test

# pynzb library only supports parsing
#'add_group', 'add_segment', 'date', 'groups', 'poster', 'segments', 'subject'

def empty_nzb_document():
    """Build a new minidom document for an NZB file.

    The returned document carries the NZB 1.1 DOCTYPE and an empty ``nzb``
    root element whose ``xmlns`` attribute names the newzbin namespace.
    """
    namespace = "http://www.newzbin.com/DTD/2003/nzb"
    dom_impl = minidom.getDOMImplementation()

    # The DOCTYPE is created through the DOM implementation, see
    # http://stackoverflow.com/questions/1980380/how-to-render-a-doctype-with-pythons-xml-dom-minidom
    doctype = dom_impl.createDocumentType(
        "nzb",
        "-//newzBin//DTD NZB 1.1//EN",
        "http://www.newzbin.com/DTD/nzb/nzb-1.1.dtd")
    document = dom_impl.createDocument(namespace, "nzb", doctype)

    # The namespace declaration is set explicitly as an attribute, see
    # http://stackoverflow.com/questions/2306149/how-to-write-xml-elements-with-namespaces-in-python
    root = document.documentElement
    root.setAttribute("xmlns", namespace)
    return document

def get_pretty_xml(document):
    """Serialize the document as pretty-printed, UTF-8 encoded XML.

    document: object offering minidom's ``toprettyxml`` method.

    The NZB will have bugs if used with the wrong Python version:
    http://bugs.python.org/issue1777134
    """
    pretty = document.toprettyxml(encoding="UTF-8")
    return pretty

def get_xml(document):
    """Serialize the document as compact, UTF-8 encoded XML.

    document: object offering minidom's ``toxml`` method.
    """
    serialized = document.toxml("UTF-8")
    return serialized

def _date_to_posix(date):
    """Return the POSIX timestamp of date as a string of whole seconds.

    date: datetime.datetime object; its time tuple is interpreted by
        ``time.mktime``, i.e. as local time.
    """
    seconds = int(time.mktime(date.timetuple()))
    return "%d" % seconds

def add_file(document, nzb_file):
    """Append a ``file`` element describing nzb_file to the document root.

    document: xml.dom.minidom.Document object
    nzb_file: object providing ``poster``, ``date``, ``subject``, ``groups``
        and ``segments``; each segment provides ``bytes``, ``number`` and
        ``message_id``.

    Returns the same document object that was passed in.
    """
    root = document.documentElement
    file_node = document.createElement("file")

    # attributes of the <file> element
    set_file_attr = file_node.setAttribute
    set_file_attr("poster", nzb_file.poster)
    set_file_attr("date", _date_to_posix(nzb_file.date))
    set_file_attr("subject", nzb_file.subject)

    # <groups>: one <group> text element per newsgroup
    groups_node = document.createElement("groups")
    for group_name in nzb_file.groups:
        group_node = document.createElement("group")
        group_node.appendChild(document.createTextNode(group_name))
        groups_node.appendChild(group_node)
    file_node.appendChild(groups_node)

    # <segments>: one <segment> per part, with the message id as its text
    segments_node = document.createElement("segments")
    for part in nzb_file.segments:
        segment_node = document.createElement("segment")
        for attr_name in ("bytes", "number"):
            segment_node.setAttribute(attr_name, str(getattr(part, attr_name)))
        segment_node.appendChild(document.createTextNode(part.message_id))
        segments_node.appendChild(segment_node)
    file_node.appendChild(segments_node)

    # Attach the finished element only once it is fully built.
    root.appendChild(file_node)
    return document

def list_filenames(nzb_file):
    """Return a list of (file name, date) tuples, one per file in the NZB.

    nzb_file: passed straight to ``read_nzb``.  The file name is extracted
    from each entry's subject with ``parse_name``.
    """
    entries = []
    for posted in read_nzb(nzb_file):
        entries.append((parse_name(posted.subject), posted.date))
    return entries