url.py

#!/usr/bin/env python
"""
url.py - jenni Bitly Module
Copyright 2015, Sujeet Akula (sujeet@freeboson.org)
Copyright 2010-2013, Michael Yanovich (yanovich.net)
Copyright 2010-2013, Kenneth Sham
Licensed under the Eiffel Forum License 2.

More info:
 * jenni: https://github.com/myano/jenni/
 * Phenny: http://inamidst.com/phenny/

This module will record all URLs to bitly via an api key and account.
It also automatically displays the "title" of any URL pasted into the channel.
"""

import json
import re
from htmlentitydefs import name2codepoint
from modules import unicode as uc
from modules import proxy
import time
import urllib2
import web


# Place a file in your ~/jenni/ folder named, bitly.txt
# and inside this file place your API key followed by a ','
# and then your username. For example, the only line in that
# file should look like this:
# R_d67798xkjc87sdx6x8c7kjc87,myusername

# this variable is to determine when to use bitly. If the URL is more
# than this length, it'll display a bitly URL instead. To disable bit.ly,
# put None even if it's set to None, triggering .bitly command will still work!
BITLY_TRIGGER_LEN_TITLE = 20
BITLY_TRIGGER_LEN_NOTITLE = 80
EXCLUSION_CHAR = '!'
IGNORE = ["http://morethan.tv/user.php"]

# do not edit below this line unless you know what you're doing
bitly_loaded = False
BLOCKED_MODULES = ['bitly', 'head', 'host', 'ip', 'isup', 'longurl', 'py',
                   'short', 'spotify', 'sp', 'st', 'tell', 'title', 'tw',
                   'twitter', 'unbitly', 'untiny',]
simple_channels = list()

try:
    file = open('bitly.txt', 'r')
    key = file.read()
    key = key.split(',')
    bitly_api_key = str(key[0].strip())
    bitly_user = str(key[1].strip())
    file.close()
    bitly_loaded = True
except:
    print 'WARNING: No bitly.txt found.'

try:
    f = open('simple_channels.txt', 'r')
    channels = f.read()
    channels = channels.split(',')
    for channel in channels:
        simple_channels.append(channel.strip())
    f.close()
except:
    print 'WARNING: No simple_channels.txt found'

url_finder = re.compile(r'(?iu)(%s?(http|https|ftp)(://\S+\.?\S+/?\S+?))' %
                        (EXCLUSION_CHAR))
r_entity = re.compile(r'&[A-Za-z0-9#]+;')
INVALID_WEBSITE = 0x01
HTML_ENTITIES = { 'apos': "'" }


def noteuri(jenni, input):
    uri = input.group(1).encode('utf-8')
    if not hasattr(jenni, 'last_seen_uri'):
        jenni.last_seen_uri = {}
    jenni.last_seen_uri[input.sender] = uri
noteuri.rule = r'(?u).*(http[s]?://[^<> "\x01]+)[,.]?'
noteuri.priority = 'low'


def get_page_backup(url):
    req = urllib2.Request(url, headers={'Accept':'*/*'})
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
    u = urllib2.urlopen(req)
    contents = u.read()
    out = dict()
    try:
        con = (contents).decode('utf-8')
    except:
        con = (contents).decode('iso-8859-1')
    out['code'] = u.code
    out['read'] = con
    out['geturl'] = u.geturl()
    out['headers'] = u.headers.dict
    out['url'] = u.url
    return out['code'], out


def find_title(url):
    """
    This finds the title when provided with a string of a URL.
    """

    for item in IGNORE:
        if item in url:
            return False, 'ignored'

    if not re.search('^((https?)|(ftp))://', url):
        url = 'http://' + url

    if '/#!' in url:
        url = url.replace('/#!', '/?_escaped_fragment_=')

    if 'i.imgur' in url:
        a = url.split('.')
        url = a[0][:-1] + '.'.join(a[1:-1])

    if 'zerobin.net' in url:
        return True, 'ZeroBin'

    url = uc.decode(url)

    msg = str()
    k = 0
    status = False

    while not status:
        k += 1
        if k > 3:
            break

        msg = dict()

        try:
            status, msg = proxy.get_more(url)
        except:
            try:
                status, msg = get_page_backup(url)
            except:
                continue

        if type(msg) == type(dict()) and 'code' in msg:
            status = msg['code']
        else:
            continue

        time.sleep(0.5)


    if not status:
        return False, msg

    useful = msg

    info = useful['headers']
    page = useful['read']

    try:
        mtype = info['content-type']
    except:
        print 'failed mtype:', str(info)
        return False, 'mtype failed'
    if not (('/html' in mtype) or ('/xhtml' in mtype)):
        return False, str(mtype)

    content = page
    regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
    content = regex.sub(r'<\1title>', content)
    regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
    content = regex.sub('', content)
    start = content.find('<title>')
    if start == -1:
        return False, 'NO <title> found'
    end = content.find('</title>', start)
    if end == -1:
        return False, 'NO </title> found'
    content = content[start + 7:end]
    content = content.strip('\n').rstrip().lstrip()
    title = content

    if len(title) > 200:
        title = title[:200] + '[...]'

    def e(m):
        entity = m.group()
        if entity.startswith('&#x'):
            cp = int(entity[3:-1], 16)
            meep = unichr(cp)
        elif entity.startswith('&#'):
            cp = int(entity[2:-1])
            meep = unichr(cp)
        else:
            entity_stripped = entity[1:-1]
            try:
                char = name2codepoint[entity_stripped]
                meep = unichr(char)
            except:
                if entity_stripped in HTML_ENTITIES:
                    meep = HTML_ENTITIES[entity_stripped]
                else:
                    meep = str()
        try:
            return uc.decode(meep)
        except:
            return uc.decode(uc.encode(meep))

    title = r_entity.sub(e, title)

    title = title.replace('\n', '')
    title = title.replace('\r', '')

    def remove_spaces(x):
        if '  ' in x:
            x = x.replace('  ', ' ')
            return remove_spaces(x)
        else:
            return x

    title = remove_spaces(title)

    new_title = str()
    for char in title:
        unichar = uc.encode(char)
        if len(list(uc.encode(char))) <= 3:
            new_title += uc.encode(char)
    title = new_title

    title = re.sub(r'(?i)dcc\ssend', '', title)

    title += '\x0F'

    if title:
        return True, title
    else:
        return False, 'No Title'

def is_bitly(txt):
    bitly_domains = ['//j.mp', '//bit.ly', '//bitly.com']
    for each in bitly_domains:
        if each in txt:
            return True
    return False


def short(text):
    """
    This function creates a bitly url for each url in the provided string.
    The return type is a list.
    """

    if not bitly_loaded:
        return list()
    if not text:
        return list()
    bitlys = list()
    try:
        a = re.findall(url_finder, text)
        k = len(a)
        i = 0
        while i < k:
            b = uc.decode(a[i][0])
            ## make sure that it is not already a bitly shortened link
            if not is_bitly(b):
                longer = urllib2.quote(b)
                url = 'https://api-ssl.bitly.com/v3/shorten?login=%s' % (bitly_user)
                url += '&apiKey=%s&longUrl=%s&format=txt' % (bitly_api_key,
                                                             longer)
                #shorter = proxy.get(url)
                shorter = web.get(url)
                shorter.strip()
                bitlys.append([b, shorter])
            else:
                bitlys.append([b, str()])
            i += 1
            return bitlys
    except:
        return
    return bitlys


def generateBitLy(jenni, input):
    url = input.group(2)
    if not url:
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            url = jenni.last_seen_uri[input.sender]
        else:
            return jenni.say('No URL provided')

    bitly = short(url)
    for b in bitly:
        displayBitLy(jenni, b[0], b[1])
generateBitLy.commands = ['bitly']
generateBitLy.priority = 'high'


def displayBitLy(jenni, url, shorten):
    if url is None or shorten is None:
        return
    u = getTLD(url)
    shorten = shorten.replace('http:', 'https:')
    jenni.say('%s  -  %s' % (u, shorten))


def remove_nonprint(text):
    new = str()
    for char in text:
        x = ord(char)
        if x > 32 and x <= 126:
            new += char
    return new


def getTLD(url):
    url = url.strip()
    url = remove_nonprint(url)
    idx = 7
    if url.startswith('https://'):
        idx = 8
    elif url.startswith('ftp://'):
        idx = 6
    u = url[idx:]
    f = u.find('/')
    if f == -1:
        u = url
    else:
        u = url[0:idx] + u[0:f]
    return remove_nonprint(u)


def doUseBitLy(title, url):
    BTL = None
    if title:
        BTL = BITLY_TRIGGER_LEN_TITLE
    else:
        BTL = BITLY_TRIGGER_LEN_NOTITLE
    return bitly_loaded and BTL is not None and len(url) > BTL


def get_results(text, manual=False):
    if not text:
        return False, list()
    a = re.findall(url_finder, text)
    k = len(a)
    i = 0
    display = list()
    passs = False
    channel = str()
    if hasattr(text, 'sender'):
        channel = text.sender
    while i < k:
        url = uc.encode(a[i][0])
        url = uc.decode(url)
        url = uc.iriToUri(url)
        url = remove_nonprint(url)
        domain = getTLD(url)
        if '//' in domain:
            domain = domain.split('//')[1]
        if 'i.imgur.com' in url and url.startswith('http://'):
            url = url.replace('http:', 'https:')

        bitly = url

        if not url.startswith(EXCLUSION_CHAR):
            passs, page_title = find_title(url)
            if not manual:
                if bitly_loaded:
                    if channel and channel not in simple_channels:
                        bitly = short(url)
                        if bitly:
                            bitly = bitly[0][1]
            display.append([page_title, url, bitly, passs])
        else:
            ## has exclusion character
            if manual:
                ## only process excluded URLs if .title is used
                url = url[1:]
                passs, page_title = find_title(url)
                display.append([page_title, url, bitly, passs])
        i += 1

    ## check to make sure at least 1 URL worked correctly
    overall_pass = False
    for x in display:
        if x[-1] == True:
            overall_pass = True

    return overall_pass, display


def show_title_auto(jenni, input):
    '''No command - Automatically displays titles for URLs'''
    for each in BLOCKED_MODULES:
        if input.startswith('.%s ' % (each)):
            ## Don't want it to show duplicate titles
            return
    if len(re.findall('\([\d]+\sfiles\sin\s[\d]+\sdirs\)', input)) == 1:
        ## Directory Listing of files
        return

    try:
        status, results = get_results(input)
    except Exception, e:
        print '[%s]' % e, input
        return

    k = 1

    output_shorts = str()
    results_len = len(results)

    for r in results:
        ## loop through link, shorten pairs, and titles
        returned_title = r[0]
        orig = r[1]
        bitly_link = r[2]
        link_pass = r[3]


        if orig and bitly_link and bitly_link != orig and ('bit.ly' in bitly_link or 'j.mp' in bitly_link):
            ## if we get back useful data
            ## and we have a bitly link (bitly worked!)
            ## and the shortened link is 'valid'
            ## let's make it 'https' instead of 'http'
            bitly_link = bitly_link.replace('http:', 'https:')

        if returned_title == 'imgur: the simple image sharer':
            ## because of the i.imgur hack above this is done
            ## to prevent from showing useless titles on image
            ## files
            return

        if k > 3:
            ## more than 3 titles to show from one line of text?
            ## let's just show only the first 3.
            break
        k += 1

        ## deteremine if we should display the bitly link
        useBitLy = doUseBitLy(returned_title, orig)

        reg_format = '[ %s ] - %s'
        special_format = '[ %s ]'
        response = str()

        if status and link_pass:
            if useBitLy and input.sender not in simple_channels and bitly_link:
                response = reg_format % (uc.decode(returned_title), bitly_link)
            else:
                if input.sender in simple_channels:
                    response = special_format % (returned_title)
                else:
                    response = reg_format % (returned_title, getTLD(orig))
        elif len(orig) > BITLY_TRIGGER_LEN_NOTITLE:
            if useBitLy and bitly_link != orig:
                #response = '%s' % (bitly_link)
                output_shorts += bitly_link + ' '
            else:
                ## Fail silently, link can't be bitly'ed and no title was found
                pass

        if response:
            jenni.say(response)

    if output_shorts:
        jenni.say((output_shorts).strip())
show_title_auto.rule = '(?iu).*(%s?(http|https)(://\S+)).*' % (EXCLUSION_CHAR)
show_title_auto.priority = 'high'


def show_title_demand(jenni, input):
    '''.title http://google.com/ -- forcibly show titles for a given URL'''
    uri = input.group(2)

    if uri and 'http' not in uri:
        uri = 'http://' + uri

    if not uri:
        channel = input.sender
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.last_seen_uri = dict()
        if channel in jenni.last_seen_uri:
            uri = jenni.last_seen_uri[channel]
        else:
            return jenni.say('No recent links seen in this channel.')

    status, results = get_results(uri, True)

    for r in results:
        returned_title = r[0]
        orig = r[1]
        bitly_link = r[2]
        link_pass = r[3]

        if returned_title is None:
            jenni.say('No title returned.')
            continue

        if status and link_pass:
            response = '[ %s ]' % (returned_title)
        else:
            response = '(%s)' % (returned_title)
        jenni.say(response)
show_title_demand.commands = ['title']
show_title_demand.priority = 'high'


def collect_links(jenni, input):
    link = input.groups()
    channel = input.sender
    link = link[0]
    if not hasattr(jenni, 'last_seen_uri'):
        jenni.last_seen_uri = dict()
    jenni.last_seen_uri[channel] = link
collect_links.rule = '(?iu).*(%s?(http|https)(://\S+)).*' % (EXCLUSION_CHAR)
collect_links.priority = 'low'

re_meta = re.compile('(?i)content="\S+;\s*?url=(\S+)"\s*?>')


def unbitly(jenni, input):
    '''.longurl <link> -- obtain the final destination URL from a short URL'''
    url = input.group(2)
    if not url:
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            url = jenni.last_seen_uri[input.sender]
        else:
            return jenni.say('No URL provided')
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url

    status, useful = proxy.get_more(url)
    try:
        new_url = re_meta.findall(useful['read'])
    except:
        return jenni.say(str(useful))

    if new_url:
        new_url = new_url[0]
    else:
        url = url.replace("'", r"\'")
        try:
            status, results = proxy.get_more(url)
            new_url = results['geturl']
        except:
            return jenni.say('Failed to grab URL: %s' % (url))

    if new_url.startswith(('http://', 'https://')):
        jenni.say(new_url)
    else:
        jenni.say('Failed to obtain final destination.')
unbitly.commands = ['unbitly', 'untiny', 'longurl', 'st', 'short']
unbitly.priority = 'low'
unbitly.example = '.unbitly http://git.io/6fY4OQ'


def puny(jenni, input):
    '''.puny -- convert to xn-- code for URLs'''
    text = input.group(2)
    if not text:
        return jenni.say('No input provided.')

    if text.startswith('xn--'):
        text = text[4:]
        text_ascii = (text).encode('utf-8')
        try:
            text_unpuny = (text_ascii).decode('punycode')
        except:
            return jenni.say('Stop being a twat.')
        output = (text_unpuny).encode('utf-8')
        output = (output).decode('utf-8')
    else:
        text = (text).encode('utf-8')
        text_utf = (text).decode('utf-8')

        text_puny = (text_utf).encode('punycode')

        output = 'xn--' + text_puny

    return jenni.say(output)
puny.commands = ['puny', 'idn', 'idna']


if __name__ == '__main__':
    print __doc__.strip()