Advertisement
Guest User

url.py

a guest
Mar 27th, 2016
150
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.78 KB | None | 0 0
  1. #!/usr/bin/env python
  2. """
  3. url.py - jenni Bitly Module
  4. Copyright 2015, Sujeet Akula (sujeet@freeboson.org)
  5. Copyright 2010-2013, Michael Yanovich (yanovich.net)
  6. Copyright 2010-2013, Kenneth Sham
  7. Licensed under the Eiffel Forum License 2.
  8.  
  9. More info:
  10. * jenni: https://github.com/myano/jenni/
  11. * Phenny: http://inamidst.com/phenny/
  12.  
  13. This module will record all URLs to bitly via an api key and account.
  14. It also automatically displays the "title" of any URL pasted into the channel.
  15. """
  16.  
  17. import json
  18. import re
  19. from htmlentitydefs import name2codepoint
  20. from modules import unicode as uc
  21. from modules import proxy
  22. import time
  23. import urllib2
  24. import web
  25.  
  26.  
  27. # Place a file in your ~/jenni/ folder named, bitly.txt
  28. # and inside this file place your API key followed by a ','
  29. # and then your username. For example, the only line in that
  30. # file should look like this:
  31. # R_d67798xkjc87sdx6x8c7kjc87,myusername
  32.  
  33. # this variable is to determine when to use bitly. If the URL is more
  34. # than this length, it'll display a bitly URL instead. To disable bit.ly,
  35. # put None even if it's set to None, triggering .bitly command will still work!
  36. BITLY_TRIGGER_LEN_TITLE = 20
  37. BITLY_TRIGGER_LEN_NOTITLE = 80
  38. EXCLUSION_CHAR = '!'
  39. IGNORE = ["http://morethan.tv/user.php"]
  40.  
  41. # do not edit below this line unless you know what you're doing
  42. bitly_loaded = False
  43. BLOCKED_MODULES = ['bitly', 'head', 'host', 'ip', 'isup', 'longurl', 'py',
  44. 'short', 'spotify', 'sp', 'st', 'tell', 'title', 'tw',
  45. 'twitter', 'unbitly', 'untiny',]
  46. simple_channels = list()
  47.  
  48. try:
  49. file = open('bitly.txt', 'r')
  50. key = file.read()
  51. key = key.split(',')
  52. bitly_api_key = str(key[0].strip())
  53. bitly_user = str(key[1].strip())
  54. file.close()
  55. bitly_loaded = True
  56. except:
  57. print 'WARNING: No bitly.txt found.'
  58.  
  59. try:
  60. f = open('simple_channels.txt', 'r')
  61. channels = f.read()
  62. channels = channels.split(',')
  63. for channel in channels:
  64. simple_channels.append(channel.strip())
  65. f.close()
  66. except:
  67. print 'WARNING: No simple_channels.txt found'
  68.  
  69. url_finder = re.compile(r'(?iu)(%s?(http|https|ftp)(://\S+\.?\S+/?\S+?))' %
  70. (EXCLUSION_CHAR))
  71. r_entity = re.compile(r'&[A-Za-z0-9#]+;')
  72. INVALID_WEBSITE = 0x01
  73. HTML_ENTITIES = { 'apos': "'" }
  74.  
  75.  
  76. def noteuri(jenni, input):
  77. uri = input.group(1).encode('utf-8')
  78. if not hasattr(jenni, 'last_seen_uri'):
  79. jenni.last_seen_uri = {}
  80. jenni.last_seen_uri[input.sender] = uri
  81. noteuri.rule = r'(?u).*(http[s]?://[^<> "\x01]+)[,.]?'
  82. noteuri.priority = 'low'
  83.  
  84.  
  85. def get_page_backup(url):
  86. req = urllib2.Request(url, headers={'Accept':'*/*'})
  87. req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
  88. u = urllib2.urlopen(req)
  89. contents = u.read()
  90. out = dict()
  91. try:
  92. con = (contents).decode('utf-8')
  93. except:
  94. con = (contents).decode('iso-8859-1')
  95. out['code'] = u.code
  96. out['read'] = con
  97. out['geturl'] = u.geturl()
  98. out['headers'] = u.headers.dict
  99. out['url'] = u.url
  100. return out['code'], out
  101.  
  102.  
  103. def find_title(url):
  104. """
  105. This finds the title when provided with a string of a URL.
  106. """
  107.  
  108. for item in IGNORE:
  109. if item in url:
  110. return False, 'ignored'
  111.  
  112. if not re.search('^((https?)|(ftp))://', url):
  113. url = 'http://' + url
  114.  
  115. if '/#!' in url:
  116. url = url.replace('/#!', '/?_escaped_fragment_=')
  117.  
  118. if 'i.imgur' in url:
  119. a = url.split('.')
  120. url = a[0][:-1] + '.'.join(a[1:-1])
  121.  
  122. if 'zerobin.net' in url:
  123. return True, 'ZeroBin'
  124.  
  125. url = uc.decode(url)
  126.  
  127. msg = str()
  128. k = 0
  129. status = False
  130.  
  131. while not status:
  132. k += 1
  133. if k > 3:
  134. break
  135.  
  136. msg = dict()
  137.  
  138. try:
  139. status, msg = proxy.get_more(url)
  140. except:
  141. try:
  142. status, msg = get_page_backup(url)
  143. except:
  144. continue
  145.  
  146. if type(msg) == type(dict()) and 'code' in msg:
  147. status = msg['code']
  148. else:
  149. continue
  150.  
  151. time.sleep(0.5)
  152.  
  153.  
  154. if not status:
  155. return False, msg
  156.  
  157. useful = msg
  158.  
  159. info = useful['headers']
  160. page = useful['read']
  161.  
  162. try:
  163. mtype = info['content-type']
  164. except:
  165. print 'failed mtype:', str(info)
  166. return False, 'mtype failed'
  167. if not (('/html' in mtype) or ('/xhtml' in mtype)):
  168. return False, str(mtype)
  169.  
  170. content = page
  171. regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
  172. content = regex.sub(r'<\1title>', content)
  173. regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
  174. content = regex.sub('', content)
  175. start = content.find('<title>')
  176. if start == -1:
  177. return False, 'NO <title> found'
  178. end = content.find('</title>', start)
  179. if end == -1:
  180. return False, 'NO </title> found'
  181. content = content[start + 7:end]
  182. content = content.strip('\n').rstrip().lstrip()
  183. title = content
  184.  
  185. if len(title) > 200:
  186. title = title[:200] + '[...]'
  187.  
  188. def e(m):
  189. entity = m.group()
  190. if entity.startswith('&#x'):
  191. cp = int(entity[3:-1], 16)
  192. meep = unichr(cp)
  193. elif entity.startswith('&#'):
  194. cp = int(entity[2:-1])
  195. meep = unichr(cp)
  196. else:
  197. entity_stripped = entity[1:-1]
  198. try:
  199. char = name2codepoint[entity_stripped]
  200. meep = unichr(char)
  201. except:
  202. if entity_stripped in HTML_ENTITIES:
  203. meep = HTML_ENTITIES[entity_stripped]
  204. else:
  205. meep = str()
  206. try:
  207. return uc.decode(meep)
  208. except:
  209. return uc.decode(uc.encode(meep))
  210.  
  211. title = r_entity.sub(e, title)
  212.  
  213. title = title.replace('\n', '')
  214. title = title.replace('\r', '')
  215.  
  216. def remove_spaces(x):
  217. if ' ' in x:
  218. x = x.replace(' ', ' ')
  219. return remove_spaces(x)
  220. else:
  221. return x
  222.  
  223. title = remove_spaces(title)
  224.  
  225. new_title = str()
  226. for char in title:
  227. unichar = uc.encode(char)
  228. if len(list(uc.encode(char))) <= 3:
  229. new_title += uc.encode(char)
  230. title = new_title
  231.  
  232. title = re.sub(r'(?i)dcc\ssend', '', title)
  233.  
  234. title += '\x0F'
  235.  
  236. if title:
  237. return True, title
  238. else:
  239. return False, 'No Title'
  240.  
  241. def is_bitly(txt):
  242. bitly_domains = ['//j.mp', '//bit.ly', '//bitly.com']
  243. for each in bitly_domains:
  244. if each in txt:
  245. return True
  246. return False
  247.  
  248.  
  249. def short(text):
  250. """
  251. This function creates a bitly url for each url in the provided string.
  252. The return type is a list.
  253. """
  254.  
  255. if not bitly_loaded:
  256. return list()
  257. if not text:
  258. return list()
  259. bitlys = list()
  260. try:
  261. a = re.findall(url_finder, text)
  262. k = len(a)
  263. i = 0
  264. while i < k:
  265. b = uc.decode(a[i][0])
  266. ## make sure that it is not already a bitly shortened link
  267. if not is_bitly(b):
  268. longer = urllib2.quote(b)
  269. url = 'https://api-ssl.bitly.com/v3/shorten?login=%s' % (bitly_user)
  270. url += '&apiKey=%s&longUrl=%s&format=txt' % (bitly_api_key,
  271. longer)
  272. #shorter = proxy.get(url)
  273. shorter = web.get(url)
  274. shorter.strip()
  275. bitlys.append([b, shorter])
  276. else:
  277. bitlys.append([b, str()])
  278. i += 1
  279. return bitlys
  280. except:
  281. return
  282. return bitlys
  283.  
  284.  
  285. def generateBitLy(jenni, input):
  286. url = input.group(2)
  287. if not url:
  288. if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
  289. url = jenni.last_seen_uri[input.sender]
  290. else:
  291. return jenni.say('No URL provided')
  292.  
  293. bitly = short(url)
  294. for b in bitly:
  295. displayBitLy(jenni, b[0], b[1])
  296. generateBitLy.commands = ['bitly']
  297. generateBitLy.priority = 'high'
  298.  
  299.  
  300. def displayBitLy(jenni, url, shorten):
  301. if url is None or shorten is None:
  302. return
  303. u = getTLD(url)
  304. shorten = shorten.replace('http:', 'https:')
  305. jenni.say('%s - %s' % (u, shorten))
  306.  
  307.  
  308. def remove_nonprint(text):
  309. new = str()
  310. for char in text:
  311. x = ord(char)
  312. if x > 32 and x <= 126:
  313. new += char
  314. return new
  315.  
  316.  
  317. def getTLD(url):
  318. url = url.strip()
  319. url = remove_nonprint(url)
  320. idx = 7
  321. if url.startswith('https://'):
  322. idx = 8
  323. elif url.startswith('ftp://'):
  324. idx = 6
  325. u = url[idx:]
  326. f = u.find('/')
  327. if f == -1:
  328. u = url
  329. else:
  330. u = url[0:idx] + u[0:f]
  331. return remove_nonprint(u)
  332.  
  333.  
  334. def doUseBitLy(title, url):
  335. BTL = None
  336. if title:
  337. BTL = BITLY_TRIGGER_LEN_TITLE
  338. else:
  339. BTL = BITLY_TRIGGER_LEN_NOTITLE
  340. return bitly_loaded and BTL is not None and len(url) > BTL
  341.  
  342.  
  343. def get_results(text, manual=False):
  344. if not text:
  345. return False, list()
  346. a = re.findall(url_finder, text)
  347. k = len(a)
  348. i = 0
  349. display = list()
  350. passs = False
  351. channel = str()
  352. if hasattr(text, 'sender'):
  353. channel = text.sender
  354. while i < k:
  355. url = uc.encode(a[i][0])
  356. url = uc.decode(url)
  357. url = uc.iriToUri(url)
  358. url = remove_nonprint(url)
  359. domain = getTLD(url)
  360. if '//' in domain:
  361. domain = domain.split('//')[1]
  362. if 'i.imgur.com' in url and url.startswith('http://'):
  363. url = url.replace('http:', 'https:')
  364.  
  365. bitly = url
  366.  
  367. if not url.startswith(EXCLUSION_CHAR):
  368. passs, page_title = find_title(url)
  369. if not manual:
  370. if bitly_loaded:
  371. if channel and channel not in simple_channels:
  372. bitly = short(url)
  373. if bitly:
  374. bitly = bitly[0][1]
  375. display.append([page_title, url, bitly, passs])
  376. else:
  377. ## has exclusion character
  378. if manual:
  379. ## only process excluded URLs if .title is used
  380. url = url[1:]
  381. passs, page_title = find_title(url)
  382. display.append([page_title, url, bitly, passs])
  383. i += 1
  384.  
  385. ## check to make sure at least 1 URL worked correctly
  386. overall_pass = False
  387. for x in display:
  388. if x[-1] == True:
  389. overall_pass = True
  390.  
  391. return overall_pass, display
  392.  
  393.  
  394. def show_title_auto(jenni, input):
  395. '''No command - Automatically displays titles for URLs'''
  396. for each in BLOCKED_MODULES:
  397. if input.startswith('.%s ' % (each)):
  398. ## Don't want it to show duplicate titles
  399. return
  400. if len(re.findall('\([\d]+\sfiles\sin\s[\d]+\sdirs\)', input)) == 1:
  401. ## Directory Listing of files
  402. return
  403.  
  404. try:
  405. status, results = get_results(input)
  406. except Exception, e:
  407. print '[%s]' % e, input
  408. return
  409.  
  410. k = 1
  411.  
  412. output_shorts = str()
  413. results_len = len(results)
  414.  
  415. for r in results:
  416. ## loop through link, shorten pairs, and titles
  417. returned_title = r[0]
  418. orig = r[1]
  419. bitly_link = r[2]
  420. link_pass = r[3]
  421.  
  422.  
  423. if orig and bitly_link and bitly_link != orig and ('bit.ly' in bitly_link or 'j.mp' in bitly_link):
  424. ## if we get back useful data
  425. ## and we have a bitly link (bitly worked!)
  426. ## and the shortened link is 'valid'
  427. ## let's make it 'https' instead of 'http'
  428. bitly_link = bitly_link.replace('http:', 'https:')
  429.  
  430. if returned_title == 'imgur: the simple image sharer':
  431. ## because of the i.imgur hack above this is done
  432. ## to prevent from showing useless titles on image
  433. ## files
  434. return
  435.  
  436. if k > 3:
  437. ## more than 3 titles to show from one line of text?
  438. ## let's just show only the first 3.
  439. break
  440. k += 1
  441.  
  442. ## deteremine if we should display the bitly link
  443. useBitLy = doUseBitLy(returned_title, orig)
  444.  
  445. reg_format = '[ %s ] - %s'
  446. special_format = '[ %s ]'
  447. response = str()
  448.  
  449. if status and link_pass:
  450. if useBitLy and input.sender not in simple_channels and bitly_link:
  451. response = reg_format % (uc.decode(returned_title), bitly_link)
  452. else:
  453. if input.sender in simple_channels:
  454. response = special_format % (returned_title)
  455. else:
  456. response = reg_format % (returned_title, getTLD(orig))
  457. elif len(orig) > BITLY_TRIGGER_LEN_NOTITLE:
  458. if useBitLy and bitly_link != orig:
  459. #response = '%s' % (bitly_link)
  460. output_shorts += bitly_link + ' '
  461. else:
  462. ## Fail silently, link can't be bitly'ed and no title was found
  463. pass
  464.  
  465. if response:
  466. jenni.say(response)
  467.  
  468. if output_shorts:
  469. jenni.say((output_shorts).strip())
  470. show_title_auto.rule = '(?iu).*(%s?(http|https)(://\S+)).*' % (EXCLUSION_CHAR)
  471. show_title_auto.priority = 'high'
  472.  
  473.  
  474. def show_title_demand(jenni, input):
  475. '''.title http://google.com/ -- forcibly show titles for a given URL'''
  476. uri = input.group(2)
  477.  
  478. if uri and 'http' not in uri:
  479. uri = 'http://' + uri
  480.  
  481. if not uri:
  482. channel = input.sender
  483. if not hasattr(jenni, 'last_seen_uri'):
  484. jenni.last_seen_uri = dict()
  485. if channel in jenni.last_seen_uri:
  486. uri = jenni.last_seen_uri[channel]
  487. else:
  488. return jenni.say('No recent links seen in this channel.')
  489.  
  490. status, results = get_results(uri, True)
  491.  
  492. for r in results:
  493. returned_title = r[0]
  494. orig = r[1]
  495. bitly_link = r[2]
  496. link_pass = r[3]
  497.  
  498. if returned_title is None:
  499. jenni.say('No title returned.')
  500. continue
  501.  
  502. if status and link_pass:
  503. response = '[ %s ]' % (returned_title)
  504. else:
  505. response = '(%s)' % (returned_title)
  506. jenni.say(response)
  507. show_title_demand.commands = ['title']
  508. show_title_demand.priority = 'high'
  509.  
  510.  
  511. def collect_links(jenni, input):
  512. link = input.groups()
  513. channel = input.sender
  514. link = link[0]
  515. if not hasattr(jenni, 'last_seen_uri'):
  516. jenni.last_seen_uri = dict()
  517. jenni.last_seen_uri[channel] = link
  518. collect_links.rule = '(?iu).*(%s?(http|https)(://\S+)).*' % (EXCLUSION_CHAR)
  519. collect_links.priority = 'low'
  520.  
  521. re_meta = re.compile('(?i)content="\S+;\s*?url=(\S+)"\s*?>')
  522.  
  523.  
  524. def unbitly(jenni, input):
  525. '''.longurl <link> -- obtain the final destination URL from a short URL'''
  526. url = input.group(2)
  527. if not url:
  528. if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
  529. url = jenni.last_seen_uri[input.sender]
  530. else:
  531. return jenni.say('No URL provided')
  532. if not url.startswith(('http://', 'https://')):
  533. url = 'http://' + url
  534.  
  535. status, useful = proxy.get_more(url)
  536. try:
  537. new_url = re_meta.findall(useful['read'])
  538. except:
  539. return jenni.say(str(useful))
  540.  
  541. if new_url:
  542. new_url = new_url[0]
  543. else:
  544. url = url.replace("'", r"\'")
  545. try:
  546. status, results = proxy.get_more(url)
  547. new_url = results['geturl']
  548. except:
  549. return jenni.say('Failed to grab URL: %s' % (url))
  550.  
  551. if new_url.startswith(('http://', 'https://')):
  552. jenni.say(new_url)
  553. else:
  554. jenni.say('Failed to obtain final destination.')
  555. unbitly.commands = ['unbitly', 'untiny', 'longurl', 'st', 'short']
  556. unbitly.priority = 'low'
  557. unbitly.example = '.unbitly http://git.io/6fY4OQ'
  558.  
  559.  
  560. def puny(jenni, input):
  561. '''.puny -- convert to xn-- code for URLs'''
  562. text = input.group(2)
  563. if not text:
  564. return jenni.say('No input provided.')
  565.  
  566. if text.startswith('xn--'):
  567. text = text[4:]
  568. text_ascii = (text).encode('utf-8')
  569. try:
  570. text_unpuny = (text_ascii).decode('punycode')
  571. except:
  572. return jenni.say('Stop being a twat.')
  573. output = (text_unpuny).encode('utf-8')
  574. output = (output).decode('utf-8')
  575. else:
  576. text = (text).encode('utf-8')
  577. text_utf = (text).decode('utf-8')
  578.  
  579. text_puny = (text_utf).encode('punycode')
  580.  
  581. output = 'xn--' + text_puny
  582.  
  583. return jenni.say(output)
  584. puny.commands = ['puny', 'idn', 'idna']
  585.  
  586.  
  587. if __name__ == '__main__':
  588. print __doc__.strip()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement