Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Script to grab titles from webpages
- # Updated version by teel @ IRCnet
- #
- # https://github.com/teeli/urltitle
- #
- # Detects URL from IRC channels and prints out the title
- #
- # Version Log:
- # 0.10 Fixed XPath parsing error and added regex fallback if XPath fails
- # 0.09 HTTPs redirects, case-insensitive HTTP header fix, other small bug fixes
- # 0.08 Changed putserv to puthelp to queue the messages
- # 0.07 Added Content-Type check (text/html only) and exceptino handling for tDom with a fallback to
- # regexp if tDom fails.
- # 0.06 Added XPATH support to title parsing (only if tdom package is available)
- # 0.05 Added SNI support for TLS (with TLS version check)
- # 0.04 HTML parsing for titles added
- # 0.03c HTTPS support is now optional and will be automatically dropeed if TCL TSL package does not exist
- # 0.03b Some formatting
- # 0.03 HTTPS support
- # 0.02 Updated version by teel. Added support for redirects, trimmed titles (remove extra whitespaces),
- # some optimization
- # 0.01a Original version by rosc
- #
- ################################################################################################################
- #
- # Original script:
- # Copyright C.Leonhardt (rosc2112 at yahoo com) Aug.11.2007
- # http://members.dandy.net/~fbn/urltitle.tcl.txt
- # Loosely based on the tinyurl script by Jer and other bits and pieces of my own..
- #
- ################################################################################################################
- #
- # Usage:
- #
- # 1) Set the configs below
- # 2) .chanset #channelname +urltitle ;# enable script
- # 3) .chanset #channelname +logurltitle ;# enable logging
- # Then just input a url in channel and the script will retrieve the title from the corresponding page.
- #
- ################################################################################################################
- namespace eval UrlTitle {
- # CONFIG
- variable ignore "bdkqr|dkqr" ;# User flags script will ignore input from
- variable length 5 ;# minimum url length to trigger channel eggdrop use
- variable delay 1 ;# minimum seconds to wait before another eggdrop use
- variable timeout 5000 ;# geturl timeout (1/1000ths of a second)
- variable fetchLimit 5 ;# How many times to process redirects before erroring
- # BINDS
- bind pubm "-|-" {*://*} UrlTitle::handler
- setudef flag urltitle ;# Channel flag to enable script.
- setudef flag logurltitle ;# Channel flag to enable logging of script.
- # INTERNAL
- variable last 1 ;# Internal variable, stores time of last eggdrop use, don't change..
- variable scriptVersion 0.10
- # PACKAGES
- package require http ;# You need the http package..
- variable httpsSupport false
- variable htmlSupport false
- variable tdomSupport false
- if {![catch {variable tlsVersion [package require tls]}]} {
- set httpsSupport true
- if {[package vcompare $tlsVersion 1.6.4] < 0} {
- putlog "UrlTitle: TCL TLS version 1.6.4 or newer is required for proper https support (SNI)"
- }
- }
- if {![catch {package require htmlparse}]} {
- set htmlSupport true
- }
- if {![catch {package require tdom}]} {
- set tdomSupport true
- }
- # Enable SNI support for TLS if suitable TLS version is installed
- proc socket {args} {
- variable tlsVersion
- set opts [lrange $args 0 end-2]
- set host [lindex $args end-1]
- set port [lindex $args end]
- if {[package vcompare $tlsVersion 1.7.11] >= 0} {
- # tls version 1.7.11 should support autoservername
- ::tls::socket -autoservername true {*}$opts $host $port
- } elseif {[package vcompare $tlsVersion 1.6.4] >= 0} {
- ::tls::socket -ssl3 false -ssl2 false -tls1 true -servername $host {*}$opts $host $port
- } else {
- # default fallback without servername (SNI certs will not work)
- ::tls::socket -ssl3 false -ssl2 false -tls1 true {*}$opts $host $port
- }
- }
- proc handler {nick host user chan text} {
- variable httpsSupport
- variable htmlSupport
- variable delay
- variable last
- variable ignore
- variable length
- set unixtime [clock seconds]
- if {[channel get $chan urltitle] && ($unixtime - $delay) > $last && (![matchattr $user $ignore])} {
- foreach word [split $text] {
- if {[string length $word] >= $length && [regexp {^(f|ht)tp(s|)://} $word] && \
- ![regexp {://([^/:]*:([^/]*@|\d+(/|$))|.*/\.)} $word]} {
- set last $unixtime
- # enable https if supported
- if {$httpsSupport} {
- ::http::register https 443 [list UrlTitle::socket]
- }
- set urtitle [UrlTitle::parse $word]
- if {$htmlSupport} {
- set urtitle [::htmlparse::mapEscapes $urtitle]
- }
- # unregister https if supported
- if {$httpsSupport} {
- ::http::unregister https
- }
- if {$urtitle eq ""} {
- break
- }
- if {[string length $urtitle]} {
- set html_mapping {
- ‘ '
- ’ '
- ’ '
- – '
- ' '
- " "
- 
 " "
- ‚ ‚
- “ “
- ” ”
- „ „
- † †
- ‡ ‡
- ‰ ‰
- ‹ ‹
- › ›
- ♠ ♠
- ♣ ♣
- ♥ ♥
- ♦ ♦
- ‾ ‾
- ← ←
- ← ←
- ↑ ↑
- ↑ ↑
- → →
- → →
- ↓ ↓
- ↓ ↓
- ↖ ↖
- ↖ ↖
- ↗ ↗
- ↗ ↗
- ↙ ↙
- ↙ ↙
- ↘ ↘
- ↘ ↘
- ▲ ▲
- ▲ ▲
- ▴ ▴
- ▴ ▴
- ▶ ▶
- ▶ ▶
- ▸ ▸
- ▸ ▸
- ► ►
- ► ►
- ▼ ▼
- ▼ ▼
- ▾ ▾
- ▾ ▾
- ◀ ◀
- ◀ ◀
- ◂ ◂
- ◂ ◂
- ◄ ◄
- ◄ ◄
- ™ ™
- ' '
- ™ ™
- � -
- � -
- ! !
- ! !
- " {"}
- " {"}
- " {"}
- # {#}
- # {#}
- $ $
- $ $
- % %
- % %
- & &
- & &
- & &
- ' '
- ' '
- ( (
- ( (
- ) )
- ) )
- * *
- * *
- + +
- + +
- , ,
- , ,
- - -
- - -
- . .
- . .
- / /
- / /
- ⁄ /
- 0 -
- 0 -
- : :
- : :
- ; ;
- ; ;
- < <
- < <
- < <
- = =
- = =
- > >
- > >
- > >
- ? ?
- ? ?
- @ @
- @ @
- A -
- A -
- [ [
- [ [
- \ \
- \ \
- ] ]
- ] ]
- ^ ^
- ^ ^
- _ _
- _ _
- ` `
- ` `
- a -
- a -
- { {
- | |
- } }
- ~ ~
- … …
- … …
- – –
- – –
- — —
- — —
- ˜ -
- Ÿ &
-  
- ¡ ¡
- ¡ ¡
- ¢ ¢
- ¢ ¢
- £ £
- £ £
- ¤ ¤
- ¤ ¤
- ¥ ¥
- ¥ ¥
- ¦ ¦
- ¦ ¦
- &brkbar; ¦
- § §
- § §
- ¨ ¨
- ¨ ¨
- ¨ ¨
- © ©
- © ©
- ª ª
- ª ª
- « «
- « «
- ¬ ¬
- ¬ ¬
- ® ®
- ® ®
- ¯ ¯
- ¯ ¯
- &hibar; ¯
- ° °
- ° °
- ± ±
- ± ±
- ² ²
- ² ²
- ³ ³
- ³ ³
- ´ ´
- ´ ´
- µ µ
- µ µ
- ¶ ¶
- ¶ ¶
- · ·
- · ·
- ¸ ¸
- ¸ ¸
- ¹ ¹
- ¹ ¹
- º º
- º º
- » »
- » »
- ¼ ¼
- ¼ ¼
- ½ ½
- ½ ½
- ¾ ¾
- ¾ ¾
- ¿ ¿
- ¿ ¿
- À À
- À À
- Á Á
- Á Á
- Â Â
- Â Â
- Ã Ã
- Ã Ã
- Ä Ä
- Ä Ä
- Å Å
- Å Å
- Æ Æ
- Æ Æ
- Ç Ç
- Ç Ç
- È È
- È È
- É É
- É É
- Ê Ê
- Ê Ê
- Ë Ë
- Ë Ë
- Ì Ì
- Ì Ì
- Í Í
- Í Í
- Î Î
- Î Î
- Ï Ï
- Ï Ï
- Ð Ð
- Ð Ð
- Ñ Ñ
- Ñ Ñ
- Ò Ò
- Ò Ò
- Ó Ó
- Ó Ó
- Ô Ô
- Ô Ô
- Õ Õ
- Õ Õ
- Ö Ö
- Ö Ö
- × ×
- × ×
- Ø Ø
- Ø Ø
- Ù Ù
- Ù Ù
- Ú Ú
- Ú Ú
- Û Û
- Û Û
- Ü Ü
- Ü Ü
- Ý Ý
- Ý Ý
- Þ Þ
- Þ Þ
- ß ß
- ß ß
- à à
- à à
- á á
- á á
- â â
- â â
- ã ã
- ã ã
- ä ä
- ä ä
- å å
- å å
- æ æ
- æ æ
- ç ç
- ç ç
- è è
- è è
- é é
- é é
- ê ê
- ê ê
- ë ë
- ë ë
- ì ì
- ì ì
- í í
- í í
- î î
- î î
- ï ï
- ï ï
- ð ð
- ð ð
- ñ ñ
- ñ ñ
- ò ò
- ò ò
- ó ó
- ó ó
- ô ô
- ô ô
- õ õ
- õ õ
- ö ö
- ö ö
- ÷ ÷
- ÷ ÷
- ø ø
- ø ø
- ù ù
- ù ù
- ú ú
- ú ú
- û û
- û û
- ü ü
- ü ü
- ý ý
- ý ý
- þ þ
- þ þ
- ÿ ÿ
- ÿ ÿ
- Α Α
- α α
- Β Β
- β β
- Γ Γ
- γ γ
- Δ Δ
- δ δ
- Ε Ε
- ε ε
- Ζ Ζ
- ζ ζ
- Η Η
- η η
- Θ Θ
- θ θ
- Ι Ι
- ι ι
- Κ Κ
- κ κ
- Λ Λ
- λ λ
- Μ Μ
- μ μ
- Ν Ν
- ν ν
- Ξ Ξ
- ξ ξ
- Ο Ο
- ο ο
- Π Π
- π π
- Ρ Ρ
- ρ ρ
- Σ Σ
- σ σ
- Τ Τ
- τ τ
- Υ Υ
- υ υ
- Φ Φ
- φ φ
- Χ Χ
- χ χ
- Ψ Ψ
- ψ ψ
- Ω Ω
- ω ω
- ● ●
- • •
- ∞ ∞
- ∞ ∞
- }
- set tit [string map $html_mapping $urtitle]
- puthelp "PRIVMSG $chan :\002$tit"
- }
- break
- }
- }
- }
- # change to return 0 if you want the pubm trigger logged additionally..
- return 1
- }
- # General HTTP redirect handler
- proc Fetch {url args} {
- variable fetchLimit
- for {set count 0} {$count < $fetchLimit} {incr count} {
- set token [::http::geturl $url {*}$args]
- if {[::http::status $token] ne "ok" || ![string match 3?? [::http::ncode $token]]} {
- break
- }
- set meta [::http::meta $token]
- if {[dict exists $meta Location]} {
- set url [dict get $meta Location]
- }
- if {[dict exists $meta location]} {
- set url [dict get $meta location]
- }
- ::http::cleanup $token
- }
- return $token
- }
- proc parseTitleXPath {data} {
- set title ""
- if {[catch {set doc [dom parse -html -simple $data]} results]} {
- # fallback to regex parsing if tdom fails
- set title [parseTitleRegex $data]
- } else {
- # parse dom
- set root [$doc documentElement]
- set node [$root selectNodes {//head/title/text()}]
- if {$node != ""} {
- # return title if XPath was able to parse it
- set title [$node data]
- } else {
- # Fallback to regex if XPath failed
- set title [parseTitleRegex $data]
- }
- }
- }
- proc parseTitleRegex {data} {
- set title ""
- # fallback to regex parsing if tdom fails
- regexp -nocase {<title.*>(.*?)</title>} $data match title
- set title [regsub -all -nocase {\s+} $title " "]
- return $title
- }
- proc parse {url} {
- variable timeout
- variable tdomSupport
- set title ""
- if {[info exists url] && [string length $url]} {
- if {[catch {set http [Fetch $url -timeout $timeout]} results]} {
- putlog "Connection to $url failed"
- putlog "Error: $results"
- } else {
- if { [::http::status $http] == "ok" } {
- set data [::http::data $http]
- set status [::http::code $http]
- set meta [::http::meta $http]
- # only parse html files for titles
- if {
- ([dict exists $meta Content-Type] && [string first "text/html" [dict get $meta Content-Type]] >= 0) ||
- ([dict exists $meta content-type] && [string first "text/html" [dict get $meta content-type]] >= 0)
- } {
- switch -regexp -- $status {
- "HTTP.*200.*" {
- if {$tdomSupport} {
- # use XPATH if tdom is supported
- set title [parseTitleXPath $data]
- } else {
- # fallback to regex parsing if tdom is not enabled
- set title [parseTitleRegex $data]
- }
- }
- "HTTP\/[0-1]\.[0-1].3.*" {
- if {[dict exists $meta Location]} {
- set title [UrlTitle::parse [dict get $meta Location]]
- }
- if {[dict exists $meta location]} {
- set title [UrlTitle::parse [dict get $meta location]]
- }
- }
- }
- }
- } else {
- putlog "Connection to $url failed"
- }
- ::http::cleanup $http
- }
- }
- return $title
- }
- putlog "Initialized Url Title Grabber v$scriptVersion"
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement