# ero analysis.py

#Pixiv search URL: http://www.pixiv.net/search.php?s_mode=s_tag&word=<search term, UTF-8>
#The list of tit tags is at:   http://dic.pixiv.net/a/%e6%9d%b1%e6%96%b9%e3%81%8a%e3%81%a3%e3%81%b1%e3%81%84%e3%82%bf%e3%82%b0%e4%b8%80%e8%a6%a7
#Result count is contained in: <span class="count-badge">500results</span>

import sys
import traceback
import re
import os
import cookielib
import urllib, urllib2, time
from datetime import date, timedelta

# Shared HTTP state: every request made through `opener` stores and re-sends
# its cookies via `jar`, so a login performed once is reused by later requests.
jar = cookielib.CookieJar()
_cookie_handler = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(_cookie_handler)

def load_characters():
    """Load the character table from 'characters.txt'.

    The file is read as Shift-JIS text with one character per line and three
    tab-separated columns: English name, Japanese tag, and "tits" tag
    (the big-tits tag, not the flat-chest one).

    Blank and whitespace-only lines are skipped.

    Returns:
        A dict keyed by English name. Each value is a dict with the keys
        'English', 'Japanese' and 'Tits', all holding unicode text.

    Raises:
        ValueError: if a non-blank line has fewer than three columns.
        IOError: if 'characters.txt' cannot be opened.
    """
    # Local import so this function brings its own dependency; io.open decodes
    # once at the file boundary instead of once per field.
    import io

    chars = {}

    # The context manager closes the file even if a malformed line raises.
    with io.open('characters.txt', 'r', encoding='sjis') as infile:
        for lineno, line in enumerate(infile, 1):
            stripped = line.strip()
            if not stripped:
                continue

            data = stripped.split(u'\t')
            if len(data) < 3:
                raise ValueError(
                    "characters.txt line %d: expected 3 tab-separated "
                    "columns, found %d" % (lineno, len(data)))

            eng = data[0].strip()
            jap = data[1].strip()
            tits = data[2].strip()     # Big tits. Not flat chest.

            # The first column (English name) is the lookup key.
            chars[eng] = {'English': eng, 'Japanese': jap, 'Tits': tits}

    return chars


def do_search(keyword, r18=False, recursion=0):
    """Return the number of Pixiv tag-search results for a keyword.

    The Touhou tag is always added to the search, so callers pass only the
    extra term(s).

    Args:
        keyword: Text to search for (unicode, or an ASCII byte string such
            as ''). It may already contain percent-escapes such as '%20' to
            separate several tags; existing '%' characters are left alone.
        r18: When true, the 'r-18' tag is added to the search.
        recursion: Number of login retries already made. Internal; callers
            normally leave it at 0.

    Returns:
        The result count shown in the page's "count-badge" span, as an int.

    Raises:
        Exception: if the page still asks for sign-up after repeated login
            attempts, or if the result count cannot be found in the page.
    """
    if recursion > 3:
        raise Exception("Could not authenticate.")

    touhou = u'\u6771\u65b9'

    # Percent-encode the UTF-8 bytes so the URL contains only ASCII.
    # '%' stays safe because callers pass pre-escaped separators ('%20').
    terms = [
        urllib.quote(keyword.encode('utf-8'), safe='%'),
        urllib.quote(touhou.encode('utf-8')),
    ]
    if r18:
        # Searching for the "r-18" tag is used instead of the '&r18=1' parameter.
        terms.append('r-18')

    url = 'http://www.pixiv.net/search.php?s_mode=s_tag&word=' + '%20'.join(terms)

    response = opener.open(urllib2.Request(url))
    try:
        text = response.read()
    finally:
        response.close()

    # Any of these fragments means we were served the logged-out page.
    login_markers = (
        '<a class="signup_button"',
        '<div class="mail-signup"',
        'register-introduction-modal',
    )
    if any(marker in text for marker in login_markers):
        # Quickest way to retry the search once logged in.
        do_login()
        return do_search(keyword, r18, recursion + 1)

    m = re.search('<span class="count-badge">([0-9]+)results</span>', text)
    if m is None:
        # Fail with a readable message instead of AttributeError on None.
        raise Exception("Could not find the result count in the page for: " + url)

    return int(m.group(1))

def do_login():
    """Prompt for Pixiv credentials and submit them through the shared opener.

    The username is read with raw_input and the password with getpass, so the
    password is not echoed to the terminal. The login form is POSTed with the
    module-level `opener`. This function does not check whether the login
    succeeded.

    Raises:
        Exception: if the username or the password is left empty.
    """
    # Local import: only needed for this interactive prompt.
    import getpass

    username = raw_input('Pixiv username: ').strip()
    password = getpass.getpass('Pixiv password: ').strip()

    if username == '' or password == '':
        raise Exception("Authentication aborted.")

    postdata = {'mode': 'login', 'pixiv_id': username, 'pass': password, 'skip': 1}

    # NOTE(review): the credentials are sent over plain HTTP; confirm whether
    # an HTTPS login endpoint can be used instead.
    response = opener.open('http://www.pixiv.net/login.php', urllib.urlencode(postdata))
    try:
        # The body itself is not needed; read it and release the connection.
        response.read()
    finally:
        response.close()

def _percent(part, whole):
    """Return part/whole as a percentage with two decimals, or "NaN" if whole is 0."""
    if whole == 0:
        return "NaN"
    return "%.2f" % (float(part) / float(whole) * 100)


def _write_result_row(out, name, tag, normal, ero, tits_tag, tits, erotits, futa):
    """Write one newline-terminated results.csv row.

    name, tag and tits_tag must already be encoded byte strings; the counts
    are ints. The last four columns are the derived percentages.
    """
    fields = [
        name, tag, str(normal), str(ero),
        tits_tag, str(tits), str(erotits),
        str(futa),
        _percent(ero, normal),      # Normal : Ero percentage
        _percent(tits, normal),     # Normal : Tits percentage
        _percent(erotits, tits),    # Tits : Erotits percentage
        _percent(futa, ero),        # Ero : Futa percentage
    ]
    out.write(",".join(fields) + "\n")


futastr = u'\u3075\u305f\u306a\u308a'
titsstr = u'\u4e73'     # Kanji for milk, which means tits. This one had more results than "Oppai"

outfile = open('results.csv', 'w')
try:
    outfile.write("Character,Japanese tag,results,ero,Tits tag,results,ero,futa,normal:lewd,normal:tits,tits:lewdtits,ero:futa\n")

    # One row per character.
    charas = load_characters()
    for cx in charas:
        c = charas[cx]

        normal = do_search(c['Japanese'])
        ero = do_search(c['Japanese'], True)
        tits = do_search(c['Tits'])
        erotits = do_search(c['Tits'], True)
        # R-18 is probably redundant, but whatever. If anything isn't R-18 it probably shouldn't be counted.
        futaresults = do_search(futastr + '%20' + c['Japanese'], True)

        # Every text field is encoded to Shift-JIS explicitly so the row can
        # be joined as byte strings (mixing unicode with encoded bytes would
        # trigger an implicit ASCII conversion).
        _write_result_row(
            outfile,
            c['English'].encode('sjis'), c['Japanese'].encode('sjis'), normal, ero,
            c['Tits'].encode('sjis'), tits, erotits,
            futaresults)

        print("%s is %s%% lewd." % (c['English'], _percent(ero, normal)))

    # Final row: totals across all Touhou images, for comparison.
    normal = do_search('')      #All touhou images
    ero = do_search('', True)   #All r-18 touhou images
    tits = do_search(titsstr)
    erotits = do_search(titsstr, True)  #Ero milk
    # R-18 is probably redundant, but whatever. If anything isn't R-18 it probably shouldn't be counted.
    futagirls = do_search(futastr, True)

    _write_result_row(
        outfile,
        'Baseline', 'Baseline', normal, ero,
        titsstr.encode('sjis'), tits, erotits,
        futagirls)
finally:
    # Close the CSV even if a search or the character file fails part-way.
    outfile.close()