Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf -*-
- import code, urllib, urllib2, os, sys, re, cookielib, time, traceback
- import glob
- try:
- from bazaocen import baza
- except:
- baza=[]
- from HTMLParser import HTMLParser
- from htmlentitydefs import name2codepoint
- from email.header import make_header
- from subprocess import *
# Switch on cookielib's module-level debug flag.
cookielib.debug = True
# strptime/strftime patterns used by parse2(): `librfmt` parses the date part
# of a grade's 'Data' field, `rfc822fmt` formats the exported 'Date:' header.
#librfmt='%F'
librfmt='%Y-%m-%d'
rfc822fmt='%a, %e %b %Y %T %z'
class MyCookiePolicy(cookielib.DefaultCookiePolicy):
    """Cookie policy whose ``set_ok`` accepts every cookie."""

    def set_ok(self, cookie, request):
        """Report ``cookie`` as acceptable regardless of ``request``."""
        return True
class RedirHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that keeps cookies and the query string across redirects."""

    def redirect_request(self, oldreq, fp, code, msg, hdrs, newurl):
        """Build, publish and return the request that follows a redirect.

        Cookies carried by ``fp`` are stored in the module-level ``jar``.  If
        the module-level ``url`` has a query string, it is appended to
        ``newurl``.  The new request is stored in the module-level ``req``,
        gets its Cookie header from ``jar`` and is returned.
        """
        global req
        jar.extract_cookies(fp, oldreq)
        query_start = url.find('?')
        if query_start != -1:
            newurl += url[query_start:]
        # NOTE(review): ``hdrs`` is handed over as the new request's headers;
        # confirm that reusing these headers is intended.
        req = urllib2.Request(newurl, None, hdrs)
        jar.add_cookie_header(req)
        return req
# File that the cookie jar is loaded from and saved to by main().
cookiefn='kukis.txt'
jar=cookielib.MozillaCookieJar(cookiefn,True)#,MyCookiePolicy)
# Opener used by update(); redirects are routed through RedirHandler.
direct=urllib2.build_opener(RedirHandler)
host = "synergia.librus.pl"
# Base URL that every page path is appended to.  update() strips
# '/loguj/przenies' from `url` once a DZIENNIKSID cookie is being sent.
site = 'https://%s/loguj/przenies'%host
# NOTE(review): credentials are hard-coded in the source; consider loading
# them from outside the file.
username = '1234567u'
password = '#jestembogiem'
# URL requested by the next update() call; get_details() and update_graph()
# reassign it.
url = site + '/przegladaj_oceny/uczen'
# Request headers sent by update(); 'Referer' is overwritten with the current
# `url` on every call.
headers = {
    "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Host" : host,
    "Referer" : url,
    "Connection" : "keep-alive",
    "Cache-Control" : "max-age=0",
    "Accept-Language" : "pl,en-us;q=0.7,en;q=0.3",
    "Accept-Encoding" : "gzip, deflate",
    "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    #"Accept-Charset" : "utf-8,ISO-8859-2;q=0.8,*;q=0.7"
}
# Form fields that update() url-encodes and POSTs.
data = [
    ("login",username),
    ("passwd",password),
    ("czy_js",1)
]
# Path of the file update() writes the response to and the parse*() functions
# read from; get_details() and update_graph() swap it temporarily.
fn='result.html'
def safemakeheaderu8(t, quoted=''):
    """Return ``t`` encoded for use as an e-mail header value.

    ``t`` is first encoded with the 'ascii' charset and wrapped in ``quoted``
    on both sides.  If that raises a Unicode error, ``t`` is encoded with the
    'utf-8' charset instead and returned without the ``quoted`` wrapper.
    """
    try:
        return quoted + make_header([(t, 'ascii')]).encode() + quoted
    except UnicodeError:
        # Only encoding/decoding failures should trigger the UTF-8 fallback;
        # a bare ``except`` would also swallow KeyboardInterrupt and real bugs.
        return make_header([(t, 'utf-8')]).encode()
def dodal(entry):
    """Split the 'Dodał' field of ``entry`` into ``(last_word, other_words)``.

    Everything from the first '(' onwards is discarded and the remainder is
    stripped.  The last space-separated word comes first in the result, the
    words before it (re-joined with spaces) second.  A missing field falls
    back to 'NIEDODAŁ2'.
    """
    added_by = entry.get(u'Dodał', 'NIEDODAŁ2')
    full_name = added_by.split('(')[0].strip()
    leading, _sep, last_word = full_name.rpartition(' ')
    return last_word, leading
def glue(L):
    """Concatenate the strings in ``L`` with a newline between neighbours.

    No newline is inserted where the previous item already ends with one or
    the next item starts with one.  A single trailing newline is removed from
    the result.  An item that cannot be concatenated because of a
    UnicodeDecodeError is replaced by a trimmed ``repr`` of itself or, failing
    that, by a fixed placeholder.
    """
    joined = ''
    # Pretend a newline precedes the first item so nothing is inserted before it.
    last = '\n'
    for piece in L:
        if not (last.endswith('\n') or piece.startswith('\n')):
            joined += '\n'
        try:
            joined += piece
        except UnicodeDecodeError as first_error:  # why?! all items are str
            try:
                joined += repr(piece.replace(u'\xa0', ' '))[2:-1]
            except UnicodeDecodeError:
                print("UnicodeDecodeError: %s" % (first_error,))
                joined += "<really stupid unicode text>"
        last = piece
    if joined.endswith('\n'):
        return joined[:-1]
    return joined
def attrfmt(x):
    """Format one ``(name, value)`` attribute pair as `` name="value"``.

    A ``None`` value is rendered as an empty string.  ``&`` and ``"`` in the
    value are escaped as ``&amp;`` / ``&quot;`` so the result stays a valid
    double-quoted attribute, and non-ASCII characters become numeric
    character references.
    """
    name, value = x
    if value is None:
        value = ''
    # '&' must be escaped first, otherwise the '&' of '&quot;' would be
    # escaped a second time.
    value = value.replace('&', '&amp;').replace('"', '&quot;')
    value = value.encode('ascii', 'xmlcharrefreplace')
    if not isinstance(value, str):
        # encode() may hand back a bytes object distinct from ``str``;
        # convert it to text so that formatting embeds the characters.
        value = value.decode('ascii')
    return str(' %s="%s"' % (name, value))
def wrap_HTML(x):
    """Wrap the table markup ``x`` in a complete HTML document.

    Before insertion, CRLF pairs in ``x`` are turned into spaces, runs of
    spaces are collapsed to a single space and the space between adjacent
    tags ('> <') is removed.
    """
    # The chained replace(' ',' ') calls this supersedes were no-ops; a regex
    # collapses runs of spaces of any length in one pass.
    compact = re.sub(' {2,}', ' ', x.replace('\r\n', ' ')).replace('> <', '><')
    return '''
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "http://www.w3.org/TR/REC-html40/strict.dtd">
<html><head><meta name="qrichtext" content="1" /><style type="text/css">
p, li { white-space: pre-wrap; }
table { border: 3px outset; }
th, td { border: 3px inset; }
th { text-align: right; }
</style></head><body><table border="3">
%s
</table></body></html>
''' % (compact,)
def update():
    """Request the module-level ``url`` and save the response body to ``fn``.

    A POST carrying the login form (``data``) is prepared first.  If the
    cookie jar already contributes a 'DZIENNIKSID' cookie, '/loguj/przenies'
    is removed from ``url`` and a plain GET is sent instead.  Sent and
    received headers are printed.  When the response is marked as gzip, the
    file is written with a '.gz' suffix and unpacked with the external
    ``gunzip`` program.  Modifies the module-level ``req``, ``url`` and
    (temporarily) ``fn``.
    """
    global req,fn,url
    headers['Referer'] = url
    postdata = urllib.urlencode(dict(data))
    req = urllib2.Request(url, postdata, dict(headers))
    jar.add_cookie_header(req)
    h = req.get_header('Cookie')
    if h and h.find('DZIENNIKSID') != -1:
        # A DZIENNIKSID cookie is already being sent: skip the login POST.
        url = url.replace('/loguj/przenies','')
        req = urllib2.Request(url, None, dict(headers))
        jar.add_cookie_header(req)
    print ('Sent headers are: {')
    for it in req.header_items():
        print ('%20r: %r'%it)
    print ('}')
    response = direct.open(req)
    jar.extract_cookies(response,req)
    print ('url is %s'%response.geturl())
    print ('Received headers are: {')
    for it in response.info().items():
        print ('%20r: %r'%it)
    print ('}')
    # NOTE(review): if both headers say 'gzip' the suffix is appended twice,
    # but only one '.gz' is stripped again below -- confirm this cannot happen.
    if response.info().get("transfer-encoding")=='gzip':
        fn+='.gz'
    if response.info().get("content-encoding")=='gzip':
        fn+='.gz'
    with open(fn, 'w+b') as fp:
        # Copy the body to disk in 1 KiB chunks.
        buff=response.read(1024)
        while buff!='':
            fp.write(buff)
            buff=response.read(1024)
    response.close()
    if fn.endswith('.gz'):
        Popen(["gunzip","-f",fn]).wait()
        # Point fn back at the unpacked file name.
        fn=fn[:-3]
    print ('done')
def get_details(href):
    """Download (unless already on disk) and parse the details page ``href``.

    The page is stored as 'oceny_szczegoly/<last path component>.html'.
    While this runs, the module-level ``fn`` and ``url`` point at that page;
    ``fn`` is restored afterwards unless parsing raised, in which case the
    offending file name is printed and the exception propagates.  Returns
    whatever parse2() returns.
    """
    global fn, url
    saved_fn = fn
    fn = 'oceny_szczegoly/%s.html' % (href.rsplit('/', 1)[-1])
    url = site + href
    if not os.path.isfile(fn):
        update()
    try:
        details = parse2()
    except:
        print("plik problemu: %s" % (fn,))
        raise
    fn = saved_fn
    return details
- #def get_msg(href):
- #global fn,url
- #oldfn,fn=fn,'msg_szczegoly/%s.html'%(href.split('/')[-1])
- #url = site + href
- #if not os.path.isfile(fn):
- #update()
- #ret=parse2()
- #fn=oldfn
- #return ret
class MyHTMLNode(object):
    """Base class for the nodes of the HTML tree built by MyHTMLParse."""

    # Rendered inner content longer than this is put on lines of its own.
    MX_CONTENT_LEN=16
    # Shared by all nodes: nodes whose rendering failed (see MyHTMLElement._HTML).
    problem=[]

    def __init__(self):
        self.parentNode = None
        self.childNodes = []

    def appendChild(self, elem, idx=None):
        """Attach ``elem`` as a child, at position ``idx`` or at the end.

        A MyHTMLData node arriving right after another MyHTMLData child is
        merged into that child instead of being attached.  After attaching,
        ``elem.event()`` is called if ``elem`` has such a method.
        """
        children = self.childNodes
        if (isinstance(elem, MyHTMLData) and children
                and isinstance(children[-1], MyHTMLData)):
            children[-1].dane += elem.dane
            return
        if idx is not None:
            children.insert(idx, elem)
        else:
            children.append(elem)
        elem.parentNode = self
        if hasattr(elem, 'event'):
            elem.event()

    def _HTML(self):
        """Render the node as markup; subclasses provide the implementation."""
        raise NotImplementedError

    def __repr__(self):
        is_element = isinstance(self, MyHTMLElement)
        name_part = self.nodeName + ' ' if is_element else ""
        attrs_part = ", whose attrs=%r" % dict(self.attrs) if is_element and self.attrs else ""
        children_part = " with childNodes" if self.childNodes else ''
        return "<%s%s at 0x%x%s%s>" % (
            name_part, type(self).__name__, id(self), attrs_part, children_part)

    @property
    def HTML(self):
        """The node rendered as markup."""
        return self._HTML()
class MyHTMLElement(MyHTMLNode):
    """Element node: a tag name, its attribute pairs and its children."""

    # Shared by all elements: every 'table' element created since the last
    # 'document' element was created.
    tables=[]

    def __init__(self, nodeName, attrs):
        MyHTMLNode.__init__(self)
        self.nodeName = nodeName
        self.attrs = attrs
        if nodeName == 'document':
            # A new document starts: forget the tables of the previous one.
            self.tables[:] = []
        elif nodeName == 'table':
            self.tables.append(self)
            # Until a real <tbody> child shows up, the table acts as its own.
            self.tbody = self
        elif nodeName == 'a':
            href = dict(attrs).get('href', '')
            if href.startswith('/przegladaj_oceny/szczegoly/') and href not in oceny_ids:
                oceny_ids.append(href)

    def event(self):
        """Hook that appendChild() calls once this element has a parent.

        thead/tbody/tfoot elements register themselves as an attribute of
        their parent.  An <a> whose onclick opens '/zapytaj_nauczyciela/<id>'
        inserts a 'nauczyciel' element holding that id as the first child of
        the document root.
        """
        name = self.nodeName
        if name in ('thead', 'tbody', 'tfoot'):
            setattr(self.parentNode, name, self)
            return
        if name != 'a':
            return
        prefix = "otworz_w_nowym_oknie('/zapytaj_nauczyciela/"
        suffix = "', 'ask',700,600)"
        onclick = dict(self.attrs).get('onclick', '')
        if not (onclick.startswith(prefix) and onclick.endswith(suffix)):
            return
        root = self.parentNode
        while root.nodeName != 'document':
            root = root.parentNode
        teacher = MyHTMLElement('nauczyciel', [])
        teacher.appendChild(MyHTMLData(onclick[len(prefix):-len(suffix)]))
        root.appendChild(teacher, 0)

    def _innerHTML(self, indent=0):
        """Render all children and glue the pieces together."""
        rendered = [child._HTML(indent+1) for child in self.childNodes]
        return glue(rendered).replace('\n','\n'+' ') #*indent)

    innerHTML = property(_innerHTML)

    def _HTML(self, indent=0):
        """Render the element, its attributes and its children as markup.

        link/meta/img elements without content are rendered self-closing.
        The first node whose rendering fails is reported and remembered in
        ``problem``; the exception is always re-raised.
        """
        try:
            inner = self._innerHTML(indent)
            if len(inner) > self.MX_CONTENT_LEN:
                inner = '\n %s\n' % inner
            attr_text = ''.join(map(attrfmt, self.attrs))
            if not inner and self.nodeName in ("link", "meta", "img"):
                return '<%s%s />' % (self.nodeName, attr_text)
            return '<%s%s>%s</%s>' % (self.nodeName, attr_text, inner, self.nodeName)
        except:
            if not self.problem:
                print ("stupid ascii detected in %r"%self)
                self.problem.append(self)
            raise
class MyHTMLData(MyHTMLNode):
    """Text node; the text is kept in ``dane``."""

    @property
    def nodeName(self):
        """The class name doubles as the node name."""
        return type(self).__name__

    def __init__(self, dane):
        MyHTMLNode.__init__(self)
        # Copy item by item so that one item that cannot be appended is
        # replaced by '~~' instead of failing the whole text.
        text = ''
        for ch in dane:
            try:
                text += ch
            except:
                text += '~~'
        self.dane = text

    def _HTML(self, indent=0):
        """Return the text with a space added after every newline."""
        return self.dane.replace('\n','\n'+' ') #*indent)
class MyHTMLComment(MyHTMLData):
    """Comment node, rendered as ``<!--...-->``."""

    def _HTML(self, indent=0):
        body = self.dane.replace('\n','\n'+' ') #*indent)
        return '<!--%s-->' % body
class MyHTMLDecl(MyHTMLData):
    """Declaration node, rendered as ``<!...>``."""

    def _HTML(self, indent=0):
        body = self.dane.replace('\n','\n'+' ') #*indent)
        return '<!%s>' % body
class MyHTMLPI(MyHTMLData):
    """Processing-instruction node, rendered as ``<?...>``."""

    def _HTML(self, indent=0):
        body = self.dane.replace('\n','\n'+' ') #*indent)
        return '<?%s>' % body
class MyHTMLParse(HTMLParser):
    """HTMLParser that builds a MyHTMLElement tree from the fed markup.

    With ``typ == 0`` it additionally writes a CSV-like dump to ``self.fp2``
    and fills the module-level ``oceny`` list for everything found below the
    tag path described by ``good1`` / ``good2``.  ``fp2`` is not created
    here; it has to be assigned before feeding (parse() does so).
    """
    # Expected stack of (tag, attrs) down to a <td> in a 'line0' table row.
    good1=[
        ('html', {}), ('body', {'onload': 'ImageSwaper.GetInstance().Load(); '}), ('div', {'id': 'page'}), ('div', {'id': 'body'}), ('form', {'action': '/przegladaj_oceny/uczen', 'method': 'POST', 'name': 'PrzegladajOceny'}),
        ('div', {'class': 'container'}), ('div', {'class': 'container-background'}), ('table', {'class': 'decorated stretch'}), ('tr', {'class': 'line0'}), ('td', {})
    ]
    # Same path, but through a 'line1' row.
    good2=good1[:]
    good2[-2]=('tr',{'class':'line1'})
    # Tags that are closed implicitly as soon as anything else follows them.
    autoclose=set(["img","link","meta","br","hr"])
    def __init__(self, typ=0, *args, **arg2):
        self.typ=typ
        HTMLParser.__init__(self,*args,**arg2)
        # Currently open tags as (tag, attrs-dict) pairs.
        self.stack=[]
        self.tree=self.curr_elem=MyHTMLElement("document",[])
        # Row class ('line0'/'line1') of the row most recently written out.
        self.lastx="line1"
        # True when a tag was opened or closed since the last text was handled.
        self.happened=False
    def handle_starttag(self, tag, attrs):
        if self.curr_elem.nodeName in self.autoclose: self.handle_endtag(self.curr_elem.nodeName)
        elem=MyHTMLElement(tag,attrs)
        self.curr_elem.appendChild(elem)
        self.curr_elem=elem
        self.happened=True
        # Was the parser already inside a matching cell before this tag opened?
        flag=(self.typ==0 and self.isok())
        self.stack.append((tag,dict(attrs)))
        if self.typ==0 and self.isok():
            if flag:
                # A tag nested inside a cell: dump it with its attributes.
                self.fp2.write('","<%s%s>'%
                    (tag,''.join(map(lambda x:' %s="%s"'%x,attrs)))
                )
            else:
                # This tag is the cell itself: start a new list for it.
                if oceny:
                    #oceny[-1][-1].pop()
                    oceny[-1]+=([],)
                self.fp2.write('","###')
    def handle_endtag(self, tag):
        self.happened=True
        flag=self.isok()
        if tag in map(lambda x:x[0],self.stack):
            # Pop up to and including the matching open tag; tags left open
            # in between are closed implicitly.
            self.curr_elem=self.curr_elem.parentNode
            x=self.stack.pop()[0]
            while x!=tag:
                #print ("Auto-closing unclosed tag of %s"%x)
                self.curr_elem=self.curr_elem.parentNode
                x=self.stack.pop()[0]
        else:
            pass#print ("Closed unopen tag of %s"%tag)
        if self.typ==0 and flag:
            if self.isok():
                self.fp2.write('","</%s>'%tag)
            else:
                # The closed tag took the parser out of the matching cell.
                self.fp2.write('"\n"%%%')
    def handle_entityref(self, name):
        # Unknown entity names map to code point 0.
        c = unichr(name2codepoint.get(name,0))
        if c == '\0':
            print "&%s; -> &#x%x;"%(name,ord(c))
        self.handle_data(c,True,name)
    def handle_charref(self, name):
        if name.startswith('x'):
            c = unichr(int(name[1:], 16))
        else:
            c = unichr(int(name))
        #print "&#%s; -> &#x%x;"%(name,ord(c))
        self.handle_data(c,True,"#"+name)
    def handle_data(self, dane, special=False, ent=None):
        # `special` is True when called for an entity or character reference;
        # `ent` then carries the reference name.
        if self.curr_elem.nodeName in self.autoclose: self.handle_endtag(self.curr_elem.nodeName)
        if self.curr_elem.nodeName!="script":
            dane=dane.replace('\n',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ')
        # `True or ...` makes the else branch unreachable.
        if True or not special:
            danne=MyHTMLData(dane)
        else:
            danne=MyHTMLData("&%s;"%ent)
        self.curr_elem.appendChild(danne)
        if self.typ==0 and self.isok():
            if self.happened:
                # First text after a tag boundary: open a new row or a new field.
                if self.lastx!=self.stack[len(self.good1)-2][1]['class']:
                    # The row class changed, so a new row starts here.
                    if oceny:
                        oceny[-1]=oceny[-1][:1]+oceny[-1][2:-1]
                    oceny.append((dane,[]))
                    self.fp2.write('"\n"')
                    self.lastx=self.stack[len(self.good1)-2][1]['class']
                else:
                    self.fp2.write('","')
                oceny[-1][-1].append('')
                self.happened=False
            if special:
                if ord(dane)==0xa0:
                    oceny[-1][-1][-1]+=' '
                    self.fp2.write(' ')
                else:
                    oceny[-1][-1][-1]+='&#x%x;'%ord(dane)
                    self.fp2.write('&#x%x;'%ord(dane))
            else:
                oceny[-1][-1][-1]+=dane
                self.fp2.write(dane)
    def handle_decl(self, dane):
        decl=MyHTMLDecl(dane)
        if self.curr_elem.nodeName in self.autoclose: self.handle_endtag(self.curr_elem.nodeName)
        self.curr_elem.appendChild(decl)
    def handle_pi(self, dane):
        pi=MyHTMLPI(dane)
        if self.curr_elem.nodeName in self.autoclose: self.handle_endtag(self.curr_elem.nodeName)
        self.curr_elem.appendChild(pi)
    def handle_comment(self, comm):
        comment=MyHTMLComment(comm)
        if self.curr_elem.nodeName in self.autoclose: self.handle_endtag(self.curr_elem.nodeName)
        self.curr_elem.appendChild(comment)
    def isok(self):
        # True when the open-tag stack starts with the whole good1 path
        # (good2 entries are accepted as an alternative), i.e. the parser is
        # at or below a matching <td>.
        for i in range(len(self.stack)):
            if i>=len(self.good1):
                return True
            if self.stack[i]==self.good1[i]:
                pass
            elif self.stack[i][0]!=self.good1[i][0]:
                return False
            elif self.stack[i]!=self.good2[i]:
                return False
        else:
            # Every open tag matched; accept only if the path is complete.
            if len(self.stack)==len(self.good1):
                return True
        return False
def parse():
    """Parse the overview page stored in ``fn``.

    Resets and refills the module-level ``oceny`` and ``oceny_ids``, writes
    the parser's CSV-like dump next to ``fn`` (extension '.csv') and a
    re-rendered copy of the parsed tree to ``fn + '.html'``.
    """
    global oceny, oceny_ids
    oceny=[]
    oceny_ids=[]
    parser=MyHTMLParse()
    with open(fn,'rt') as fp, open(os.path.splitext(fn)[0]+".csv",'wb') as parser.fp2:
        parser.fp2.write('"Wyciąg z Librusa')
        buff=fp.read(1024)
        while buff!='':
            buff=buff.replace('> <','><').replace('>\r\n<','><')
            # Never cut a multi-byte UTF-8 sequence in half: extend the chunk
            # until it ends on an ASCII byte.
            while ord(buff[-1])>127:
                extra=fp.read(1)
                if extra=='':
                    # End of file: stop instead of looping forever.
                    break
                buff+=extra
            parser.feed(buff.decode('utf'))
            buff=fp.read(1024)
        parser.fp2.write('"\n')
    #oceny=dict(oceny)
    #oceny: <p class="... ..." ...><a title="WAZNE..." class="..." ...>6+</a></p>
    with open(fn+".html",'wb') as fp:
        x=parser.tree.innerHTML
        # Normalise line endings to CRLF and expand leading tabs.
        x=x.replace('\r\n','\n').replace('\r','\n').replace('\n','\r\n').replace('\n\t\t','\n'+' '*16).replace('\n\t','\n'+' '*8)
        # Drop lines that hold nothing but up to 39 spaces.
        for i in range(40):
            for j in range(2):
                x=x.replace('\n'+' '*i+'\r\n','\n')
        fp.write(x.encode('utf'))
def parse3():
    """Extract the chart data array from the page stored in ``fn``.

    The text between the first '[' and the first ']' of the second top-level
    node is executed as a Python expression and the resulting value returned.
    """
    # var chartDataGradeAverangeGraphDiv = [
    parser=MyHTMLParse(1)
    with open(fn,'rt') as fp:
        buff=fp.read(1024)
        while buff!='':
            buff=buff.replace('> <','><').replace('>\r\n<','><')
            # Never cut a multi-byte UTF-8 sequence in half: extend the chunk
            # until it ends on an ASCII byte.
            while ord(buff[-1])>127:
                extra=fp.read(1)
                if extra=='':
                    # End of file: stop instead of looping forever.
                    break
                buff+=extra
            parser.feed(buff.decode('utf'))
            buff=fp.read(1024)
    tree=parser.tree
    txt=tree.childNodes[1].innerHTML
    txt='field='+txt[txt.find('['):txt.find(']')+1]
    # The bare identifiers used inside the array are mapped to short strings.
    d={'columnGradeAverangeGraphDiv':'d','x0':'u','x1':'c','__builtins__':None}
    # SECURITY(review): this executes text taken from a downloaded page.
    # Blanking __builtins__ is not a real sandbox; consider a proper parser.
    exec(txt, d)
    return d['field'] #[{'d':x['d'],'u':d['u'],'c':d['c']} for x in d['field']]
def parse2():
    """Parse the grade details page stored in ``fn`` into a dict.

    The rows of the last table on the page become key/value pairs (first
    cell -> second cell).  Defaults are filled in for 'Ocena', 'Przedmiot',
    'Dodał' and 'Data'; 'nauczyciel_id' and 'ocena_id' are added.  Unless an
    export for this grade already exists under 'oceny_export/new' or
    'oceny_export/cur', the grade is also written to 'oceny_export/new' as an
    e-mail-style message.

    Returns the dict, or ``NotImplemented`` when the page has no table or no
    'nauczyciel' node was found.
    """
    parser=MyHTMLParse(1)
    with open(fn,'rt') as fp:
        buff=fp.read(1024)
        while buff!='':
            buff=buff.replace('> <','><').replace('>\r\n<','><')
            # Never cut a multi-byte UTF-8 sequence in half: extend the chunk
            # until it ends on an ASCII byte.
            while ord(buff[-1])>127:
                extra=fp.read(1)
                if extra=='':
                    # End of file: stop instead of looping forever.
                    break
                buff+=extra
            parser.feed(buff.decode('utf'))
            buff=fp.read(1024)
    tree=parser.tree
    if len(tree.tables)<1:
        print ("#err in %s"%fn)
        return NotImplemented
    tree = tree.tables[-1].tbody
    outree={}
    for i in tree.childNodes:
        if i.nodeName=='tr':
            tdtd=thth=''
            for x in i.childNodes:
                # The first cell of a row is the key even when it is a <td>.
                if x.nodeName=='td' and not thth:
                    x.nodeName='th'
                if x.nodeName=='th':
                    thth=x.innerHTML.strip()
                elif x.nodeName=='td':
                    tdtd=x.innerHTML.replace('\n ','\n').replace('\n','').replace('\r','').strip()
            outree[thth]=tdtd
    outree.setdefault('Ocena','BR')
    outree.setdefault('Przedmiot','BRAK')
    outree.setdefault(u'Dodał','BRAK')
    outree.setdefault('Data','1999-11-11')
    if len(outree.get('Ocena',''))>2:
        # A long "grade" is really text: move it into the comment.
        outree['Komentarz']=outree.setdefault('Komentarz','')+outree['Ocena']
        outree['Ocena']='T'
    # MyHTMLElement.event() inserts the 'nauczyciel' node as first child of
    # the document when it sees the matching link.
    if parser.tree.childNodes[0].nodeName!='nauczyciel':
        print ("#err brak zapytaj_nauczyciela in %s"%fn)
        return NotImplemented
    outree['nauczyciel_id']=int(parser.tree.childNodes[0].innerHTML)
    myfn=fn[:fn.rfind('.')]
    outree['ocena_id']=int(myfn.split('/')[-1])
    try:
        if glob.glob(myfn.replace('oceny_szczegoly','oceny_export/new')+'*') or glob.glob(myfn.replace('oceny_szczegoly','oceny_export/cur')+'*'):
            # Already exported earlier.
            pass
        else:
            with open(myfn.replace('oceny_szczegoly','oceny_export/new'),'w') as fp82:
                fp82.write('Subject: %s'%safemakeheaderu8('%s: %s'%(outree['Przedmiot'],outree['Ocena'])))
                fp82.write('''
Content-Type: text/html; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Date: %s
'''%(time.strftime(rfc822fmt, time.strptime(outree['Data'][:outree['Data'].find(' ')],librfmt))))
                # FIXME: nauczyciel_id = id(nauczyciel); id(dodał)=?
                head='%s <%s@%s>'%(safemakeheaderu8(outree[u'Dodał'],'"'),outree['nauczyciel_id'],host)
                fp82.write('From: %s'%head)
                fp82.write('''
MIME-Version: 1.0
''')
                x=wrap_HTML(tree.innerHTML.encode('utf')).encode('quoted-printable')
                print (x)
                fp82.write(x)
    except KeyError as e:
        print ('Smutne strasznie ;( %s %s %s'%(e,myfn,outree))
        # Do not leave a half-written export behind.
        os.remove(myfn.replace('oceny_szczegoly','oceny_export/new'))
    #oceny[outree['Przedmiot']][0].append(outree)
    return outree
def update_graph():
    """Download the average-grade chart page and print what parse3() extracts.

    The page is saved as 'graph.html'; the module-level ``fn`` is pointed at
    it for the duration of the call and restored afterwards.  ``url`` is left
    pointing at the chart page.
    """
    global fn, url
    #href='/uczen/graph_ajax.php?type=wykres_sredniej&classId=37253&userId=918577'
    href = '/uczen/graph_ajax.php?type=wykres_sredniej&classId=50832&userId=1261841'
    saved_fn = fn
    fn = 'graph.html'
    url = site + href
    update()
    print (parse3())
    fn = saved_fn
def main():
    """Run one complete scrape.

    Loads the saved cookies, downloads the overview page unless it is already
    on disk, parses it, fetches and parses every grade details page found,
    writes the collected grades as SQL to 'db/gradonly_auto.sql', refreshes
    the chart data, dumps ``oceny`` next to ``fn`` (extension '.json') and
    saves the cookies.
    """
    global url
    jar.set_cookie(cookielib.Cookie(0, 'TestCookie', '1',
        None, False,
        host, True, False,
        '/', False,
        False,
        None,
        False,
        None,
        None,
        {}))
    if os.path.isfile(cookiefn):
        jar.load(ignore_discard=True)
    if not os.path.isfile(fn):
        update()
    parse()
    for an_id in oceny_ids:
        if int(an_id[an_id.rfind('/')+1:])==0: continue
        x=get_details(an_id)
        # parse2() signals an unusable page with NotImplemented; storing it
        # would break the SQL export below.
        if x is NotImplemented:
            continue
        if x not in baza:
            baza.append(x)
    #with open('bazaocen.py','w') as fp99:
        #fp99.write('baza=%r\n'%baza)

    def sql_text(value):
        # Double single quotes so the value is safe inside a '...' SQL literal.
        return value.replace("'","''")

    with open('db/gradonly_auto.sql','w') as fp137:
        # NOTE(review): the statement is assembled by string formatting from
        # scraped data; the date and the unquoted numeric fields are inserted
        # verbatim.
        # The key is looked up as unicode first: the parsed keys are unicode,
        # which a non-ASCII byte-string key never matches.
        mapa=[(entry['ocena_id'],sql_text(entry['Ocena']),sql_text(entry['Kategoria']),entry['Data'][:10],
            entry['nauczyciel_id'],sql_text(entry['Przedmiot']),
            '1' if (entry.get(u'Licz do średniej') or entry.get('Licz do średniej')) else '0',
            entry.get('Waga','1'),sql_text(dodal(entry)[0]),sql_text(dodal(entry)[1]),
            sql_text(entry.get('Komentarz',''))) for entry in baza if entry is not NotImplemented]
        fp137.write((u'''INSERT OR REPLACE INTO oceny (
ocena_id, ocena, kategoria, data_dodania, nauczyciel, przedmiot, licz_do_sr, waga, dodal, komentarz
) VALUES (''' + u'),('.join([u'''
%s, '%s', '%s', '%s', %s, (SELECT przedmiot_id FROM przedmioty WHERE nazwa='%s'), %s, %s, (SELECT uzytkownik_id FROM uzytkownicy WHERE imie='%s' AND nazwisko='%s'), '%s'
'''%row for row in mapa])+
''');
''').encode('utf'))
    update_graph()
    with open(os.path.splitext(fn)[0]+".json",'wb') as fp:
        fp.write(repr(oceny).replace(',',',\n')
            .replace('[','[\n ').replace(']','\n]')
            .replace("'",'"').replace("\n ","\n ")
            .replace('],\n ','],\n ').replace(']',' ]')
            .replace('{','{\n ').replace('}','\n}')
        )
    jar.save(ignore_discard=True)
if __name__=="__main__":
    # Work from the script's own directory so the relative file names used
    # throughout the script resolve there.
    x=os.path.dirname(sys.argv[0])
    if x: os.chdir(x)
    interact = False
    try:
        main()
    except:
        # Report the failure, then fall through to an interactive console.
        traceback.print_exc()
        interact = True
    if interact:
        # Run the user's PYTHONSTARTUP file, if any, before opening a console
        # on this module's globals.
        filename = os.environ.get('PYTHONSTARTUP')
        if filename and os.path.isfile(filename):
            with open(filename) as fobj:
                startup_file = fobj.read()
            exec(startup_file)
        code.interact(local=globals())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement