# 8chan image scraper

import requests
from bs4 import BeautifulSoup
import urllib
import sys
import re
import time
from yattag import Doc, indent
import xml.etree.ElementTree
import os
import urllib.request as urllib2
import random


while True:
    # Every pass of this loop fetches and parses the board index page.
    # The board address is hard-coded here (reading it from 'input.txt'
    # was left disabled).
    board_url = 'https://www.8ch.net/v'
    print(board_url)

    # The request is sent with a desktop-Chrome User-Agent string.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36'
    request_headers = {'User-Agent': user_agent}
    page_request = urllib2.Request(board_url, headers=request_headers)
    page_bytes = urllib2.urlopen(page_request).read()

    # Parse the raw page; the <title> text is later used as the board name.
    soup = BeautifulSoup(page_bytes, "html.parser")
    title = soup.html.head.title.text

    #removes non-valid characters
    def clean(text):
        return re.sub(r'[^\x00-\x7F]',' ', text)

    #used in isolating the images for the OP
    def OPimgs(index):
        tmparr=[]
        for images in soup.findAll(('div'),{"class":"thread"})[index].findAll('div',{"class":"files"})[0].findAll('div',{"class":"file"}):
            for answers in images.findAll('a'):
                if (str)(answers.text):
                    tmparr.append(clean((str)(answers.text)))
        return tmparr

    #filling OP posts into array for indexing
    OPnums=[]
    for threads in soup.findAll(('div'),{"class":"thread"}):
        op=True
        for posts in threads.findAll(('div'),{"class":"post"}):
            if op==True:
                op=False
                OPnums.append(posts.findAll('a',{"class":"post_no"})[1].text)


    #setting up xml
    doc, tag, text = Doc().tagtext()
    i=True
    j=0
    with tag('board', name=title):
        for num in range(0,len(OPnums)):
            i=True
            with tag('thread', name=OPnums[num]):
                for threads in soup.findAll(('div'),{"class":"thread"})[num].findAll(('div'),{"class":"post"}):
                    if i==True:
                        i=False
                        with tag('Post',name=OPnums[num]):
                            for image in OPimgs(j):
                                with tag('Image'):
                                    text(image)
                        j+=1
                    else:
                        for number in threads.findAll('a',{"class":"post_no"})[1]:
                            with tag('Post', name=(str)(number)):
                                for images in threads.findAll('div',{"class":"file"}):
                                    for pclass in images.findAll('p',{"class":"fileinfo"}):
                                        with tag('Image'):
                                            text(pclass.find('a').text)

    result = indent(
        doc.getvalue(),
        indentation = ' '*4,
        newline = '\r\n'
    )

    print(result)

    #write xml to file
    file = open('myfile.xml', 'w+')
    file.write(result)
    file.close()

    time.sleep(5)

    #reading myfile(the xml tree just built)
    e = xml.etree.ElementTree.parse('myfile.xml').getroot()
    postsActive=[]
    postsSaved=[]

    #returns active posts in an array
    for atype in e.findall('thread'):
        for find in atype.findall('Post'):
            postsActive.append((int)(find.get('name')))

    #return posts we've already saved
    file = open('saved.txt', 'r')
    postsSaved = file.readlines()
    postsSaved = list(map(int, postsSaved))

    #converting to a set for filtering
    s1=set(postsSaved)
    s2=set(postsActive)

    #remove posts found in saved.csv array
    for x in s1:
        for y in s2:
            if x == y:
                postsActive.remove(x)

    print(postsActive,  ' is what\'s left')
    print('Hello?')

    #downloading function
    def download(link, newDir):
        url='https://media.8ch.net/' + 'v' + '/src/' + link
        print(url)
        headers='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36'
        req = urllib2.Request(url, headers={ 'User-Agent': headers })
        html = urllib2.urlopen(req).read()
        with open(os.path.join(newDir, link),'wb') as output:
            output.write(html)
        time.sleep(2)

    for threads in e.findall('thread'):
        for posts in threads.findall('Post'):
            for result in postsActive:
                if posts.get('name')==(str)(result):
                    for images in posts.findall('Image'):
                        print(images.text, '; Post Number: ', result)
                        print('OP Post Number: ', threads.get('name'))
                        if not os.path.exists('./' + threads.get('name')):
                            os.makedirs('./' + threads.get('name'))
                        download(images.text, './' + threads.get('name'))

    #saving csv
    outputa=postsActive + postsSaved
    outputa = list(map(str, outputa))
    newfile = open('saved.txt', 'w+')
    print(outputa, ' is our output\n\n\n')
    newfile.write("\n".join(outputa))
    newfile.close


    print(outputa)
    print('Finished, waiting now')
    time.sleep(random.randint(60, 160))
    print('Finished, going to next')