# Untitled

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup, Comment
import requests, re, sys, os, json

# Path of the text file listing malware names, one per line (taken from argv).
maleware_name_file = ""
# Directory receiving the per-result .txt files and the combined JSON file.
info_dir = "info"
# Placeholder; rebound by the driver loop at the bottom of the file.
all_infos = []

if len(sys.argv) <= 1:
    print('Usage: [target_file]')
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(0)

# The first argument is the target file. Extra arguments are ignored instead
# of leaving the path empty and the output directory uncreated.
maleware_name_file = sys.argv[1]
os.makedirs(info_dir, exist_ok=True)

def wrtie2json(d, info, text):
    """Store a normalised copy of *text* in *d* under the lower-cased *info* key.

    The text is normalised as follows:

    * the phrase ``What to do now`` is turned into a bracketed heading on
      its own line,
    * runs of newlines are collapsed and surrounding whitespace is stripped,
    * every remaining newline (plus the two appended at the end) is replaced
      by the marker `` <br> ``.

    Args:
        d: Dictionary that receives the entry (mutated in place).
        info: Section name; used lower-cased as the dictionary key.
        text: Raw section text.
    """
    # The text is kept as a real ``str``. Re-decoding its UTF-8 bytes as
    # cp950 would turn non-ASCII characters (accents, curly quotes, ...) into
    # unrelated glyphs or drop them; json.dump can serialise any str as is.
    text = text.replace('What to do now', '\n[What to do now]')
    text = re.sub(r'\n+', '\n', text).strip() + '\n\n'
    d[info.lower()] = text.replace('\n', ' <br> ')

# Seconds to wait for a remote server before giving up on a request.
_REQUEST_TIMEOUT = 30


def _format_for_file(text):
    """Return *text* normalised for the per-result ``.txt`` file.

    Same clean-up as the JSON variant, except that newlines become ``<br>``
    without surrounding spaces.
    """
    text = text.replace('What to do now', '\n[What to do now]')
    text = re.sub(r'\n+', '\n', text).strip() + '\n\n'
    return text.replace('\n', '<br>')


def _iter_sections(page):
    """Yield ``(file heading, JSON key, node)`` for each known page section.

    ``node`` is the element holding the section text, or ``None`` when the
    section (or the ``div`` expected next to its heading) is absent.
    """
    # summary id:SummaryDrawerStub -- the text lives in the div after the button
    button = page.find("button", {"id": "SummaryDrawerStub"})
    yield ('[Summary]', 'summary',
           button.find_next_sibling("div") if button is not None else None)

    # tech-description class:tech-info-content
    yield ('[Technical description]', 'tech',
           page.find("div", {"class": "tech-info-content"}))

    # symptoms / what to do now -- the text lives in the div that follows the
    # parent of the matching <h2>
    for heading, key, title in (('[Symptoms]', 'symptoms', 'Symptoms'),
                                ('[What to do now]', 'what2do', 'What to do now')):
        h2 = page.find('h2', {'class': 'drawer-headings-d'}, text=title)
        yield (heading, key,
               h2.parent.find_next_sibling('div') if h2 is not None else None)


def crawl(maleware_name):
    """Search Google for *maleware_name* and scrape the Microsoft result pages.

    Every result link whose href contains ``microsoft`` is fetched. The
    sections found on the page (summary, technical description, symptoms,
    what to do now) are written to ``<info_dir>/<name>_<n>.txt`` and collected
    in a dictionary. Files smaller than 30 bytes are deleted afterwards.

    Args:
        maleware_name: Search term; also used as the output file name prefix.

    Returns:
        ``{'name': maleware_name, 'content': [section dict per page, ...]}``.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    from urllib.parse import urljoin

    base = "http://www.google.com"
    # Passing the query through ``params`` URL-encodes it, so names with
    # spaces, '&', '#', ... are searched for verbatim.
    response = requests.get(base + "/search", params={'q': maleware_name},
                            timeout=_REQUEST_TIMEOUT)
    results = BeautifulSoup(response.text, "html.parser")

    mal = {'name': maleware_name, 'content': []}
    written = []  # paths of the files created for this name, in order
    for item in results.select(".r a"):
        href = item.get('href')
        # FOR MICROSOFT (anchors without an href are skipped as well)
        if not href or "microsoft" not in href:
            continue

        d = {}
        # urljoin keeps relative hrefs on the Google host and leaves absolute
        # ones untouched.
        r = requests.get(urljoin(base, href), timeout=_REQUEST_TIMEOUT)
        page = BeautifulSoup(r.text, "html.parser")

        path = os.path.join(
            info_dir, maleware_name + '_' + str(len(written) + 1) + '.txt')
        # Explicit UTF-8 so non-ASCII page text is written unchanged,
        # independent of the platform's default encoding.
        with open(path, 'w', encoding='utf-8') as out:
            for heading, key, node in _iter_sections(page):
                if node is None:
                    continue
                out.write(heading + '\n')
                out.write(_format_for_file(node.text))
                wrtie2json(d, key, node.text)

        written.append(path)
        print(d)
        mal['content'].append(d)

    # remove blank files (anything under 30 bytes holds no real section text)
    for path in written:
        if os.path.getsize(path) < 30:
            os.remove(path)
    print("Done.")
    return mal


# Driver: crawl every malware name listed in the target file (one per line)
# and dump the collected results as a single JSON document.
with open(maleware_name_file, 'r') as f:
    all_infos = []
    for line in f:
        # strip() also removes a trailing '\r' from CRLF files; blank lines
        # are skipped so no search is issued for an empty name.
        maleware_name = line.strip()
        if not maleware_name:
            continue
        all_infos.append(crawl(maleware_name))

with open(os.path.join(info_dir, 'malware_info.json'), 'w') as outfile:
    json.dump(all_infos, outfile)