Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# place this script in the same folder that contains the voice-work folders, i.e.
# the voice-work folders must contain RJ numbers in their names
# this_script.py <-- this is what you're reading
# RJ0001 <-- first folder
# ABC RJ0002 <-- second folder; the position of the RJ code or other text isn't important, just having the RJ code in the name is fine
# XYZ RJ0003 WAV+MP3 <-- this is fine too because it has an RJ code in its name
# ...etc.
- import os
- import shutil
- import re
- import pandas as pd
- import json
- import requests
- import bs4
- #todo add abs path of the folder
- #todo while scan folder, looking if it still exists (not just 'pass') and have a new column to store this data. (Optional). ie. check if we accidentally delete or move to a new location or what
- #todo partial search
- #todo case insensitive search
- #todo random chosen work
- #todo copy select work to another folder
- #todo make another function to change selected works' SFW field to True
def pretty_json(file_name):
    """Re-format the JSON file at *file_name* in place: 4-space indent, raw unicode."""
    with open(file_name, encoding='utf-8') as fh:
        parsed = json.loads(fh.read())
    # serialize first, then truncate-and-write, so a parse error above
    # never leaves a half-written file behind
    text = json.dumps(parsed, indent=4, ensure_ascii=False)
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(text)
def init():
    """Build db.json from scratch by scraping every RJ-coded folder in the CWD.

    Scans the current directory, scrapes metadata for each folder whose name
    contains an RJ code, and writes the result to db.json (pretty-printed).
    """
    db = pd.DataFrame(columns=["code", "title", "title_en", "circle", "tags", "sfw"])
    for folder in os.listdir('.'):
        # 'out' is reserved as a copy destination; skip it and plain files
        if not os.path.isdir(folder) or folder == 'out':
            continue
        match = re.search(r'RJ\d{4,}', folder)
        if match is None:
            # No RJ code in the name: nothing to scrape. (The old code passed
            # None on to scap_data, which then crashed on code[2:].)
            continue
        db = scap_data(match.group(0), db)
    # explicit utf-8 + force_ascii=False so non-ASCII titles survive
    # https://stackoverflow.com/questions/39612240/writing-pandas-dataframe-to-json-in-unicode
    with open('db.json', 'w', encoding='utf-8') as file:
        db.to_json(file, force_ascii=False, orient='records')
    pretty_json('db.json')
def update():
    """Append newly-added RJ folders to an existing db.json.

    Re-reads db.json, scans the current directory, and scrapes only folders
    whose RJ code is not already recorded, then rewrites db.json.
    """
    db = pd.read_json('db.json', orient='records')
    # read_json does not preserve column order; force it back
    # https://stackoverflow.com/questions/25649429/how-to-swap-two-dataframe-columns
    columns_like_this = ["code", "title", "title_en", "circle", "tags", "sfw"]
    db = db.reindex(columns=columns_like_this)
    old_works = set(db['code'].to_list())  # set: O(1) membership per folder
    for folder in os.listdir('.'):
        # 'out' is reserved as a copy destination; skip it and plain files
        if not os.path.isdir(folder) or folder == 'out':
            continue
        match = re.search(r'RJ\d{4,}', folder)
        if match is None:
            # No RJ code in the name: nothing to scrape. (The old code passed
            # None on to scap_data, which then crashed on code[2:].)
            continue
        rj_code = match.group(0)
        if rj_code not in old_works:
            db = scap_data(rj_code, db)
    with open('db.json', 'w', encoding='utf-8') as file:
        db.to_json(file, force_ascii=False, orient='records')
    pretty_json('db.json')
def make_set(init_list, if_fail_list):
    """Return set(init_list), falling back to set(if_fail_list).

    init_list: preferred values; may be None (a field that was not searched).
    if_fail_list: values to use when init_list cannot be turned into a set —
        callers pick something that is definitely iterable so we never build
        set(None).
    """
    try:
        return set(init_list)
    except TypeError:
        # init_list was None or otherwise not iterable/hashable
        return set(if_fail_list)
def search_result(result):
    """Intersect the six per-field code lists produced by search().

    A field that was not searched arrives as None and acts as a wildcard.
    Returns the set of codes matching every searched field, or None when
    all fields came back empty (i.e. nothing matched at all).
    """
    #! The old method returned whatever matched any one of the search terms.
    hits = []
    for entry in result:
        if isinstance(entry, str):
            if entry:
                hits.append(entry)
        elif isinstance(entry, list):
            hits.extend(entry)
    if not hits:
        return None  # every field is empty/None: we can't find anything

    def as_set(values):
        # None (field not searched) is not iterable -> treat it as "match all"
        try:
            return set(values)
        except TypeError:
            return set(hits)

    return set.intersection(*(as_set(field) for field in result))
def search(code=None, title=None, title_en=None, circle=None, tags=None, sfw=None):
    """
    Usage:
        search(code='RJXYZ', title='ABC', title_en='EN', circle='CR', tags='TAG', sfw=None)
    Any term may also be a list, e.g. tags=['TAG1', 'TAG2'].

    Returns a 6-tuple of per-field results (code, title, title_en, circle,
    tags, sfw) — each a list of matching RJ codes, or None where the field
    was not searched — meant to be fed straight into search_result().
    """
    db = pd.read_json('db.json', orient='records')
    # (term, column) pairs, in the same order as the returned tuple
    terms = [
        (code, 'code'),
        (title, 'title'),
        (title_en, 'title_en'),
        (circle, 'circle'),
        (tags, 'tags'),
        (sfw, 'sfw'),
    ]
    collected = [[] for _ in terms]
    for _, work in db.iterrows():
        for idx, (term, column) in enumerate(terms):
            collected[idx] = search_term(term, column, collected[idx], work)
    return tuple(collected)
def search_term(s, s_db, result, work):
    """
    Accumulate the codes of works whose *s_db* field matches search term *s*.

    s: search term, e.g. a VA's name; a str, a list of strs, or any other
       value (e.g. True for the boolean 'sfw' column). A falsy term means
       "field not searched" — note this makes sfw=False unsearchable.
    s_db: column name in the db, e.g. 'title'.
    result: list the matching work codes are appended to.
    work: one db row (mapping-style access by column name).

    Returns the updated list, or None when no term was given so that
    search_result() can treat the field as a wildcard.
    """
    if not s:
        return None  # field not searched
    if isinstance(s, str):
        if s in work[s_db]:
            result.append(work['code'])
    elif isinstance(s, list):
        for term in s:
            if term in work[s_db]:
                result.append(work['code'])
    elif s == work[s_db]:
        # bug fix: non-str/list terms (e.g. sfw=True) used to fall through
        # both branches and implicitly return None, clobbering the result
        # list; compare such terms by equality instead.
        result.append(work['code'])
    return result
def scap_data(code, db):
    """
    Scrape one work's metadata from hvdb.me and append it as a new row of *db*.

    code: full RJ code, e.g. 'RJ0001'; the leading 'RJ' is stripped for the URL.
    db: the accumulated DataFrame; returned with one extra row
        [code, title, title_en, circle, tags, sfw].
    Missing title/title_en/circle default to "N/A"; tags may come back as an
    empty list when the page lists none; sfw always starts out False.
    """
    last_row = len(db)
    print('Scapping:', code)
    # timeout so a dead/slow site cannot hang the whole directory scan
    res = requests.get('http://hvdb.me/Dashboard/Details/' + code[2:], timeout=30)
    soup = bs4.BeautifulSoup(res.text, features='lxml')
    # the page labels both titles with id="circleLabel": [0] is the original
    # title, [1] the English one (assumption from the selectors — TODO confirm)
    try:
        title = soup.select('div > div > label[id="circleLabel"]')[0].getText().strip()
    except IndexError:
        title = "N/A"
    try:
        title_en = soup.select('div > div > label[id="circleLabel"]')[1].getText().strip()
    except IndexError:
        title_en = "N/A"
    try:
        circle = soup.select('a[class="detailCircle"]')[0].getText()
    except IndexError:
        circle = "N/A"
    # empty selection simply yields an empty tag list (no exception involved)
    tags = [tag.getText() for tag in soup.select('div[class="col-md-10 infoLabel"] a')]
    sfw = False  # never assume safe-for-work; flipped manually later
    db.loc[last_row] = [code, title, title_en, circle, tags, sfw]
    return db
if __name__ == "__main__":
    # First run: build db.json from every RJ folder in this directory.
    init()
    # On later runs, switch to update() to scrape only newly-added folders.
    # update()
    # pass
    # Demo query: works tagged with both 'school' and 'binaural audio'.
    print(search_result(search(tags=['school', 'binaural audio'])))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement