Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# place this script in the same folder that contains the voice-work folders, i.e.
# the voice-work folders must contain RJ numbers in their names
# this_script.py <-- this is what you're reading
# RJ0001 <-- first folder
# ABC RJ0002 <-- second folder; the position of the RJ code or other text isn't important, just having the RJ code in the name is fine
# XYZ RJ0003 WAV+MP3 <-- this is fine too because it has an RJ code in its name
# ...etc.
- import os
- import shutil
- import re
- import pandas as pd
- import json
- import requests
- import bs4
- #todo add abs path of the folder
- #todo while scan folder, looking if it still exists (not just 'pass') and have a new column to store this data. (Optional). ie. check if we accidentally delete or move to a new location or what
- #todo partial search
- #todo case insensitive search
- #todo random chosen work
- #todo copy select work to another folder
- #todo make another function to change selected works' SFW field to True
def pretty_json(file_name):
    """Re-format the JSON file at *file_name* in place: 4-space indent, raw unicode."""
    with open(file_name, encoding='utf-8') as fh:
        parsed = json.loads(fh.read())
    # serialize first, then truncate-and-write, so a parse error above
    # never leaves a half-written file behind
    text = json.dumps(parsed, indent=4, ensure_ascii=False)
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(text)
def init():
    """Build db.json from scratch by scraping every RJ-coded folder in the CWD.

    Scans the current directory, scrapes metadata for each folder whose name
    contains an RJ code, and writes the result to db.json (pretty-printed).
    """
    db = pd.DataFrame(columns=["code", "title", "title_en", "circle", "tags", "sfw"])
    for folder in os.listdir('.'):
        # 'out' is reserved as a copy destination; skip it and plain files
        if not os.path.isdir(folder) or folder == 'out':
            continue
        match = re.search(r'RJ\d{4,}', folder)
        if match is None:
            # No RJ code in the name: nothing to scrape. (The old code passed
            # None on to scap_data, which then crashed on code[2:].)
            continue
        db = scap_data(match.group(0), db)
    # explicit utf-8 + force_ascii=False so non-ASCII titles survive
    # https://stackoverflow.com/questions/39612240/writing-pandas-dataframe-to-json-in-unicode
    with open('db.json', 'w', encoding='utf-8') as file:
        db.to_json(file, force_ascii=False, orient='records')
    pretty_json('db.json')
def update():
    """Append newly-added RJ folders to an existing db.json.

    Re-reads db.json, scans the current directory, and scrapes only folders
    whose RJ code is not already recorded, then rewrites db.json.
    """
    db = pd.read_json('db.json', orient='records')
    # read_json does not preserve column order; force it back
    # https://stackoverflow.com/questions/25649429/how-to-swap-two-dataframe-columns
    columns_like_this = ["code", "title", "title_en", "circle", "tags", "sfw"]
    db = db.reindex(columns=columns_like_this)
    old_works = set(db['code'].to_list())  # set: O(1) membership per folder
    for folder in os.listdir('.'):
        # 'out' is reserved as a copy destination; skip it and plain files
        if not os.path.isdir(folder) or folder == 'out':
            continue
        match = re.search(r'RJ\d{4,}', folder)
        if match is None:
            # No RJ code in the name: nothing to scrape. (The old code passed
            # None on to scap_data, which then crashed on code[2:].)
            continue
        rj_code = match.group(0)
        if rj_code not in old_works:
            db = scap_data(rj_code, db)
    with open('db.json', 'w', encoding='utf-8') as file:
        db.to_json(file, force_ascii=False, orient='records')
    pretty_json('db.json')
def make_set(init_list, if_fail_list):
    """Return set(init_list), falling back to set(if_fail_list).

    init_list: preferred values; may be None (a field that was not searched).
    if_fail_list: values to use when init_list cannot be turned into a set —
        callers pick something that is definitely iterable so we never build
        set(None).
    """
    try:
        return set(init_list)
    except TypeError:
        # init_list was None or otherwise not iterable/hashable
        return set(if_fail_list)
def search_result(result):
    """Intersect the six per-field code lists produced by search().

    A field that was not searched arrives as None and acts as a wildcard.
    Returns the set of codes matching every searched field, or None when
    all fields came back empty (i.e. nothing matched at all).
    """
    #! The old method returned whatever matched any one of the search terms.
    hits = []
    for entry in result:
        if isinstance(entry, str):
            if entry:
                hits.append(entry)
        elif isinstance(entry, list):
            hits.extend(entry)
    if not hits:
        return None  # every field is empty/None: we can't find anything

    def as_set(values):
        # None (field not searched) is not iterable -> treat it as "match all"
        try:
            return set(values)
        except TypeError:
            return set(hits)

    return set.intersection(*(as_set(field) for field in result))
def search(code=None, title=None, title_en=None, circle=None, tags=None, sfw=None):
    """
    Usage:
        search(code='RJXYZ', title='ABC', title_en='EN', circle='CR', tags='TAG', sfw=None)
    Any term may also be a list, e.g. tags=['TAG1', 'TAG2'].

    Returns a 6-tuple of per-field results (code, title, title_en, circle,
    tags, sfw) — each a list of matching RJ codes, or None where the field
    was not searched — meant to be fed straight into search_result().
    """
    db = pd.read_json('db.json', orient='records')
    # (term, column) pairs, in the same order as the returned tuple
    terms = [
        (code, 'code'),
        (title, 'title'),
        (title_en, 'title_en'),
        (circle, 'circle'),
        (tags, 'tags'),
        (sfw, 'sfw'),
    ]
    collected = [[] for _ in terms]
    for _, work in db.iterrows():
        for idx, (term, column) in enumerate(terms):
            collected[idx] = search_term(term, column, collected[idx], work)
    return tuple(collected)
def search_term(s, s_db, result, work):
    """
    Accumulate the codes of works whose *s_db* field matches search term *s*.

    s: search term, e.g. a VA's name; a str, a list of strs, or any other
       value (e.g. True for the boolean 'sfw' column). A falsy term means
       "field not searched" — note this makes sfw=False unsearchable.
    s_db: column name in the db, e.g. 'title'.
    result: list the matching work codes are appended to.
    work: one db row (mapping-style access by column name).

    Returns the updated list, or None when no term was given so that
    search_result() can treat the field as a wildcard.
    """
    if not s:
        return None  # field not searched
    if isinstance(s, str):
        if s in work[s_db]:
            result.append(work['code'])
    elif isinstance(s, list):
        for term in s:
            if term in work[s_db]:
                result.append(work['code'])
    elif s == work[s_db]:
        # bug fix: non-str/list terms (e.g. sfw=True) used to fall through
        # both branches and implicitly return None, clobbering the result
        # list; compare such terms by equality instead.
        result.append(work['code'])
    return result
def scap_data(code, db):
    """
    Scrape one work's metadata from hvdb.me and append it as a new row of *db*.

    code: full RJ code, e.g. 'RJ0001'; the leading 'RJ' is stripped for the URL.
    db: the accumulated DataFrame; returned with one extra row
        [code, title, title_en, circle, tags, sfw].
    Missing title/title_en/circle default to "N/A"; tags may come back as an
    empty list when the page lists none; sfw always starts out False.
    """
    last_row = len(db)
    print('Scapping:', code)
    # timeout so a dead/slow site cannot hang the whole directory scan
    res = requests.get('http://hvdb.me/Dashboard/Details/' + code[2:], timeout=30)
    soup = bs4.BeautifulSoup(res.text, features='lxml')
    # the page labels both titles with id="circleLabel": [0] is the original
    # title, [1] the English one (assumption from the selectors — TODO confirm)
    try:
        title = soup.select('div > div > label[id="circleLabel"]')[0].getText().strip()
    except IndexError:
        title = "N/A"
    try:
        title_en = soup.select('div > div > label[id="circleLabel"]')[1].getText().strip()
    except IndexError:
        title_en = "N/A"
    try:
        circle = soup.select('a[class="detailCircle"]')[0].getText()
    except IndexError:
        circle = "N/A"
    # empty selection simply yields an empty tag list (no exception involved)
    tags = [tag.getText() for tag in soup.select('div[class="col-md-10 infoLabel"] a')]
    sfw = False  # never assume safe-for-work; flipped manually later
    db.loc[last_row] = [code, title, title_en, circle, tags, sfw]
    return db
if __name__ == "__main__":
    # First run: build db.json from every RJ folder in this directory.
    init()
    # On later runs, switch to update() to scrape only newly-added folders.
    # update()
    # pass
    # Demo query: works tagged with both 'school' and 'binaural audio'.
    print(search_result(search(tags=['school', 'binaural audio'])))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement