Advertisement
Guest User

Untitled

a guest
Jul 16th, 2019
222
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.42 KB | None | 0 0
  1. # Place this script in the folder that contains your voice-work folders.
  2. # Each voice-work folder must contain an RJ code somewhere in its name, e.g.:
  3. # this_script.py <-- this is what you're reading
  4. # RJ0001 <-- first folder
  5. # ABC RJ0002 <-- second folder; the position of the RJ code and any other text doesn't matter, as long as the name contains an RJ code
  6. # XYZ RJ0003 WAV+MP3 <-- this is fine too because it has an RJ code in its name
  7. # ...etc.
  8.  
  9. import os
  10. import shutil
  11. import re
  12. import pandas as pd
  13. import json
  14. import requests
  15. import bs4
  16.  
  17. #todo add abs path of the folder
  18. #todo while scanning folders, check whether each known work's folder still exists (not just 'pass') and store that in a new column (optional), i.e. detect works that were accidentally deleted or moved to a new location
  19. #todo partial search
  20. #todo case insensitive search
  21. #todo random chosen work
  22. #todo copy select work to another folder
  23. #todo make another function to change selected works' SFW field to True
  24.  
  def pretty_json(file_name):
      """Rewrite a JSON file in place with 4-space indentation.

      Non-ASCII characters are written verbatim (``ensure_ascii=False``)
      instead of as ``\\u`` escapes.

      file_name: path of the JSON file to reformat.
      """
      with open(file_name, 'r', encoding='utf-8') as json_file:
          json_dict = json.load(json_file)

      # The file is reopened for writing only after it has been parsed in full,
      # so a malformed file raises before its content is truncated.
      with open(file_name, 'w', encoding='utf-8') as json_out:
          json.dump(json_dict, json_out, indent=4, ensure_ascii=False)
  31.  
  def init():
      """Build db.json from scratch from the folders in the current directory.

      Every sub-folder whose name contains an RJ code ('RJ' followed by at
      least four digits) is scraped once via scap_data(). The folder named
      'out' and folders without an RJ code are skipped. The table is written
      to db.json as a list of records and then pretty-printed.
      """
      db = pd.DataFrame(columns=["code", "title", "title_en", "circle", "tags", "sfw"])
      seen_codes = set()

      for folder in os.listdir('.'):
          if not os.path.isdir(folder) or folder == 'out':
              continue
          match = re.search(r'RJ\d{4,}', folder)
          if match is None:
              # No RJ code in the folder name, so there is nothing to look up.
              continue
          rj_code = match.group(0)
          if rj_code in seen_codes:
              # Several folders can carry the same code; scrape it only once.
              continue
          db = scap_data(rj_code, db)
          seen_codes.add(rj_code)

      # Write through an explicitly UTF-8 handle with force_ascii=False so that
      # non-ASCII titles are stored verbatim.
      # https://stackoverflow.com/questions/39612240/writing-pandas-dataframe-to-json-in-unicode
      with open('db.json', 'w', encoding='utf-8') as file:
          db.to_json(file, force_ascii=False, orient='records')
      pretty_json('db.json')
  48.  
  def update():
      """Scrape the works that are not in db.json yet and save the result.

      Loads db.json, then scans the sub-folders of the current directory
      (except 'out'). Every RJ code found in a folder name that is not already
      in the 'code' column is scraped via scap_data(). Folders without an RJ
      code are skipped. db.json is then rewritten and pretty-printed.
      """
      db = pd.read_json('db.json', orient='records')
      # Force the canonical column order after loading.
      # https://stackoverflow.com/questions/25649429/how-to-swap-two-dataframe-columns
      columns_like_this = ["code", "title", "title_en", "circle", "tags", "sfw"]
      db = db.reindex(columns=columns_like_this)
      known_codes = set(db['code'].to_list())

      # Visit each folder and scrape only the codes we have not stored yet.
      for folder in os.listdir('.'):
          if not os.path.isdir(folder) or folder == 'out':
              continue
          match = re.search(r'RJ\d{4,}', folder)
          if match is None:
              # No RJ code in the folder name, so there is nothing to look up.
              continue
          rj_code = match.group(0)
          if rj_code in known_codes:
              continue
          db = scap_data(rj_code, db)
          # Remember it so a second folder with the same code is not scraped again.
          known_codes.add(rj_code)

      with open('db.json', 'w', encoding='utf-8') as file:
          db.to_json(file, force_ascii=False, orient='records')

      pretty_json('db.json')
  72.  
  def make_set(init_list, if_fail_list):
      """Return set(init_list), falling back to set(if_fail_list).

      init_list: preferred iterable; may be None.
      if_fail_list: iterable used when init_list cannot be turned into a set.

      The fallback exists so a None input never has to be passed to set().
      Only TypeError (not iterable, or unhashable items) triggers it.
      """
      try:
          return set(init_list)
      except TypeError:
          return set(if_fail_list)
  79.  
  def search_result(result):
      """Combine the per-field results of search() into one set of codes.

      result: iterable of per-field results. Each item is a list of codes, or
          None when that field was not searched. A non-empty string is
          treated as a single code; an empty string is ignored like None.

      Returns the codes present in every searched field's result (AND across
      fields), or None when no field produced any code at all.
      """
      per_field = []
      for item in result:
          if isinstance(item, str):
              # A bare string is one code, not a sequence of characters.
              if item:
                  per_field.append({item})
          elif isinstance(item, (list, tuple, set, frozenset)):
              per_field.append(set(item))
          # Anything else (None = field not searched) adds no constraint.

      if not any(per_field):
          return None  # ALL the results are empty, i.e. nothing was found

      return set.intersection(*per_field)
  102.  
  def search(code=None, title=None, title_en=None, circle=None, tags=None, sfw=None):
      """
      Look up works in db.json, one field at a time.

      Usage:
      search(code='RJXYZ', title='ABC', title_en='EN', circle='CR', tags='TAG', sfw=None)
      Each search term can be multiple, ex.
      search(code='RJXYZ', title='ABC', title_en='EN', circle='CR', tags=['TAG1', 'TAG2'], sfw=None)

      Returns a 6-tuple of per-field results in the order
      (code, title, title_en, circle, tags, sfw), each exactly as produced by
      search_term(). Pass the tuple to search_result() to combine them.
      """
      # Insertion order of this dict fixes the order of the returned tuple.
      terms = {
          'code': code,
          'title': title,
          'title_en': title_en,
          'circle': circle,
          'tags': tags,
          'sfw': sfw,
      }
      results = {field: [] for field in terms}
      db = pd.read_json('db.json', orient='records')

      for _, work in db.iterrows():
          for field, term in terms.items():
              # search_term returns the accumulator for this field (or None),
              # which is fed back in for the next row.
              results[field] = search_term(term, field, results[field], work)

      return tuple(results[field] for field in terms)
  126.  
  def search_term(s, s_db, result, work):
      """
      Append work's code to result when the work matches search term s.

      s: search term. ex. VA's name
         - str: matches when it is contained in the field (substring of a
           text field, member of a list field)
         - list/tuple: matches when at least one item is contained in the field
         - anything else (ex. True/False for sfw): matches on equality
      s_db: column in db. ex. title
      result: list that the matching codes are appended to
      work: specific work, indexable by column name (must provide 'code')

      Returns result, or None when s is None or an empty str/list/tuple,
      meaning the field was not searched.
      """
      if s is None or (isinstance(s, (str, list, tuple)) and not s):
          return None
      if result is None:
          result = []

      value = work[s_db]

      def contains(term):
          # A field that is not a container (ex. a missing value), or a term
          # of the wrong type for it, simply does not match.
          try:
              return term in value
          except TypeError:
              return False

      if isinstance(s, str):
          matched = contains(s)
      elif isinstance(s, (list, tuple)):
          # Check EVERY term; the work is recorded once if any of them matches.
          matched = any(contains(term) for term in s)
      else:
          matched = bool(value == s)

      if matched:
          result.append(work['code'])
      return result
  146.  
  def scap_data(code, db):
      """
      Scrape one work's details from hvdb.me and append them to db as a new row.

      code: RJ code of the work, ex. 'RJ0001'; everything after the first two
          characters is used as the page id in the URL.
      db: DataFrame the row is appended to; the row holds six values in the
          order code, title, title_en, circle, tags, sfw.

      Returns db, which is also modified in place. Title, English title and
      circle fall back to "N/A" when missing from the page. tags is a list
      that may be empty, and sfw always starts as False. A missing code
      (None or empty) is skipped and db is returned unchanged.
      """
      if not code:
          print('Skipping: no RJ code')
          return db

      last_row = len(db)

      print('Scapping:', code)
      # A timeout keeps one unresponsive request from hanging the whole scan.
      res = requests.get('http://hvdb.me/Dashboard/Details/' + code[2:], timeout=30)
      soup = bs4.BeautifulSoup(res.text, features='lxml')

      # The first matching label is the title, the second the English title.
      labels = soup.select('div > div > label[id="circleLabel"]')
      title = labels[0].getText().strip() if len(labels) > 0 else "N/A"
      title_en = labels[1].getText().strip() if len(labels) > 1 else "N/A"

      circles = soup.select('a[class="detailCircle"]')
      circle = circles[0].getText() if circles else "N/A"

      tags = [tag.getText() for tag in soup.select('div[class="col-md-10 infoLabel"] a')]
      sfw = False

      work_detail = [code, title, title_en, circle, tags, sfw]
      db.loc[last_row] = work_detail

      return db
  178.  
  179.  
  if __name__ == "__main__":
      # init() rebuilds db.json from scratch. On later runs, call update()
      # here instead to scrape only the folders that are not in db.json yet.
      init()
      print(search_result(search(tags=['school', 'binaural audio'])))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement