## This script WAS NOT USED on the weights released by ProjectAI Touhou on 8th of september, 2022.
## This script CAN convert tags to human-readable-text BUT IT IS NOT REQUIRED.
"""Extract images and caption text files from a JSONL metadata dump.

For every JSONL entry whose rating was selected with ``--rating`` and whose
image exists under ``<imagespath>/0<last three digits of id>/<id>.<ext>``,
the image is copied to ``--extractpath`` and a ``<id>.txt`` caption is written
next to it.  The caption is built from the character, artist, rating, general
and copyright tag strings, in that order.

The module can be imported without side effects; call ``main()`` to run it.
"""

import argparse
import json
import os
import shutil
import string  # not used by this script; kept so the module's imports are unchanged


# Rating letters understood by --rating, in the order they are reported.
_VALID_RATINGS = ("e", "g", "q", "s")

# Rating letter -> word used to build the "<word> content" caption tag.
_RATING_WORDS = {
    "e": "explicit",
    "g": "general",
    "q": "questionable",
    "s": "sensitive",
}

# Entity-count tags and their human-readable spelling, applied in this order.
_COUNT_TAG_WORDS = (
    ('1girl', 'one girl'),
    ('2girls', 'two girls'),
    ('3girls', 'three girls'),
    ('4girls', 'four girls'),
    ('5girls', 'five girls'),
    ('6girls', 'six girls'),
    ('1boy', 'one boy'),
    ('2boys', 'two boys'),
    ('3boys', 'three boys'),
    ('4boys', 'four boys'),
    ('5boys', 'five boys'),
    ('6boys', 'six boys'),
)


# Adapted from https://stackoverflow.com/a/43357954
def str2bool(v):
    """Parse a command-line boolean.

    Args:
        v: A bool (returned unchanged) or a string such as 'yes'/'no',
            'true'/'false', 't'/'f', 'y'/'n', '1'/'0' (case-insensitive).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the string is not a recognised boolean.
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def ratingparsing(input):
    """Parse the --rating argument into a space-separated list of ratings.

    Every occurrence of the letters e/g/q/s selects that rating; the letter
    'a' selects all of them.  Each rating appears at most once in the result,
    always in e, g, q, s order.

    Args:
        input: The raw argument string, e.g. 'a', 'gs' or 'E'.

    Returns:
        The selected ratings joined by single spaces, e.g. 'e g q s'.

    Raises:
        argparse.ArgumentTypeError: If no valid rating letter is present.
            (Still an ``Exception`` subclass; argparse reports it as a normal
            usage error instead of a traceback.)
    """
    v = input.lower()
    if "a" in v:
        selected = list(_VALID_RATINGS)
    else:
        selected = [rating for rating in _VALID_RATINGS if rating in v]
    if not selected:
        raise argparse.ArgumentTypeError('a/e/g/q/s expected')
    ratingsSelected = " ".join(selected)
    print("Ratings selected: " + ratingsSelected)
    return ratingsSelected


def writefile(filename, text):
    """Write ``text`` to ``filename`` (UTF-8) and echo what was saved."""
    # The context manager closes the handle even if the write fails.
    with open(filename, "w", encoding="utf8") as f:
        f.write(text)
    print('Saved the following: ' + text)


# Converts tags to T2I-like prompts (blue_dress 1girl -> blue dress, 1girl)
def ConvCommaAndUnderscoreToHuman(convtohuman, input):
    """Turn a space-separated tag string into comma-separated words.

    Args:
        convtohuman: When truthy, spaces become ', ' and underscores become
            spaces; otherwise the string is returned unchanged.
        input: The tag string.

    Returns:
        The converted (or untouched) string.
    """
    tars = input
    if convtohuman:
        tars = tars.replace(' ', ', ')
        tars = tars.replace('_', ' ')
    else:
        print("CommaAndUnderscoreToHuman: convtohuman is false hence not doing anything")
    return tars


def ConvTagsToHuman(convtohuman, input):
    """Spell out entity-count tags (1girl -> one girl, 2boys -> two boys, ...).

    Args:
        convtohuman: When truthy the replacements are applied; otherwise the
            string is returned unchanged.
        input: The tag string.

    Returns:
        The converted (or untouched) string.
    """
    tars = input
    if convtohuman:
        for tag, words in _COUNT_TAG_WORDS:
            tars = tars.replace(tag, words)
    else:
        print("ConvTagsToHuman: convtohuman is false hence not doing anything")
    print("TARS is: " + tars)
    return tars


# Converts ratings to X content
def ConvRatingToHuman(convtohuman, input):
    """Map a rating letter to its caption tag.

    Args:
        convtohuman: When truthy the result uses a space ('explicit content'),
            otherwise an underscore ('explicit_content').
        input: One of 'e', 'g', 'q', 's'.

    Returns:
        The caption tag, or None for an unknown rating letter.
    """
    word = _RATING_WORDS.get(input)
    if word is None:
        return None
    joiner = " " if convtohuman else "_"
    return word + joiner + "content"


def ConvCharacterToHuman(convtohuman, input):
    """Rewrite 'name_(series)' character tags as 'name from series'.

    Both the raw form ('name_(series)') and the form left behind by
    ConvCommaAndUnderscoreToHuman ('name (series)') are handled, because the
    caller converts underscores before calling this function.

    Args:
        convtohuman: When truthy the rewrite is applied; otherwise the string
            is returned unchanged.
        input: The character tag string.

    Returns:
        The converted (or untouched) string.
    """
    tars = input
    if convtohuman:
        tars = tars.replace('_(', ' from ')
        # Underscores may already have been turned into spaces upstream.
        tars = tars.replace(' (', ' from ')
        tars = tars.replace(')', '')
    else:
        print("ConvCharacterToHuman: convtohuman is false hence not doing anything")
    return tars


def _read_field(entry, key):
    """Return ``entry[key]``, or None if the entry is not a dict or lacks the key."""
    if not isinstance(entry, dict):
        return None
    return entry.get(key)


def _build_caption(result, convtohuman):
    """Build the caption for one entry whose essential fields are valid strings.

    Per JohannesGaessler's suggestion (harubaru/waifu-diffusion/pull/11) the
    per-category tag strings are used rather than the combined tag string.
    """
    general = ConvCommaAndUnderscoreToHuman(convtohuman, result['tag_string_general'])
    general = ConvTagsToHuman(convtohuman, general)

    # Optional fields start from "" on every entry so that a missing value can
    # never pick up the text of the previously processed entry.
    optional = {}
    for key in ('tag_string_artist', 'tag_string_character', 'tag_string_copyright'):
        raw = _read_field(result, key)
        if isinstance(raw, str):
            optional[key] = ConvCommaAndUnderscoreToHuman(convtohuman, raw)
        else:
            print("img_" + key + " RETRIVAL FAILED. Var is not essential so just skipping var.")
            optional[key] = ""
    character = ConvCharacterToHuman(convtohuman, optional['tag_string_character'])

    print("IMAGE RATING IS: " + result['rating'])
    rating = ConvRatingToHuman(convtohuman, result['rating'])

    tag_separator = ', ' if convtohuman else ' '
    parts = (
        character,
        optional['tag_string_artist'],
        rating,
        general,
        optional['tag_string_copyright'],
    )
    # Skip empty parts so the separator never appears next to nothing.
    return tag_separator.join(part for part in parts if part)


def _extract_entry(result, args, convtohuman, acceptedRatings):
    """Copy the image and write the caption for one entry.

    Returns True when the caption file was written, False when the entry was
    skipped (missing essential field, unselected rating or missing image).
    """
    raw_id = _read_field(result, 'id')
    if raw_id is None:
        print("img_id RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
        return False
    img_id = str(raw_id)
    # Images are looked up in a folder named "0" + the last three digits of the id.
    img_id_last3 = img_id[-3:].zfill(3)

    for key, label in (
        ('tag_string_general', 'img_tag_string_general'),
        ('file_ext', 'img_ext'),
        ('rating', 'img_rating'),
    ):
        if not isinstance(_read_field(result, key), str):
            print(label + " RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
            return False
    img_ext = result['file_ext']

    if result['rating'] not in acceptedRatings:
        print("Entry rating' is not in acceptedRatings, skipping entry.")
        return False
    print("Entry rating matches!")

    image_name = img_id + "." + img_ext
    file_path = os.path.join(str(args.imagespath), "0" + img_id_last3, image_name)
    if not os.path.exists(file_path):
        print("Failed to find path.")
        return False

    shutil.copyfile(file_path, os.path.join(args.extractpath, image_name))
    to_write = _build_caption(result, convtohuman)
    writefile(os.path.join(args.extractpath, img_id + '.txt'), to_write)
    return True


def _build_parser():
    """Create the command-line parser."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--jsonpath', '-J', type=str, help='Path to JSONL file with the metadata', required=True)
    parser.add_argument('--extractpath', '-E', type=str, help='Path to the folder where to extract the images and text files', required=True)
    parser.add_argument('--imagespath', '-I', type=str, help='Path to the folder with the images', required=False, default="512px")
    parser.add_argument('--convtohuman', '-H', type=str2bool, help='Convert to human-readable-text', required=False, default=False)
    # The string default 'a' is passed through ratingparsing by argparse.
    parser.add_argument('--rating', '-R', type=ratingparsing, help='Extract specific rating/s [a/e/g/q/s]', required=False, default='a')
    return parser


def main(argv=None):
    """Run the extraction.

    Args:
        argv: Argument list to parse; defaults to the process arguments.
    """
    args = _build_parser().parse_args(argv)
    if args.convtohuman:
        # The conversion used to be disabled because separators were emitted
        # around empty fields; the caption builder now skips empty fields.
        print("Note: converting tags to human-readable text loses many tokens; "
              "tag based captions are usually more effective.")
    print("Arguments: " + str(args))

    os.makedirs(args.extractpath, exist_ok=True)

    convtohuman = args.convtohuman
    # A set gives exact matching; a substring test would also accept "" or "e g".
    acceptedRatings = set(args.rating.split())

    current_saved_file_count = 0
    # Stream the file line by line instead of loading every line into memory.
    with open(args.jsonpath, 'r', encoding="utf8") as json_file:
        for current_line_count, json_str in enumerate(json_file, start=1):
            ##TODO: Add a line counter to print progress accurately
            print("Current Line:" + str(current_line_count) + '/415000 (aprox) | Current saved files count: ' + str(current_saved_file_count))
            if not json_str.strip():
                continue  # tolerate blank lines in the JSONL file
            try:
                result = json.loads(json_str)
            except json.JSONDecodeError:
                print("JSON parsing failed, skipping entry.")
                continue
            if _extract_entry(result, args, convtohuman, acceptedRatings):
                current_saved_file_count += 1

    print("finished process. Your extracted data should be in " + str(args.extractpath) + " !")


if __name__ == "__main__":
    main()