waifu-diffusion/dataset/download/local/extractfromjson_danboo21.py

302 lines
11 KiB
Python
Raw Permalink Normal View History

2022-09-09 02:49:52 -06:00
## This script WAS NOT USED on the weights released by ProjectAI Touhou on 8th of september, 2022.
## This script CAN convert tags to human-readable-text BUT IT IS NOT REQUIRED.
2022-09-10 18:38:22 -06:00
2022-09-09 02:49:52 -06:00
import argparse
2022-09-10 18:39:49 -06:00
2022-09-09 02:49:52 -06:00
#Stolen code from https://stackoverflow.com/a/43357954
def str2bool(v):
    """Parse a command-line boolean value for use as an argparse ``type=``.

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string such as
            "yes"/"no", "true"/"false", "t"/"f", "y"/"n", "1"/"0". Matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if ``v`` is not a recognised boolean
            spelling (including values that are neither ``bool`` nor ``str``).
    """
    if isinstance(v, bool):
        return v
    # Anything that is not text cannot be a valid spelling; report it the same
    # way as an unknown word instead of failing with AttributeError on .lower().
    if not isinstance(v, str):
        raise argparse.ArgumentTypeError('Boolean value expected.')
    token = v.strip().lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
2022-09-10 18:38:22 -06:00
def ratingparsing(input):
    """Turn a rating selector into a space-separated list of rating letters.

    Args:
        input: Text containing any of the letters a/e/g/q/s (case-insensitive).
            "a" selects every rating; the other letters select themselves.

    Returns:
        The selected letters joined by single spaces, always in the order
        e, g, q, s and without duplicates (e.g. "a" -> "e g q s").

    Raises:
        argparse.ArgumentTypeError: if none of the letters is present. argparse
            turns this into a normal usage error instead of a traceback.
    """
    v = input.lower()
    all_ratings = ("e", "g", "q", "s")
    if "a" in v:
        # "a" already means everything; do not append further letters on top,
        # which previously produced strings like "e g q se ".
        chosen = list(all_ratings)
    else:
        chosen = [letter for letter in all_ratings if letter in v]
    if not chosen:
        raise argparse.ArgumentTypeError('a/e/g/q/s expected')
    ratingsSelected = " ".join(chosen)
    print("Ratings selected: " + ratingsSelected)
    return ratingsSelected
2022-09-09 02:49:52 -06:00
## In the future someone might want to access this via import. Consider adding support for that
2022-09-10 18:38:22 -06:00
2022-09-09 02:49:52 -06:00
# Command-line interface: where the metadata lives, where the images live,
# where to copy the selected entries, and how to filter/format them.
parser = argparse.ArgumentParser()
parser.add_argument('--jsonpath', '-J', type=str, help='Path to JSONL file with the metadata', required=True)
parser.add_argument('--extractpath', '-E', type=str, help='Path to the folder where to extract the images and text files', required=True)
parser.add_argument('--imagespath', '-I', type=str, help='Path to the folder with the images', required=False, default="512px")
parser.add_argument('--convtohuman', '-H', type=str2bool, help='Convert to human-readable-text', required=False, default=False)
parser.add_argument('--rating', '-R', type=ratingparsing, help='Extract specific rating/s [a/e/g/q/s]', required=False, default='a')
args = parser.parse_args()

if args.convtohuman:
    print("tag conversion to human is currently somewhat broken. If you still want to use it remove the convtohuman check that follows parser.parse_args()")
    #Q: What is broken?
    #A: tag_separator sometimes appears in to_write without anything behind it. It should be an easy fix where
    #   tag_separator simply does not appear if the variable behind it is blank.
    #   Right now it is not important, plus many tokens are lost when converting to human text; tag based
    #   inputs are more effective than human-readable text.
    # Raise SystemExit directly: the exit() helper is injected by the site
    # module and is not guaranteed to exist. Exit status stays 0 as before.
    raise SystemExit

print("Arguments: " + str(args))
import json
import os
import shutil
2022-09-09 02:49:52 -06:00
# Create the output folder (and any missing parents). exist_ok avoids both the
# separate existence check and a failure if the folder appears in between.
os.makedirs(args.extractpath, exist_ok=True)
def writefile(filename, text):
    """Write ``text`` to ``filename`` (overwriting it) and echo it to stdout.

    Args:
        filename: Path of the text file to create or overwrite.
        text: Content to store.

    The file is written as UTF-8, matching the encoding used to read the
    metadata, and is closed even if the write fails.
    """
    with open(filename, "w", encoding="utf8") as f:
        f.write(text)
    print('Saved the following: ' + text)
#Converts tags to T2I-like prompts (blue_dress 1girl -> blue dress, one girl)
2022-09-09 02:49:52 -06:00
def ConvCommaAndUnderscoreToHuman(convtohuman, input):
    """Reformat a whitespace-separated tag string as comma-separated phrases.

    Args:
        convtohuman: When truthy the conversion is applied; otherwise the
            input is returned untouched (and a notice is printed).
        input: Tag string such as "blue_dress 1girl".

    Returns:
        With conversion: tags joined by ", " and underscores turned into
        spaces ("blue dress, 1girl"). Without: ``input`` unchanged.
    """
    if not convtohuman:
        print("CommaAndUnderscoreToHuman: convtohuman is false hence not doing anything")
        return input
    # split() drops leading/trailing/repeated whitespace, so no empty entries
    # (", , ") end up in the result.
    return ', '.join(input.split()).replace('_', ' ')
2022-09-09 02:49:52 -06:00
def ConvTagsToHuman(convtohuman, input):
    """Spell out the numeric girl/boy count tags as words.

    Args:
        convtohuman: When truthy, count tags ("1girl", "2boys", ...) are
            replaced by phrases ("one girl", "two boys", ...). When it equals
            False a notice is printed and nothing is replaced.
        input: Tag string to process.

    Returns:
        The processed string. It is also printed with a "TARS is: " prefix.
    """
    # Applied in this exact order as plain substring replacements.
    count_phrases = (
        ('1girl', 'one girl'),
        ('2girls', 'two girls'),
        ('3girls', 'three girls'),
        ('4girls', 'four girls'),
        ('5girls', 'five girls'),
        ('6girls', 'six girls'),
        ('1boy', 'one boy'),
        ('2boys', 'two boys'),
        ('3boys', 'three boys'),
        ('4boys', 'four boys'),
        ('5boys', 'five boys'),
        ('6boys', 'six boys'),
    )
    converted = input
    if convtohuman:
        for tag, phrase in count_phrases:
            converted = converted.replace(tag, phrase)
    elif convtohuman == False:
        print("ConvTagsToHuman: convtohuman is false hence not doing anything")
    print("TARS is: " + converted)
    return converted
#Converts ratings to X content
2022-09-09 02:49:52 -06:00
def ConvRatingToHuman(convtohuman, input):
    """Map a one-letter rating code to its "<name> content" label.

    Args:
        convtohuman: Truthy -> words separated by a space
            ("explicit content"); equal to False -> separated by an
            underscore ("explicit_content").
        input: Rating code, one of "e", "g", "q", "s".

    Returns:
        The label, or None when the code is not recognised or when
        ``convtohuman`` is neither truthy nor equal to False.
    """
    if convtohuman:
        joiner = " "
    elif convtohuman == False:
        joiner = "_"
    else:
        return None
    rating_names = (
        ("e", "explicit"),
        ("g", "general"),
        ("q", "questionable"),
        ("s", "sensitive"),
    )
    for code, name in rating_names:
        if input == code:
            return name + joiner + "content"
    return None
def ConvCharacterToHuman(convtohuman, input):
    """Rewrite "name_(series)" character tags as "name from series".

    Args:
        convtohuman: When truthy the rewrite is applied. When it equals False
            a notice is printed and the input is returned unchanged.
        input: Character tag string.

    Returns:
        The rewritten string.
    """
    tars = input
    if convtohuman:
        tars = tars.replace('_(', ' from ')
        # The caller runs the underscore-to-space conversion first, so the
        # qualifier usually arrives as "name (series)"; handle that form too,
        # otherwise only ")" is stripped and a dangling "(" is left behind.
        tars = tars.replace(' (', ' from ')
        tars = tars.replace(')', '')
    elif convtohuman == False:
        print("ConvCharacterToHuman: convtohuman is false hence not doing anything")
    return tars
2022-09-09 02:49:52 -06:00
# unrecog_ans = True
# while unrecog_ans:
# inputans = input("Convert tags to human-readable-text? (smiley_face blue_hair -> smiley face, blue hair) [y/n]")
# if inputans == "y":
# convtohuman = True
# unrecog_ans = False
# elif inputans == "n":
# convtohuman = False
# unrecog_ans = False
# else:
# print("unrecognizable input. only y or n.")
# unrecog_ans = True
convtohuman = args.convtohuman
# --rating arrives as a space-separated string of rating letters. Split it
# into a set so the filter below is an exact match; a substring test against
# the raw string would also accept "" or " ".
acceptedRatings = set(str(args.rating).split())
json_file_path = args.jsonpath ##Name of the JSON file to use, converted into parser arg

# Both values depend only on the command-line flag, so compute them once.
# NOTE: dan_iden is assigned but is not part of the text that gets written.
if convtohuman:
    dan_iden = 'uploaded on danbooru'
    tag_separator = ', '
else:
    dan_iden = 'danbooru'
    tag_separator = ' '


def _get_field(entry, key):
    """Return entry[key], or None when the key is missing or entry is not indexable by it."""
    try:
        return entry[key]
    except (KeyError, TypeError, IndexError):
        return None


def _get_text(entry, key):
    """Return entry[key] only when it is a string; otherwise None."""
    value = _get_field(entry, key)
    return value if isinstance(value, str) else None


current_saved_file_count = 0
current_line_count = 0
##Open the file and read it line by line (one JSON object per line) instead of
##loading every line into memory first.
with open(json_file_path, 'r', encoding="utf8") as json_file:
    for json_str in json_file:
        current_line_count = current_line_count + 1
        ##TODO: Add a line counter to print progress accurately
        print("Current Line:" + str(current_line_count) + '/415000 (aprox) | Current saved files count: ' + str(current_saved_file_count) )

        # A blank or malformed line should not abort the whole extraction.
        try:
            result = json.loads(json_str)
        except ValueError:
            print("JSON PARSING FAILED. SKIPPING ENTRY.")
            continue

        ##Essential fields: without any of these the entry cannot be exported.
        raw_id = _get_field(result, 'id')
        if raw_id is None:
            print("img_id RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
            continue
        img_id = str(raw_id)
        # Last three characters of the id, zero-padded; used for the folder name below.
        img_id_last3 = img_id[-3:].zfill(3)

        ##JohannesGaessler SUGGESTIONS: harubaru/waifu-diffusion/pull/11
        ## TAG_STRING_GENERAL: ONLY TAGS HERE
        img_tag_string_general = _get_text(result, 'tag_string_general')
        if img_tag_string_general is None:
            print("img_tag_string_general RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
            continue

        ##Optional fields: a missing value simply leaves that part of the text out.
        ## TAG_STRING_ARTIST: ONLY ARTISTS TAGS HERE
        img_tag_string_artist = _get_text(result, 'tag_string_artist')
        if img_tag_string_artist is None:
            print("img_tag_string_artist RETRIVAL FAILED. Var is not essential so just skipping var.")
        ## TAG_STRING_COPYRIGHT: ONLY COPYRIGHT TAGS HERE
        img_tag_string_copyright = _get_text(result, 'tag_string_copyright')
        if img_tag_string_copyright is None:
            print("img_tag_string_copyright RETRIVAL FAILED. Var is not essential so just skipping var.")
        ## TAG_STRING_CHARACTER: ONLY CHARACTER TAGS HERE
        img_tag_string_character = _get_text(result, 'tag_string_character')
        if img_tag_string_character is None:
            print("img_tag_string_character RETRIVAL FAILED. Var is not essential so just skipping var.")

        img_ext = _get_text(result, 'file_ext')
        if img_ext is None:
            print("img_ext RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
            continue
        img_rating = _get_text(result, 'rating')
        if img_rating is None:
            print("img_rating RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
            continue

        if img_rating not in acceptedRatings:
            print("Entry rating' is not in acceptedRatings, skipping entry.")
            continue
        print("Entry rating matches!")

        # Images are looked up under <imagespath>/0<last three id characters>/<id>.<ext>
        image_name = img_id + "." + img_ext
        file_path = os.path.join(str(args.imagespath), "0" + img_id_last3, image_name)
        if not os.path.exists(file_path):
            print("Failed to find path.")
            continue
        shutil.copyfile(file_path, os.path.join(args.extractpath, image_name))

        ##Essential
        FinalTagStringGeneral = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_general)
        FinalTagStringGeneral = ConvTagsToHuman(convtohuman, FinalTagStringGeneral)

        ##Not essential. Each variable is reset to "" for every entry so a
        ##missing field can neither raise NameError nor reuse the previous
        ##entry's value.
        FinalTagStringArtist = ""
        if img_tag_string_artist is not None:
            FinalTagStringArtist = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_artist)
        else:
            print("img_tag_string_artist is none")

        FinalTagStringCharacter = ""
        if img_tag_string_character is not None:
            FinalTagStringCharacter = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_character)
            FinalTagStringCharacter = ConvCharacterToHuman(convtohuman, FinalTagStringCharacter)
        else:
            print("img_tag_string_character is none")

        FinalTagStringCopyright = ""
        if img_tag_string_copyright is not None:
            FinalTagStringCopyright = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_copyright)
        else:
            print("img_tag_string_copyright is none")

        print("IMAGE RATING IS: " + img_rating)
        # ConvRatingToHuman returns None for a code it does not know.
        FinalTagStringRating = ConvRatingToHuman(convtohuman, img_rating) or ""

        # Join only the non-empty parts so tag_separator never appears with
        # nothing behind it.
        caption_parts = [
            FinalTagStringCharacter,
            FinalTagStringArtist,
            FinalTagStringRating,
            FinalTagStringGeneral,
            FinalTagStringCopyright,
        ]
        to_write = tag_separator.join(part for part in caption_parts if part)
        txt_name = os.path.join(args.extractpath, img_id + '.txt')
        writefile(txt_name, to_write)
        current_saved_file_count = current_saved_file_count + 1

print("finished process. Your extracted data should be in " + str(args.extractpath) + " !")
2022-09-09 02:49:52 -06:00