waifu-diffusion/scripts/danbooru21_extract.py

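# Extracts training pairs from a Danbooru2021 metadata shard: for every post whose
# 512px image exists on disk, the script copies the image into labeled_data/ and
# writes a matching .txt caption built from the post's tags and rating.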

import json
import os
import shutil

json_file_path = "metadata/posts000000000000.json"  ## Name of the JSON-lines file to use
os.makedirs("labeled_data", exist_ok=True)  # Output dir; exist_ok avoids crashing on re-runs
def writefile(filename, text):
    with open(filename, "w", encoding="utf8") as f:
        f.write(text)
    print('Saved the following: ' + text)
# Converts tags to T2I-like prompts (blue_dress 1girl -> blue dress, one girl)
def convert_tags_to_humantxt(tag_string):
    tars = tag_string
    tars = tars.replace(' ', ', ')
    tars = tars.replace('_', ' ')
    tars = tars.replace('1girl', 'one girl')
    tars = tars.replace('2girls', 'two girls')
    tars = tars.replace('3girls', 'three girls')
    tars = tars.replace('4girls', 'four girls')
    tars = tars.replace('5girls', 'five girls')
    ## Implying it will ever be able to differentiate so many entities
    tars = tars.replace('6girls', 'six girls')
    # Almost forgot about the boys tags... I wonder if there are also some for other entities?
    tars = tars.replace('1boy', 'one boy')
    tars = tars.replace('2boys', 'two boys')
    tars = tars.replace('3boys', 'three boys')
    tars = tars.replace('4boys', 'four boys')
    tars = tars.replace('5boys', 'five boys')
    tars = tars.replace('6boys', 'six boys')
    print("FINAL TARS: " + tars)
    return tars
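# Worked example (assuming a typical space-separated tag_string):
#   convert_tags_to_humantxt("blue_dress 1girl solo") -> "blue dress, one girl, solo"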
# Converts single-letter Danbooru ratings to human-readable phrases
def convert_rating_to_humanrating(rating):
    if rating == "e":
        return "explicit content"
    if rating == "g":
        return "general content"
    if rating == "q":
        return "questionable content"
    if rating == "s":
        return "sensitive content"
    return "unrated content"  # Fallback so the caption never gets None concatenated in
## This will be the start of everything unethical
## Open the file: each line is one standalone JSON object (JSON-lines format)
with open(json_file_path, 'r', encoding="utf8") as json_file:
    json_list = list(json_file)

## Read line by line
total_lines = len(json_list)  # 415627 lines in posts000000000000.json
current_saved_file_count = 0
current_line_count = 0
for json_str in json_list:
    current_line_count = current_line_count + 1
    print("Current line: " + str(current_line_count) + '/' + str(total_lines)
          + ' | Current saved files count: ' + str(current_saved_file_count))
    # Here, result = the parsed line (one post's metadata)
    result = json.loads(json_str)
    try:
        img_id = str(result['id'])
    except Exception:
        print("failed to get img_id")
        continue
    try:
        # Last three digits of the id, zero-padded, pick the storage bucket
        img_id_last3 = img_id[-3:].zfill(3)
    except Exception:
        print("failed to get img_id_last3")
        continue
    try:
        img_tags = result['tag_string']
    except Exception:
        print("failed to get img_tags")
        continue
    try:
        img_ext = result['file_ext']
    except Exception:
        print("failed to get img_ext")
        continue
    try:
        img_rating = result['rating']
    except Exception:
        print("failed to get img_rating")
        continue
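    # Hypothetical example: id 12345 -> img_id_last3 "345" -> "512px/0345/12345.jpg";
    # the 512px dump stores images in 0xxx folders keyed by the last three digits of the id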
    file_path = "512px/0" + img_id_last3 + "/" + img_id + "." + img_ext
    if os.path.exists(file_path):
        shutil.copyfile(file_path, 'labeled_data/' + img_id + "." + img_ext)
        humanoid_tags = convert_tags_to_humantxt(img_tags)
        humanoid_rating = convert_rating_to_humanrating(img_rating)
        to_write = humanoid_tags + ', ' + humanoid_rating + ', uploaded on Danbooru'
        txt_name = "labeled_data/" + img_id + '.txt'
        writefile(txt_name, to_write)
        current_saved_file_count = current_saved_file_count + 1
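# Resulting layout (sketch): labeled_data/<id>.<ext> paired with labeled_data/<id>.txt,
# where the .txt holds a caption such as
# "one girl, solo, sensitive content, uploaded on Danbooru"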