From 4904ba4db3e177e0460fd8d7561ec0caf8848942 Mon Sep 17 00:00:00 2001
From: chavinlo <85657083+chavinlo@users.noreply.github.com>
Date: Fri, 9 Sep 2022 03:49:52 -0500
Subject: [PATCH 1/3] Add CLI arguments and per-category tag handling (in the works)

---
 scripts/danbooru21_extract.py | 233 +++++++++++++++++++++++++++-------
 1 file changed, 188 insertions(+), 45 deletions(-)

diff --git a/scripts/danbooru21_extract.py b/scripts/danbooru21_extract.py
index 1d383dd..c664214 100644
--- a/scripts/danbooru21_extract.py
+++ b/scripts/danbooru21_extract.py
@@ -1,9 +1,31 @@
+## This script WAS NOT USED on the weights released by ProjectAI Touhou on 8th of september, 2022.
+## This script CAN convert tags to human-readable-text BUT IT IS NOT REQUIRED.
+import argparse
+#Stolen code from https://stackoverflow.com/a/43357954
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+## In the future someone might want to access this via import. Consider adding support for that
+parser = argparse.ArgumentParser()
+parser.add_argument('--jsonpath', '-J', type=str, help='Path to JSONL file with the metadata', required = True)
+parser.add_argument('--extractpath', '-E', type=str, help='Path to the folder where to extract the images and text files', required = True)
+parser.add_argument('--imagespath', '-I', type=str, help='Path to the folder with the images', required = False, default="512px")
+parser.add_argument('--convtohuman', '-H', type=str2bool, help='Convert to human-readable-text', required = False, default=True)
+args = parser.parse_args()
+print("Arguments: " + str(args))
+
 import json
 import os
 import shutil
-json_file_path = "metadata/posts000000000000.json" ##Name of the JSON file to use
-
-os.mkdir("labeled_data")
+if os.path.exists(args.extractpath) == False:
+    os.mkdir(args.extractpath)
 
 def writefile(filename, text):
     f = open(filename, "w")
@@ -11,41 +33,84 @@ def writefile(filename, text):
     print('Saved the following: ' + text)
     f.close()
 #Converts tags to T2I-like prompts (blue_dress, 1girl -> A blue dress, one girl)
-def convert_tags_to_humantxt(input):
-    tars = input
-    tars = tars.replace(' ', ', ')
-    tars = tars.replace('_', ' ')
-    tars = tars.replace('1girl', 'one girl')
-    tars = tars.replace('2girls', 'two girls')
-    tars = tars.replace('3girls', 'three girls')
-    tars = tars.replace('4girls', 'four girls')
-    tars = tars.replace('5girls', 'five girls')
-    ##Implying it will ever be able to differentiate so many entities
-    tars = tars.replace('6girls', 'six girls')
 
-    #Almost forgot about boys tags... I wonder if theres also for other entities?
-    tars = tars.replace('1boy', 'one boy')
-    tars = tars.replace('2boys', 'two boys')
-    tars = tars.replace('3boys', 'three boys')
-    tars = tars.replace('4boys', 'four boys')
-    tars = tars.replace('5boys', 'five boys')
-    tars = tars.replace('6boys', 'six boys')
-    print("FINAL TARS: " + tars)
+def ConvCommaAndUnderscoreToHuman(convtohuman, input):
+    tars = input
+    if convtohuman:
+        tars = tars.replace(' ', ', ')
+        tars = tars.replace('_', ' ')
+    elif convtohuman == False:
+        print("CommaAndUnderscoreToHuman: convtohuman is false hence not doing anything")
+
+def ConvTagsToHuman(convtohuman, input):
+    tars = input
+    if convtohuman:
+        tars = tars.replace('1girl', 'one girl')
+        tars = tars.replace('2girls', 'two girls')
+        tars = tars.replace('3girls', 'three girls')
+        tars = tars.replace('4girls', 'four girls')
+        tars = tars.replace('5girls', 'five girls')
+        ##Implying it will ever be able to differentiate so many entities
+        tars = tars.replace('6girls', 'six girls')
+
+        #Almost forgot about boys tags... I wonder if theres also for other entities?
+        tars = tars.replace('1boy', 'one boy')
+        tars = tars.replace('2boys', 'two boys')
+        tars = tars.replace('3boys', 'three boys')
+        tars = tars.replace('4boys', 'four boys')
+        tars = tars.replace('5boys', 'five boys')
+        tars = tars.replace('6boys', 'six boys')
+    elif convtohuman == False:
+        print("ConvTagsToHuman: convtohuman is false hence not doing anything")
     return tars
 
 #Converts ratings to X content
-def convert_rating_to_humanrating(input):
-    if input == "e":
-        return "explicit content"
-    if input == "g":
-        return "general content"
-    if input == "q":
-        return "questionable content"
-    if input == "s":
-        return "sensitive content"
-        ##This will be the start of everything unethical
+def ConvRatingToHuman(convtohuman, input):
+    """Translate a one-letter rating code into a descriptive rating tag.
+
+    Returns the spaced label when ``convtohuman`` is truthy, the underscored
+    label when it equals False, and None for any other flag or unknown code.
+    """
+    if convtohuman:
+        column = 0
+    elif convtohuman == False:
+        column = 1
+    else:
+        return None
+    labels = (
+        ("e", ("explicit content", "explicit_content")),
+        ("g", ("general content", "general_content")),
+        ("q", ("questionable content", "questionable_content")),
+        ("s", ("sensitive content", "sensitive_content")),
+    )
+    # Compare with == one code at a time; the first match wins.
+    return next((pair[column] for code, pair in labels if input == code), None)
+
+def ConvCharacterToHuman(convtohuman, input):
+    tars = input
+    if convtohuman:
+        tars = tars.replace('_(', ' from ')
+        tars = tars.replace(')', '')
+    elif convtohuman == False:
+        print("ConvCharacterToHuman: convtohuman is false hence not doing anything")
+
+# unrecog_ans = True
+# while unrecog_ans:
+#     inputans = input("Convert tags to human-readable-text? (smiley_face blue_hair -> smiley face, blue hair) [y/n]")
+#     if inputans == "y":
+#         convtohuman = True
+#         unrecog_ans = False
+#     elif inputans == "n":
+#         convtohuman = False
+#         unrecog_ans = False
+#     else:
+#         print("unrecognizable input. only y or n.")
+#         unrecog_ans = True
+
+convtohuman = args.convtohuman
 
 ##Open the file
+json_file_path = args.jsonpath ##Name of the JSON file to use, converted into parser arg
 with open(json_file_path, 'r', encoding="utf8") as json_file:
     json_list = list(json_file)
 
@@ -55,7 +120,8 @@ current_line_count = 0
 for json_str in json_list:
     current_line_count = current_line_count + 1
     ##415627 last line of 00.json, ignore
-    print("Current Line:" + str(current_line_count) + '/415627 | Current saved files count: ' + str(current_saved_file_count) )
+    ##TODO: Add a line counter to print progress accurately
+    print("Current Line:" + str(current_line_count) + '/415000 (aprox) | Current saved files count: ' + str(current_saved_file_count) )
     #here, result = line
     result = json.loads(json_str)
 
@@ -63,7 +129,7 @@ for json_str in json_list:
         img_id = str(result['id'])
     except Exception:
         img_id = "nan"
-        print("failed to get img_id")
+        print("img_id RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
         continue
 
     try:
@@ -71,36 +137,113 @@ for json_str in json_list:
         img_id_last3 = tmp_img_id.zfill(3)
     except Exception:
         img_id_last3 = "nan"
-        print("failed to get img_id_last3")
+        print("img_id_last3 RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
         continue
     
+    # try:
+    #     img_tags = result['tag_string']
+    # except Exception:
+    #     img_tags = "none"
+    #     print("failed to get img_tags")
+    #     continue
+
+    ##JohannesGaessler SUGGESTIONS: harubaru/waifu-diffusion/pull/11
+
+        ## TAG_STRING_GENERAL: ONLY TAGS HERE
     try:
-        img_tags = result['tag_string']
+        img_tag_string_general = result['tag_string_general']
     except Exception:
-        img_tags = "none"
-        print("failed to get img_tags")
+        img_tag_string_general = None
+        print("img_tag_string_general RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
         continue
 
+        ## TAG_STRING_ARTIST: ONLY ARTISTS TAGS HERE
+    try:
+        img_tag_string_artist = result['tag_string_artist']
+    except Exception:
+        img_tag_string_artist = None
+        print("img_tag_string_artist RETRIVAL FAILED. Var is not essential so just skipping var.")
+        pass
+
+        ## TAG_STRING_COPYRIGHT: ONLY COPYRIGHT TAGS HERE
+    try:
+        img_tag_string_copyright = result['tag_string_copyright']
+    except Exception:
+        img_tag_string_copyright = None
+        print("img_tag_string_copyright RETRIVAL FAILED. Var is not essential so just skipping var.")
+        pass
+
+        ## TAG_STRING_CHARACTER: ONLY CHARACTER TAGS HERE
+    try:
+        img_tag_string_character = result['tag_string_character']
+    except Exception:
+        img_tag_string_character = None
+        print("img_tag_string_character RETRIVAL FAILED. Var is not essential so just skipping var.")
+        pass
+
     try:
         img_ext = result['file_ext']
     except Exception:
+        file_ext = None
         print("failed to get img_ext")
         continue
 
     try:
         img_rating = result['rating']
     except Exception:
+        img_rating = None
         print("failed to get img_rating")
         continue
 
-    file_path = "512px/0" + img_id_last3 + "/" + img_id + "." + img_ext
+    file_path =  str(args.imagespath) + "/0" + img_id_last3 + "/" + img_id + "." + img_ext
     if os.path.exists(file_path):
-        shutil.copyfile(file_path, 'labeled_data/' + img_id + "." + img_ext)
-        humanoid_tags = convert_tags_to_humantxt(img_tags)
-        humanoid_rating = convert_rating_to_humanrating(img_rating)
-        to_write = humanoid_tags + ', ' + humanoid_rating + ', uploaded on Danbooru'
-        txt_name = "labeled_data/" + img_id + '.txt'
+        shutil.copyfile(file_path, args.extractpath + '/' + img_id + "." + img_ext)
+
+        ##Essential
+        FinalTagStringGeneral = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_general)
+        print(FinalTagStringGeneral)
+        FinalTagStringGeneral = ConvTagsToHuman(convtohuman, FinalTagStringGeneral)
+
+        ##Not essential
+        if img_tag_string_artist != None:
+            FinalTagStringArtist = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_artist)
+        elif img_tag_string_artist == None:
+            print("img_tag_string_artist is none")
+        else:
+            print("CE 1NE")
+        
+        if img_tag_string_character != None:
+            FinalTagStringCharacter = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_character)
+            FinalTagStringCharacter = ConvCharacterToHuman(convtohuman, FinalTagStringCharacter)
+        elif img_tag_string_character == None:
+            print("img_tag_string_character is none")
+        else:
+            print("CE 2NE")
+
+        if img_tag_string_copyright != None:
+             FinalTagStringCopyright = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_copyright)
+        elif img_tag_string_copyright == None:
+            print("img_tag_string_copyright is none")
+        else:
+            print("CE 3NE")
+
+        if img_rating != None:
+             FinalTagStringRating = ConvRatingToHuman(convtohuman, img_rating)
+        elif img_rating == None:
+            print("img_rating is none")
+        else:
+            print("CE 4NE")
+
+        if convtohuman == True:
+            dan_iden = 'uploaded on danbooru'
+            tag_separator = ', '
+        elif convtohuman == False:
+            dan_iden = 'danbooru'
+            tag_separator = ' '
+        to_write = FinalTagStringCharacter + tag_separator + FinalTagStringArtist + tag_separator + FinalTagStringRating + tag_separator + FinalTagStringGeneral + tag_separator + FinalTagStringCopyright
+        txt_name = args.extractpath +  "/" + img_id + '.txt'
         writefile(txt_name, to_write)
         current_saved_file_count = current_saved_file_count + 1
-        
 
+print("finished process. Your extracted data should be in " + args.extractpath + " !")
+        

From f07fb01490793e5fa31be572b40139e57ca423aa Mon Sep 17 00:00:00 2001
From: chavinlo <85657083+chavinlo@users.noreply.github.com>
Date: Sat, 10 Sep 2022 19:38:22 -0500
Subject: [PATCH 2/3] Add rating selection; fix missing converter returns so the script now works

---
 scripts/danbooru21_extract.py | 64 +++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/scripts/danbooru21_extract.py b/scripts/danbooru21_extract.py
index c664214..567b6e6 100644
--- a/scripts/danbooru21_extract.py
+++ b/scripts/danbooru21_extract.py
@@ -1,6 +1,8 @@
 ## This script WAS NOT USED on the weights released by ProjectAI Touhou on 8th of september, 2022.
 ## This script CAN convert tags to human-readable-text BUT IT IS NOT REQUIRED.
+
 import argparse
+import string
 #Stolen code from https://stackoverflow.com/a/43357954
 def str2bool(v):
     if isinstance(v, bool):
@@ -12,13 +14,38 @@ def str2bool(v):
     else:
         raise argparse.ArgumentTypeError('Boolean value expected.')
 
+def ratingparsing(input):
+    """argparse ``type=`` callback that parses the --rating selector.
+
+    ``input`` may contain any of the letters a/e/g/q/s in either case, where
+    "a" selects every rating. Returns the selected codes as a space-separated
+    string in e, g, q, s order. Raises argparse.ArgumentTypeError when none of
+    the letters is present, so argparse reports a usage error.
+    """
+    v = input.lower()
+    # "a" replaces the selection outright instead of having codes glued onto it.
+    codes = "egqs" if "a" in v else [c for c in "egqs" if c in v]
+    if not codes:
+        # A plain Exception would escape argparse as a traceback.
+        raise argparse.ArgumentTypeError('a/e/g/q/s expected')
+    ratingsSelected = " ".join(codes)
+    print("Ratings selected: " + ratingsSelected)
+    return ratingsSelected
 ## In the future someone might want to access this via import. Consider adding support for that
+
 parser = argparse.ArgumentParser()
 parser.add_argument('--jsonpath', '-J', type=str, help='Path to JSONL file with the metadata', required = True)
 parser.add_argument('--extractpath', '-E', type=str, help='Path to the folder where to extract the images and text files', required = True)
 parser.add_argument('--imagespath', '-I', type=str, help='Path to the folder with the images', required = False, default="512px")
-parser.add_argument('--convtohuman', '-H', type=str2bool, help='Convert to human-readable-text', required = False, default=True)
+parser.add_argument('--convtohuman', '-H', type=str2bool, help='Convert to human-readable-text', required = False, default=False)
+parser.add_argument('--rating', '-R', type=ratingparsing, help='Extract specific rating/s [a/e/g/q/s]', required = False, default='a')
 args = parser.parse_args()
+if args.convtohuman:
+    # Human-readable conversion is switched off on purpose: to_write can end up with a
+    # dangling tag_separator when a tag group is blank, and converting to human text
+    # loses many tokens, so tag-based captions are the more effective input for now.
+    print("tag conversion to human is currently somewhat broken. If you still want to use it remove the exit right after this message")
+    raise SystemExit  # exits with the same status as exit(), without relying on the site helper
 print("Arguments: " + str(args))
 
 import json
@@ -41,6 +68,7 @@ def ConvCommaAndUnderscoreToHuman(convtohuman, input):
         tars = tars.replace('_', ' ')
     elif convtohuman == False:
         print("CommaAndUnderscoreToHuman: convtohuman is false hence not doing anything")
+    return tars
 
 def ConvTagsToHuman(convtohuman, input):
     tars = input
@@ -62,6 +90,7 @@ def ConvTagsToHuman(convtohuman, input):
         tars = tars.replace('6boys', 'six boys')
     elif convtohuman == False:
         print("ConvTagsToHuman: convtohuman is false hence not doing anything")
+        print("TARS is: " + tars)
     return tars
 
 #Converts ratings to X content
@@ -93,6 +122,7 @@ def ConvCharacterToHuman(convtohuman, input):
         tars = tars.replace(')', '')
     elif convtohuman == False:
         print("ConvCharacterToHuman: convtohuman is false hence not doing anything")
+    return tars
 
 # unrecog_ans = True
 # while unrecog_ans:
@@ -108,6 +138,7 @@ def ConvCharacterToHuman(convtohuman, input):
 #         unrecog_ans = True
 
 convtohuman = args.convtohuman
+acceptedRatings = args.rating
 
 ##Open the file
 json_file_path = args.jsonpath ##Name of the JSON file to use, converted into parser arg
@@ -184,24 +215,35 @@ for json_str in json_list:
     try:
         img_ext = result['file_ext']
     except Exception:
-        file_ext = None
-        print("failed to get img_ext")
+        img_ext = None
+        print("img_ext RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
         continue
 
     try:
         img_rating = result['rating']
     except Exception:
         img_rating = None
-        print("failed to get img_rating")
+        print("img_rating RETRIVAL FAILED. VAR IS ESSENTIAL SO SKIPPING ENTRY.")
         continue
 
+    baru = img_rating in acceptedRatings
+
+    # print("HEYYYYYYYYYYYYYYYY " + str(baru))
+
+    if str(baru) == "False":
+        print("Entry rating' is not in acceptedRatings, skipping entry.")
+        continue
+    elif str(baru) == "True":
+        print("Entry rating matches!")
+
+
+
     file_path =  str(args.imagespath) + "/0" + img_id_last3 + "/" + img_id + "." + img_ext
     if os.path.exists(file_path):
         shutil.copyfile(file_path, args.extractpath + '/' + img_id + "." + img_ext)
 
         ##Essential
         FinalTagStringGeneral = ConvCommaAndUnderscoreToHuman(convtohuman, img_tag_string_general)
-        print(FinalTagStringGeneral)
         FinalTagStringGeneral = ConvTagsToHuman(convtohuman, FinalTagStringGeneral)
 
         ##Not essential
@@ -227,6 +269,8 @@ for json_str in json_list:
         else:
             print("CE 3NE")
 
+        print("IMAGE RATING IS: " + img_rating)
+
         if img_rating != None:
              FinalTagStringRating = ConvRatingToHuman(convtohuman, img_rating)
         elif img_rating == None:
@@ -240,10 +284,18 @@ for json_str in json_list:
         elif convtohuman == False:
             dan_iden = 'danbooru'
             tag_separator = ' '
+        # print('FinalTagStringCharacter is: ' + FinalTagStringCharacter)
+        # print('tag_separator is: ' + tag_separator)
+        # print('FinalTagStringArtist is: ' + FinalTagStringArtist)
+        # print('FinalTagStringRating is: ' + FinalTagStringRating)
+        # print('FinalTagStringGeneral is: ' + FinalTagStringGeneral)
+        # print('FinalTagStringCopyright is: ' + FinalTagStringCopyright)
         to_write = FinalTagStringCharacter + tag_separator + FinalTagStringArtist + tag_separator + FinalTagStringRating + tag_separator + FinalTagStringGeneral + tag_separator + FinalTagStringCopyright
         txt_name = args.extractpath +  "/" + img_id + '.txt'
         writefile(txt_name, to_write)
         current_saved_file_count = current_saved_file_count + 1
+    elif os.path.exists(file_path) == False:
+        print("Failed to find path.")
 
-print("finished process. Your extracted data should be in " + args.extractpath + " !")
+print("finished process. Your extracted data should be in " + str(args.extractpath) + " !")
         

From 7f901bf25211010812003e12fe99c785b57b6aa9 Mon Sep 17 00:00:00 2001
From: chavinlo <85657083+chavinlo@users.noreply.github.com>
Date: Sat, 10 Sep 2022 19:39:49 -0500
Subject: [PATCH 3/3] removed string import

---
 scripts/danbooru21_extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/danbooru21_extract.py b/scripts/danbooru21_extract.py
index 567b6e6..b81ac5b 100644
--- a/scripts/danbooru21_extract.py
+++ b/scripts/danbooru21_extract.py
@@ -2,7 +2,7 @@
 ## This script CAN convert tags to human-readable-text BUT IT IS NOT REQUIRED.
 
 import argparse
-import string
+
 #Stolen code from https://stackoverflow.com/a/43357954
 def str2bool(v):
     if isinstance(v, bool):