Commit 62a132a
#UPDATE Unify model loading precision, fix the bug where the model could not be switched a second time, and add a rename switch to advanced batch captioning
EvilBT committed Oct 18, 2024
1 parent a366a0d commit 62a132a
Showing 5 changed files with 40 additions and 18 deletions.
43 changes: 29 additions & 14 deletions joy_caption_two_node.py
@@ -1,3 +1,4 @@
+import shutil
 import time
 
 import numpy as np
@@ -130,6 +131,8 @@ def __init__(self, load_device, offload_device):
         image_adapter = ImageAdapter(1152, 4096, False, False, 38,
                                      False)  # ImageAdapter(clip_model.config.hidden_size, 4096)
         image_adapter.load_state_dict(torch.load(adapter_path, map_location=self.offload_device, weights_only=True))
+        img_dtype = text_encoder_dtype()
+        image_adapter = image_adapter.to(img_dtype)
         image_adapter.eval()
         self.image_adapter = image_adapter
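The two added lines cast the image adapter to the same dtype the text encoder runs in, so the whole pipeline uses one precision. A minimal sketch of the pattern, where `unified_dtype` is an illustrative stand-in for the repo's `text_encoder_dtype()` helper and `nn.Linear` stands in for the real `ImageAdapter`:

```python
import torch
import torch.nn as nn

def unified_dtype() -> torch.dtype:
    # Illustrative stand-in for text_encoder_dtype(): prefer bf16 where supported.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float32

adapter = nn.Linear(1152, 4096)        # placeholder for the real ImageAdapter module
adapter = adapter.to(unified_dtype())  # cast weights to match the text encoder
adapter.eval()                         # inference mode: no dropout, frozen stats
```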

@@ -148,6 +151,7 @@ def __init__(self, load_device, offload_device, model_id):
         self.offload_device = offload_device
         self.type = text_encoder_dtype()
         self.model_id = model_id
+        self.current_model_id = None
 
         print("Loading tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(os.path.join(BASE_MODEL_PATH, "text_model"), use_fast=True)
@@ -158,8 +162,8 @@ def __init__(self, load_device, offload_device, model_id):
         self.text_model = None
 
     def load_llm_model(self):
-        if self.text_model is None:
-            print("Loading LLM")
+        if self.text_model is None or self.current_model_id != self.model_id:
+            print(f"Loading LLM: {self.model_id}")
             LLM_PATH = download_hg_model(self.model_id, "LLM")
             text_model_path = os.path.join(BASE_MODEL_PATH, "text_model")
             modify_json_value(os.path.join(text_model_path, "adapter_config.json"), "base_model_name_or_path",
@@ -191,11 +195,13 @@ def load_llm_model(self):
                     break
                 time.sleep(1 + retries / 2)
         # print(f"Now: {get_free_memory()/1024/1024}")
+        self.current_model_id = self.model_id
         return self.text_model
 
     def clear_gpu(self, low_vram):
         del self.text_model
         self.text_model = None
+        self.current_model_id = None
         torch.cuda.empty_cache()
         import gc
         gc.collect()
@@ -225,7 +231,10 @@ def loadModels(self):
         self.image_adapter = JoyImageAdapter(self.load_device, self.offload_device)
 
     def loadLLM(self, model_id):
-        self.llm = JoyLLM(self.load_device, self.offload_device, model_id)
+        if self.llm is None:
+            self.llm = JoyLLM(self.load_device, self.offload_device, model_id)
+        else:
+            self.llm.model_id = model_id
 
 
 class Joy_caption_two_load:
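Taken together with the `current_model_id` bookkeeping above, `loadLLM` now implements a small model-id cache: the wrapper is created once, later calls only record the requested id, and `load_llm_model` decides whether a reload is needed. A self-contained sketch of the pattern (names simplified; the actual model loading is elided):

```python
class CachedLLM:
    """Reload the model only when the requested id differs from the loaded one."""

    def __init__(self, model_id):
        self.model_id = model_id      # what the caller wants
        self.current_model_id = None  # what is actually in memory
        self.text_model = None

    def load(self):
        if self.text_model is None or self.current_model_id != self.model_id:
            print(f"Loading LLM: {self.model_id}")
            self.text_model = f"<model {self.model_id}>"  # placeholder for the real load
            self.current_model_id = self.model_id
        return self.text_model

    def clear(self):
        self.text_model = None
        self.current_model_id = None  # force a reload on next use


class Pipeline:
    """Simplified stand-in for JoyTwoPipeline's loadLLM."""

    def __init__(self):
        self.llm = None

    def loadLLM(self, model_id):
        if self.llm is None:
            self.llm = CachedLLM(model_id)  # first call: create the wrapper
        else:
            self.llm.model_id = model_id    # later calls: just record the request


pipe = Pipeline()
pipe.loadLLM("model-a"); pipe.llm.load()  # loads model-a
pipe.loadLLM("model-a"); pipe.llm.load()  # cache hit: nothing reloads
pipe.loadLLM("model-b"); pipe.llm.load()  # id changed: reloads as model-b
```

Before this commit, the call sites guarded `loadLLM` with `if joy_two_pipeline.llm is None`, so a newly selected model id never reached the wrapper; calling `loadLLM` unconditionally and caching by id is what makes switching work, as the hunks below show.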
@@ -348,8 +357,7 @@ def generate(self, joy_two_pipeline: JoyTwoPipeline, image, caption_type, captio
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -512,8 +520,7 @@ def generate(self, joy_two_pipeline: JoyTwoPipeline, image, extra_options, capti
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -628,8 +635,7 @@ def generate_caption(self, joy_two_pipeline: JoyTwoPipeline, image, prompt, low_
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -779,6 +785,9 @@ def INPUT_TYPES(s):
                 "joy_two_pipeline": ("JoyTwoPipeline",),
                 "input_dir": ("STRING", {"default": ""}),
                 "output_dir": ("STRING", {"default": ""}),
+                "rename": ("BOOLEAN", {"default": False}),
+                "prefix_name": ("STRING", {"default": ""}),
+                "start_index": ("INT", {"default": 1, "min": 0, "max": 9999999, "step": 1}),
                 "extra_options": ("Extra_Options", ),
                 "caption_type": (caption_types, {}),
                 "caption_length": (caption_lengths, {"default": "long"}),
@@ -822,8 +831,7 @@ def generate_caption(self, joy_two_pipeline: JoyTwoPipeline, image, prompt, top_
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -883,7 +891,7 @@ def generate_caption(self, joy_two_pipeline: JoyTwoPipeline, image, prompt, top_
 
         return caption.strip()
 
-    def generate(self, joy_two_pipeline: JoyTwoPipeline, input_dir, output_dir, extra_options, caption_type, caption_length, name, custom_prompt, low_vram, top_p, temperature):
+    def generate(self, joy_two_pipeline: JoyTwoPipeline, input_dir, output_dir, rename, prefix_name, start_index, extra_options, caption_type, caption_length, name, custom_prompt, low_vram, top_p, temperature):
         torch.cuda.empty_cache()
 
         if joy_two_pipeline.clip_model == None:
@@ -946,10 +954,17 @@ def generate(self, joy_two_pipeline: JoyTwoPipeline, input_dir, output_dir, extr
             print(f"Opening {image_path}")
             with Image.open(image_path) as img:
                 if img.mode == 'RGBA':
-                    img = img.convert('RGB')
+                    image = img.convert('RGB').resize((384, 384), Image.LANCZOS)
+                else:
+                    image = img.resize((384, 384), Image.LANCZOS)
                 pbar.update_absolute(step, image_count)
-                image = img.resize((384, 384), Image.LANCZOS)
                 caption = self.generate_caption(joy_two_pipeline, image, prompt_str, top_p, temperature)
+                if rename:
+                    new_filename = f"{prefix_name}_{start_index + finished_image_count}{os.path.splitext(filename)[1]}"
+                    new_image_path = os.path.join(output_dir, new_filename)
+                    shutil.copyfile(image_path, new_image_path)
+                    text_path = os.path.join(output_dir, os.path.splitext(new_filename)[0] + '.txt')
 
                 with open(text_path, 'w', encoding='utf-8') as f:
                     f.write(caption)
                 finished_image_count += 1
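Pulled out of the loop, the rename branch amounts to the helper below. The function name and signature are mine, and the non-rename `text_path` fallback is an assumption, since in the hunk that path is assigned earlier in code that isn't shown:

```python
import os
import shutil

def save_caption(image_path, caption, output_dir, rename, prefix_name, index):
    """Optionally copy the image under a sequential name, then write the
    caption to a .txt file with the matching stem."""
    filename = os.path.basename(image_path)
    if rename:
        ext = os.path.splitext(filename)[1]
        new_filename = f"{prefix_name}_{index}{ext}"
        shutil.copyfile(image_path, os.path.join(output_dir, new_filename))
        stem = os.path.splitext(new_filename)[0]
    else:
        stem = os.path.splitext(filename)[0]  # assumed fallback, not shown in the hunk
    with open(os.path.join(output_dir, stem + ".txt"), "w", encoding="utf-8") as f:
        f.write(caption)
```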
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui_slk_joy_caption_two"
 description = "NODES:Joy Caption Two, Joy Caption Two Advanced, Joy Caption Two Load, Joy Caption Extra Options"
-version = "0.0.6"
+version = "0.0.7"
 license = {file = "LICENSE"}
 dependencies = ["huggingface_hub==0.23.4", "transformers>=4.44.0", "numpy", "sentencepiece", "pillow>=10.1.0", "bitsandbytes>=0.44.1", "peft==0.12.0"]

3 changes: 3 additions & 0 deletions readme.md
@@ -1,7 +1,10 @@
 # JoyCaptionAlpha Two for ComfyUI
 [English](./readme_us.md) | 中文
 
+The original Joy Caption author is here: https://github.com/fpgaminer/joycaption. Many thanks for open-sourcing it!
+
 ## Recent changes
+* [2024-10-16] v0.0.7: Unified model loading precision, fixed the bug where the model could not be switched a second time, and added a rename switch to advanced batch captioning.
 * [2024-10-16] v0.0.6: Added top_p and temperature to advanced mode for more flexibility, plus more LLM choices. I tried [John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4](https://huggingface.co/John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4)
   with good results; feel free to try it too. The original model [Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2](https://huggingface.co/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2) is also available as an option.
 * [2024-10-15] v0.0.5: Fixed a bug in batch processing when images have an RGBA alpha channel.
4 changes: 4 additions & 0 deletions readme_us.md
@@ -1,6 +1,10 @@
 # JoyCaptionAlpha Two for ComfyUI
 English | [中文](./readme.md)
 
+Huge thanks to the original Joy Caption author, fpgaminer, whose work is available at: https://github.com/fpgaminer/joycaption. We deeply appreciate their contribution to open source!
+
 ## Recent changes
+* [2024-10-16] v0.0.7: Unified model loading precision, fixed the bug where the model could not be switched the second time, and added a rename switch to advanced batch captioning.
 * [2024-10-16] v0.0.6: Added `top_p` and `temperature` parameters to the advanced mode for greater control. Expanded the selection of large language models. I tested [John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4](https://huggingface.co/John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4) and found the results quite good; you can also try it out. Additionally, the original [Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2](https://huggingface.co/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2) model has been added as an option.
 * [2024-10-15] v0.0.5: Fix the bug when processing images with an alpha channel (RGBA) in batch.
 * [2024-10-15] v0.0.4: Added batch processing nodes: When the output directory is empty, it will be saved in the image folder. You can find the example workflow in the examples directory.
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,7 +1,7 @@
 huggingface_hub==0.23.4
 transformers>=4.44.0
-numpy
-sentencepiece
-pillow>=10.1.0
+numpy==1.26.4
+sentencepiece==0.2.0
+pillow>=10.4.0
 bitsandbytes>=0.44.1
 peft>=0.12.0
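With the bare `numpy` and `sentencepiece` entries now pinned, a quick standard-library check can confirm an existing environment matches the new requirements:

```python
from importlib.metadata import version

# Print installed versions of each pinned dependency for comparison.
for pkg in ("huggingface_hub", "transformers", "numpy",
            "sentencepiece", "pillow", "bitsandbytes", "peft"):
    print(f"{pkg}=={version(pkg)}")
```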
