Commit 62a132a
#UPDATE Unify model loading precision, fix the bug where the model could not be switched a second time, and add a rename switch to advanced batch captioning
EvilBT committed Oct 18, 2024
1 parent a366a0d commit 62a132a
Showing 5 changed files with 40 additions and 18 deletions.
43 changes: 29 additions & 14 deletions joy_caption_two_node.py
@@ -1,3 +1,4 @@
+import shutil
 import time
 
 import numpy as np
@@ -130,6 +131,8 @@ def __init__(self, load_device, offload_device):
         image_adapter = ImageAdapter(1152, 4096, False, False, 38,
                                      False)  # ImageAdapter(clip_model.config.hidden_size, 4096)
         image_adapter.load_state_dict(torch.load(adapter_path, map_location=self.offload_device, weights_only=True))
+        img_dtype = text_encoder_dtype()
+        image_adapter = image_adapter.to(img_dtype)
         image_adapter.eval()
         self.image_adapter = image_adapter
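The two added lines cast the image adapter to the same dtype the text encoder runs in, so the whole pipeline uses one precision. A minimal sketch of the pattern, where `unified_dtype` is an illustrative stand-in for the repo's `text_encoder_dtype()` helper and `nn.Linear` stands in for the real `ImageAdapter`:

```python
import torch
import torch.nn as nn

def unified_dtype() -> torch.dtype:
    # Illustrative stand-in for text_encoder_dtype(): prefer bf16 where supported.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float32

adapter = nn.Linear(1152, 4096)        # placeholder for the real ImageAdapter module
adapter = adapter.to(unified_dtype())  # cast weights to match the text encoder
adapter.eval()                         # inference mode: no dropout, frozen stats
```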

@@ -148,6 +151,7 @@ def __init__(self, load_device, offload_device, model_id):
         self.offload_device = offload_device
         self.type = text_encoder_dtype()
         self.model_id = model_id
+        self.current_model_id = None
 
         print("Loading tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(os.path.join(BASE_MODEL_PATH, "text_model"), use_fast=True)
@@ -158,8 +162,8 @@ def __init__(self, load_device, offload_device, model_id):
         self.text_model = None
 
     def load_llm_model(self):
-        if self.text_model is None:
-            print("Loading LLM")
+        if self.text_model is None or self.current_model_id != self.model_id:
+            print(f"Loading LLM: {self.model_id}")
             LLM_PATH = download_hg_model(self.model_id, "LLM")
             text_model_path = os.path.join(BASE_MODEL_PATH, "text_model")
             modify_json_value(os.path.join(text_model_path, "adapter_config.json"), "base_model_name_or_path",
@@ -191,11 +195,13 @@ def load_llm_model(self):
                     break
                 time.sleep(1 + retries / 2)
         # print(f"Now: {get_free_memory()/1024/1024}")
+        self.current_model_id = self.model_id
         return self.text_model
 
     def clear_gpu(self, low_vram):
         del self.text_model
         self.text_model = None
+        self.current_model_id = None
         torch.cuda.empty_cache()
         import gc
         gc.collect()
@@ -225,7 +231,10 @@ def loadModels(self):
         self.image_adapter = JoyImageAdapter(self.load_device, self.offload_device)
 
     def loadLLM(self, model_id):
-        self.llm = JoyLLM(self.load_device, self.offload_device, model_id)
+        if self.llm is None:
+            self.llm = JoyLLM(self.load_device, self.offload_device, model_id)
+        else:
+            self.llm.model_id = model_id
 
 
 class Joy_caption_two_load:
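Taken together with the `current_model_id` bookkeeping above, `loadLLM` now implements a small model-id cache: the wrapper is created once, later calls only record the requested id, and `load_llm_model` decides whether a reload is needed. A self-contained sketch of the pattern (names simplified; the actual model loading is elided):

```python
class CachedLLM:
    """Reload the model only when the requested id differs from the loaded one."""

    def __init__(self, model_id):
        self.model_id = model_id      # what the caller wants
        self.current_model_id = None  # what is actually in memory
        self.text_model = None

    def load(self):
        if self.text_model is None or self.current_model_id != self.model_id:
            print(f"Loading LLM: {self.model_id}")
            self.text_model = f"<model {self.model_id}>"  # placeholder for the real load
            self.current_model_id = self.model_id
        return self.text_model

    def clear(self):
        self.text_model = None
        self.current_model_id = None  # force a reload on next use


class Pipeline:
    """Simplified stand-in for JoyTwoPipeline's loadLLM."""

    def __init__(self):
        self.llm = None

    def loadLLM(self, model_id):
        if self.llm is None:
            self.llm = CachedLLM(model_id)  # first call: create the wrapper
        else:
            self.llm.model_id = model_id    # later calls: just record the request


pipe = Pipeline()
pipe.loadLLM("model-a"); pipe.llm.load()  # loads model-a
pipe.loadLLM("model-a"); pipe.llm.load()  # cache hit: nothing reloads
pipe.loadLLM("model-b"); pipe.llm.load()  # id changed: reloads as model-b
```

Before this commit, the call sites guarded `loadLLM` with `if joy_two_pipeline.llm is None`, so a newly selected model id never reached the wrapper; calling `loadLLM` unconditionally and caching by id is what makes switching work, as the hunks below show.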
@@ -348,8 +357,7 @@ def generate(self, joy_two_pipeline: JoyTwoPipeline, image, caption_type, captio
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -512,8 +520,7 @@ def generate(self, joy_two_pipeline: JoyTwoPipeline, image, extra_options, capti
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -628,8 +635,7 @@ def generate_caption(self, joy_two_pipeline: JoyTwoPipeline, image, prompt, low_
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -779,6 +785,9 @@ def INPUT_TYPES(s):
                 "joy_two_pipeline": ("JoyTwoPipeline",),
                 "input_dir": ("STRING", {"default": ""}),
                 "output_dir": ("STRING", {"default": ""}),
+                "rename": ("BOOLEAN", {"default": False}),
+                "prefix_name": ("STRING", {"default": ""}),
+                "start_index": ("INT", {"default": 1, "min": 0, "max": 9999999, "step": 1}),
                 "extra_options": ("Extra_Options", ),
                 "caption_type": (caption_types, {}),
                 "caption_length": (caption_lengths, {"default": "long"}),
@@ -822,8 +831,7 @@ def generate_caption(self, joy_two_pipeline: JoyTwoPipeline, image, prompt, top_
             },
         ]
 
-        if joy_two_pipeline.llm is None:
-            joy_two_pipeline.loadLLM(joy_two_pipeline.model)
+        joy_two_pipeline.loadLLM(joy_two_pipeline.model)
 
         tokenizer = joy_two_pipeline.llm.tokenizer
         # Format the conversation
@@ -883,7 +891,7 @@ def generate_caption(self, joy_two_pipeline: JoyTwoPipeline, image, prompt, top_
 
         return caption.strip()
 
-    def generate(self, joy_two_pipeline: JoyTwoPipeline, input_dir, output_dir, extra_options, caption_type, caption_length, name, custom_prompt, low_vram, top_p, temperature):
+    def generate(self, joy_two_pipeline: JoyTwoPipeline, input_dir, output_dir, rename, prefix_name, start_index, extra_options, caption_type, caption_length, name, custom_prompt, low_vram, top_p, temperature):
         torch.cuda.empty_cache()
 
         if joy_two_pipeline.clip_model == None:
@@ -946,10 +954,17 @@ def generate(self, joy_two_pipeline: JoyTwoPipeline, input_dir, output_dir, extr
             print(f"Opening {image_path}")
             with Image.open(image_path) as img:
                 if img.mode == 'RGBA':
-                    img = img.convert('RGB')
+                    image = img.convert('RGB').resize((384, 384), Image.LANCZOS)
+                else:
+                    image = img.resize((384, 384), Image.LANCZOS)
                 pbar.update_absolute(step, image_count)
-                image = img.resize((384, 384), Image.LANCZOS)
                 caption = self.generate_caption(joy_two_pipeline, image, prompt_str, top_p, temperature)
+                if rename:
+                    new_filename = f"{prefix_name}_{start_index + finished_image_count}{os.path.splitext(filename)[1]}"
+                    new_image_path = os.path.join(output_dir, new_filename)
+                    shutil.copyfile(image_path, new_image_path)
+                    text_path = os.path.join(output_dir, os.path.splitext(new_filename)[0] + '.txt')
 
                 with open(text_path, 'w', encoding='utf-8') as f:
                     f.write(caption)
                 finished_image_count += 1
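Pulled out of the loop, the rename branch amounts to the helper below. The function name and signature are mine, and the non-rename `text_path` fallback is an assumption, since in the hunk that path is assigned earlier in code that isn't shown:

```python
import os
import shutil

def save_caption(image_path, caption, output_dir, rename, prefix_name, index):
    """Optionally copy the image under a sequential name, then write the
    caption to a .txt file with the matching stem."""
    filename = os.path.basename(image_path)
    if rename:
        ext = os.path.splitext(filename)[1]
        new_filename = f"{prefix_name}_{index}{ext}"
        shutil.copyfile(image_path, os.path.join(output_dir, new_filename))
        stem = os.path.splitext(new_filename)[0]
    else:
        stem = os.path.splitext(filename)[0]  # assumed fallback, not shown in the hunk
    with open(os.path.join(output_dir, stem + ".txt"), "w", encoding="utf-8") as f:
        f.write(caption)
```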
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui_slk_joy_caption_two"
 description = "NODES:Joy Caption Two, Joy Caption Two Advanced, Joy Caption Two Load, Joy Caption Extra Options"
-version = "0.0.6"
+version = "0.0.7"
 license = {file = "LICENSE"}
 dependencies = ["huggingface_hub==0.23.4", "transformers>=4.44.0", "numpy", "sentencepiece", "pillow>=10.1.0", "bitsandbytes>=0.44.1", "peft==0.12.0"]

3 changes: 3 additions & 0 deletions readme.md
@@ -1,7 +1,10 @@
 # JoyCaptionAlpha Two for ComfyUI
 [English](./readme_us.md) | 中文
 
+The original Joy Caption author is here: https://github.com/fpgaminer/joycaption. Many thanks for open-sourcing it!
+
 ## Recent changes
+* [2024-10-16] v0.0.7: Unified model loading precision, fixed the bug where the model could not be switched a second time, and added a rename switch to advanced batch captioning.
 * [2024-10-16] v0.0.6: Added top_p and temperature to advanced mode for more flexibility, plus more LLM choices. I tried [John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4](https://huggingface.co/John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4)
   with good results; feel free to try it too. The original model [Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2](https://huggingface.co/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2) is also available as an option.
 * [2024-10-15] v0.0.5: Fixed a bug in batch processing when images have an RGBA alpha channel.
4 changes: 4 additions & 0 deletions readme_us.md
@@ -1,6 +1,10 @@
 # JoyCaptionAlpha Two for ComfyUI
 English | [中文](./readme.md)
 
+Huge thanks to the original Joy Caption author, fpgaminer, whose work is available at: https://github.com/fpgaminer/joycaption. We deeply appreciate their contribution to open source!
+
 ## Recent changes
+* [2024-10-16] v0.0.7: Unified model loading precision, fixed the bug where the model could not be switched the second time, and added a rename switch to advanced batch captioning.
 * [2024-10-16] v0.0.6: Added `top_p` and `temperature` parameters to the advanced mode for greater control. Expanded the selection of large language models. I tested [John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4](https://huggingface.co/John6666/Llama-3.1-8B-Lexi-Uncensored-V2-nf4) and found the results quite good; you can also try it out. Additionally, the original [Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2](https://huggingface.co/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2) model has been added as an option.
 * [2024-10-15] v0.0.5: Fix the bug when processing images with an alpha channel (RGBA) in batch.
 * [2024-10-15] v0.0.4: Added batch processing nodes: When the output directory is empty, it will be saved in the image folder. You can find the example workflow in the examples directory.
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,7 +1,7 @@
 huggingface_hub==0.23.4
 transformers>=4.44.0
-numpy
-sentencepiece
-pillow>=10.1.0
+numpy==1.26.4
+sentencepiece==0.2.0
+pillow>=10.4.0
 bitsandbytes>=0.44.1
 peft>=0.12.0
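With the bare `numpy` and `sentencepiece` entries now pinned, a quick standard-library check can confirm an existing environment matches the new requirements:

```python
from importlib.metadata import version

# Print installed versions of each pinned dependency for comparison.
for pkg in ("huggingface_hub", "transformers", "numpy",
            "sentencepiece", "pillow", "bitsandbytes", "peft"):
    print(f"{pkg}=={version(pkg)}")
```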
