-
Notifications
You must be signed in to change notification settings - Fork 1
/
mrhuggin_codet5.py
69 lines (49 loc) · 2.65 KB
/
mrhuggin_codet5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
"""mrhuggin-CodeT5.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1GRAZKjqcY97b5Q88wl0FQEl1biCctG0L
"""
# Commented out IPython magic to ensure Python compatibility.
# %pip install torch torchvision torchaudio
# Commented out IPython magic to ensure Python compatibility.
# %pip install git+https://github.com/huggingface/transformers
# Commented out IPython magic to ensure Python compatibility.
# %pip install git+https://github.com/huggingface/peft.git
# Commented out IPython magic to ensure Python compatibility.
# %pip install datasets accelerate huggingface_hub
# Commented out IPython magic to ensure Python compatibility.
# %pip install transformers
"""### Getting single inference
This block of code performs inference on a single input string, using English-to-French translation as the example.
The code in this section is adapted from https://huggingface.co/docs/transformers/main/en/model_doc/t5#inference
"""
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pretrained t5-small checkpoint: SentencePiece tokenizer plus the
# encoder-decoder model. Both are reused by the batch-inference section below.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Encode a single translation prompt, run greedy generation, and print the
# decoded French sentence (special tokens stripped).
prompt = "translate English to French: I would like to go to the store."
encoded = tokenizer(prompt, return_tensors="pt").input_ids
generated = model.generate(encoded)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
"""### Small Batch Inference
Now the model will do inference on a small batch of inputs.
"""
# Prefix for each small sentence.
task_prefix = "translate English to French: "
# List of 20 sentences to translate. Generated by ChatGPT
sentences = ["The sun shines brightly.", "Birds sing in the morning.",
"Leaves rustle in the wind.", "Flowers bloom in spring.",
"The sky is crystal blue.", "Stars twinkle at night.",
"Rain patters on the roof.", "Cats purr softly.",
"Dogs bark excitedly.", "Books open new worlds.",
"Music fills the air.", "Children laugh and play.",
"Art inspires the soul.", "Friends share secrets.",
"Trains whistle in the distance.", "Bicycles glide on the path.",
"Snowflakes fall gently.", "Cookies bake in the oven.",
"Leaves change in autumn.", "Waves crash on the shore."]
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
output_sequences = model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
do_sample=False, # disable sampling to test if batching affects output
)
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))