refactor tests directory to root directory
add tests for the model trainer on GPU and a workflow to run them
KevKibe committed Nov 15, 2024
1 parent 1354b9b commit 5e782cc
Showing 16 changed files with 163 additions and 21 deletions.
@@ -1,4 +1,4 @@
name: Ruff formatting
name: Run Linting with Ruff on Multiple OS Environments

on: [pull_request]

2 changes: 1 addition & 1 deletion .github/workflows/training_tests.yaml
@@ -44,4 +44,4 @@ jobs:
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest -vv src/tests/test_audio_processor.py src/tests/test_data_prep.py src/tests/test_load_dataset.py
run: pytest -vv tests/test_audio_processor.py tests/test_data_prep.py tests/test_load_dataset.py
@@ -44,4 +44,4 @@ jobs:
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest -vv src/tests/test_model_optimization.py src/tests/test_transcription_pipeline.py
run: pytest -vv tests/test_model_optimization.py tests/test_transcription_pipeline.py
@@ -44,4 +44,4 @@ jobs:
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest -vv src/tests/test_model_prep.py
run: pytest -vv tests/test_model_prep.py
@@ -1,4 +1,4 @@
name: Test training.model_trainer Module.
name: Test training.model_trainer Module on CPU

on: [pull_request]

@@ -44,4 +44,4 @@ jobs:
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
run: pytest -vv src/tests/test_model_trainer.py
run: pytest -vv tests/test_model_trainer.py
27 changes: 27 additions & 0 deletions .github/workflows/unit_test_model_trainer_gpu.yaml
@@ -0,0 +1,27 @@
name: Test training.model_trainer Module on GPU

on: [pull_request]

jobs:
run_kaggle_script_action:
runs-on: ubuntu-latest

steps:
- name: Checkout Repository
uses: actions/checkout@v3

- name: Execute Tests with GPU Support
uses: KevKibe/kaggle-script-action@v1.0.1
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
WANDB_TOKEN: ${{ secrets.WANDB_TOKEN }}
with:
username: ${{ secrets.KAGGLE_USERNAME }}
key: ${{ secrets.KAGGLE_KEY }}
title: "Test PEFT Finetuning"
custom_script: |
pytest -vv tests/test_model_trainer.py
enable_internet: true
enable_gpu: true
enable_tpu: false
sleep_time: 60
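Note: the Kaggle action above provides the GPU session itself. For completeness, a test-side guard can keep GPU-only cases skippable on CPU runners; a minimal sketch in Python (the requires_gpu marker name is illustrative, not part of this commit):

import pytest
import torch

# Skip GPU-only tests when no CUDA device is present, so the same suite
# stays runnable under the CPU workflow as well.
requires_gpu = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="CUDA device not available",
)

Applying @requires_gpu to the training test cases would let one file back both the CPU and GPU workflows.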
File renamed without changes.
File renamed without changes.
@@ -1,7 +1,7 @@
import unittest
from training.audio_data_processor import AudioDataProcessor
from training.whisper_model_prep import WhisperModelPrep
from training.load_data import Dataset
from src.training.audio_data_processor import AudioDataProcessor
from src.training.whisper_model_prep import WhisperModelPrep
from src.training.load_data import Dataset
import os
from dotenv import load_dotenv
load_dotenv()
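Note: with the test modules moved to the repository root and the imports switching to the src.-prefixed form, pytest needs the repository root on sys.path. A minimal root-level conftest.py that would do this, assuming this layout (it is not part of this commit):

import os
import sys

# Put the repository root on sys.path so `from src.training... import ...`
# resolves regardless of the directory pytest is invoked from.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))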
6 changes: 3 additions & 3 deletions src/tests/test_data_prep.py → tests/test_data_prep.py
@@ -1,8 +1,8 @@
import unittest
# from datasets import Dataset
from training.data_prep import DataPrep
from training.load_data import Dataset
from training.whisper_model_prep import WhisperModelPrep
from src.training.data_prep import DataPrep
from src.training.load_data import Dataset
from src.training.whisper_model_prep import WhisperModelPrep
from datasets import IterableDataset
import os
from dotenv import load_dotenv
@@ -1,5 +1,5 @@
import unittest
from training.load_data import Dataset
from src.training.load_data import Dataset
import os
from dotenv import load_dotenv
load_dotenv()
@@ -1,5 +1,5 @@
import unittest
from deployment.speech_inference import ModelOptimization
from src.deployment.speech_inference import ModelOptimization
import torch
import os
from deployment.faster_whisper.asr import FasterWhisperPipeline
2 changes: 1 addition & 1 deletion src/tests/test_model_prep.py → tests/test_model_prep.py
@@ -1,5 +1,5 @@
import unittest
from training.whisper_model_prep import WhisperModelPrep
from src.training.whisper_model_prep import WhisperModelPrep
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration

class TestDatasetManager(unittest.TestCase):
@@ -1,6 +1,6 @@
import unittest
from training.model_trainer import Trainer
from training.data_prep import DataPrep
from src.training.model_trainer import Trainer
from src.training.data_prep import DataPrep
import os
from dotenv import load_dotenv
load_dotenv()
@@ -47,7 +47,7 @@ def setUp(self) -> None:
tokenizer=tokenizer,
wandb_api_key=os.environ.get("WANDB_TOKEN"),
use_peft=False,
processing_task="translate"
processing_task="transcribe"
)
self.trainer_batch = Trainer(
language=["af"],
@@ -60,7 +60,7 @@ def setUp(self) -> None:
tokenizer=tokenizer,
wandb_api_key="e0fda284061622e0f7858d6c684281d48fa05ecf",
use_peft=False,
processing_task="translate"
processing_task="transcribe"
)

return super().setUp()
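Note on the change above: flipping processing_task from "translate" to "transcribe" changes the decoder prompt Whisper is conditioned on: "transcribe" keeps output in the source language, while "translate" emits English. A minimal sketch with transformers:

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
# task="transcribe" keeps output in the source language (here Afrikaans, "af");
# task="translate" would emit English instead.
prompt_ids = processor.get_decoder_prompt_ids(language="af", task="transcribe")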
115 changes: 115 additions & 0 deletions tests/test_model_trainer_gpu.py
@@ -0,0 +1,115 @@
import unittest
from src.training.model_trainer import Trainer
from src.training.data_prep import DataPrep
import os
from dotenv import load_dotenv
load_dotenv()

class TestTrainerManager(unittest.TestCase):
"""Test cases for the Trainer class."""

def setUp(self) -> None:
# Common setup for both test cases
self.model_id = "openai/whisper-tiny"
process = DataPrep(
huggingface_token=os.environ.get("HF_TOKEN"),
dataset_name="mozilla-foundation/common_voice_16_1",
language_abbr=["af"],
model_id=self.model_id,
processing_task="transcribe",
use_peft=True,
)
tokenizer, feature_extractor, feature_processor, model = process.prepare_model()

# Load datasets
self.dataset_streaming = process.load_dataset(
feature_extractor, tokenizer, feature_processor, streaming=True,
train_num_samples=10, test_num_samples=10
)
self.dataset_batch = process.load_dataset(
feature_extractor, tokenizer, feature_processor, streaming=False,
train_num_samples=10, test_num_samples=10
)

# Check if train/test samples exist in both streaming and batch datasets
self._validate_dataset(self.dataset_streaming, "streaming")
self._validate_dataset(self.dataset_batch, "batch")

# Set up trainers for both streaming and batch datasets
self.trainer_streaming = Trainer(
language=["af"],
huggingface_token=os.environ.get("HF_TOKEN"),
model_id=self.model_id,
dataset=self.dataset_streaming,
model=model,
feature_processor=feature_processor,
feature_extractor=feature_extractor,
tokenizer=tokenizer,
wandb_api_key=os.environ.get("WANDB_TOKEN"),
use_peft=False,
processing_task="transcribe"
)
self.trainer_batch = Trainer(
language=["af"],
huggingface_token="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",
model_id=self.model_id,
dataset=self.dataset_batch,
model=model,
feature_processor=feature_processor,
feature_extractor=feature_extractor,
tokenizer=tokenizer,
wandb_api_key="e0fda284061622e0f7858d6c684281d48fa05ecf",
use_peft=False,
processing_task="transcribe"
)

return super().setUp()

def _validate_dataset(self, dataset, dataset_type):
"""Helper function to validate that datasets are not empty."""
has_train_sample = any(True for _ in dataset["train"])
assert has_train_sample, f"Train dataset for {dataset_type} is empty!"

has_test_sample = any(True for _ in dataset["test"])
assert has_test_sample, f"Test dataset for {dataset_type} is empty!"

def test_01_train_streaming(self):
"""Test case for training with the streaming dataset."""
self.trainer_streaming.train(
max_steps=15,
learning_rate=1e-5,
save_steps=10,
eval_steps=10,
logging_steps=10,
output_dir=f"../{self.model_id}-finetuned",
report_to=None,
push_to_hub=False,
use_cpu=False,
optim="adamw_hf",
per_device_train_batch_size=4
)
# Check if output files exist after training
assert os.path.exists(f"../{self.model_id}-finetuned/preprocessor_config.json")
assert os.path.exists(f"../{self.model_id}-finetuned/tokenizer_config.json")

def test_02_train_batch(self):
"""Test case for training with the batch dataset."""
self.trainer_batch.train(
max_steps=10,
learning_rate=1e-5,
save_steps=10,
eval_steps=10,
logging_steps=10,
output_dir=f"../{self.model_id}-finetuned",
report_to=None,
push_to_hub=False,
use_cpu=True,
optim="adamw_hf"
)
# Check if output files exist after training
assert os.path.exists(f"../{self.model_id}-finetuned/preprocessor_config.json")
assert os.path.exists(f"../{self.model_id}-finetuned/tokenizer_config.json")


if __name__ == '__main__':
unittest.main()
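Note: outside the Kaggle action, the new GPU suite can also be launched programmatically on any CUDA machine; a minimal sketch, assuming HF_TOKEN and WANDB_TOKEN are already exported in the environment:

import sys

import pytest

# Equivalent to running `pytest -vv tests/test_model_trainer_gpu.py`
# from the repository root.
sys.exit(pytest.main(["-vv", "tests/test_model_trainer_gpu.py"]))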
@@ -1,5 +1,5 @@
import unittest
from deployment.speech_inference import SpeechTranscriptionPipeline, ModelOptimization
from src.deployment.speech_inference import SpeechTranscriptionPipeline, ModelOptimization
import torch
import os
from dotenv import load_dotenv
@@ -24,7 +24,7 @@ def setUp(self):

self.model_initialization = ModelOptimization(model_name=self.model_name)

audio_file_path = "src/tests/samples_jfk.wav"
audio_file_path = "./samples_jfk.wav"
task = "transcribe"

self.speech_transcription_pipeline = SpeechTranscriptionPipeline(