From 1640d395ca7867ecb47adf7745372a294355be33 Mon Sep 17 00:00:00 2001 From: tanishq-ids Date: Tue, 1 Oct 2024 16:01:10 +0200 Subject: [PATCH 1/3] added function to handle multiple values in answer_start Signed-off-by: tanishq-ids --- .../kpi_detection/train_kpi_detection.py | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py index c2604e5..ab46f1f 100644 --- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py +++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py @@ -105,12 +105,31 @@ def train_kpi_detection( output_dir (str): Directory where the model will be saved during training. save_steps (int): Number of steps before saving the model during training. """ + # Load the data df = pd.read_csv(data_path) df["annotation_answer"] = df["annotation_answer"].astype(str) - df = df[["question", "context", "annotation_answer"]] + df = df[["question", "context", "annotation_answer", "answer_start"]] + + def expand_rows(df, column): + # Create a new DataFrame where each list element becomes a separate row + rows = [] + for _, row in df.iterrows(): + if isinstance(row[column], list): + for value in row[column]: + new_row = row.copy() + new_row[column] = value + rows.append(new_row) + else: + rows.append(row) + + # Convert the list of rows back to a DataFrame + return pd.DataFrame(rows) + + # Apply the function to the DataFrame + new_df = expand_rows(df, 'answer_start') # Split the DataFrame into train and test sets - train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) + train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42) train_df = train_df.reset_index(drop=True) test_df = test_df.reset_index(drop=True) @@ -121,6 +140,7 @@ def train_kpi_detection( # Create a DatasetDict data = DatasetDict({"train": train_dataset, "test": test_dataset}) + # Load tokenizer and model tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForQuestionAnswering.from_pretrained(model_name) @@ -128,6 +148,7 @@ def preprocess_function(examples, max_length): questions = examples["question"] contexts = examples["context"] answers = examples["annotation_answer"] + answer_starts = examples["answer_start"] # Tokenize questions and contexts tokenized_inputs = tokenizer( @@ -144,9 +165,9 @@ def preprocess_function(examples, max_length): # Loop through each example for i in range(len(questions)): - # Get the answer text + # Get the answer start index + answer_start = answer_starts[i] answer = answers[i] - answer_start = contexts[i].find(answer) if answer_start == -1: start_positions.append(0) @@ -179,13 +200,13 @@ def preprocess_function(examples, max_length): # Apply the preprocessing function to the dataset processed_datasets = data.map(preprocess_function_with_max_length, batched=True) - # Remove columns that are not needed - """processed_datasets = processed_datasets.remove_columns( - ["question", "context", "answer"] - )""" - data_collator = DefaultDataCollator() + # Remove columns that are not needed + processed_datasets = processed_datasets.remove_columns( + ["question", "context", "annotation_answer", "answer_start"] + ) + data_collator = DefaultDataCollator() saved_model_path = os.path.join(output_dir, "saved_model") os.makedirs(saved_model_path, exist_ok=True) From 25d03af3584e31bf21e5c9ababf27313c5b23d44 Mon Sep 17 00:00:00 2001 From: tanishq-ids Date: Tue, 1 Oct 2024 16:01:29 +0200 Subject: [PATCH 2/3] added function to handle multiple values in answer_start Signed-off-by: tanishq-ids --- .../kpi_detection/train_kpi_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py index ab46f1f..4e8312b 100644 --- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py +++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py @@ -121,7 +121,7 @@ def expand_rows(df, column): rows.append(new_row) else: rows.append(row) - + # Convert the list of rows back to a DataFrame return pd.DataFrame(rows) From 52c9db2171f340771bebdc02be735c045d9c6bc8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 08:02:25 +0000 Subject: [PATCH 3/3] Chore: pre-commit autoupdate --- .../kpi_detection/train_kpi_detection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py index 4e8312b..8686946 100644 --- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py +++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py @@ -121,12 +121,12 @@ def expand_rows(df, column): rows.append(new_row) else: rows.append(row) - + # Convert the list of rows back to a DataFrame return pd.DataFrame(rows) # Apply the function to the DataFrame - new_df = expand_rows(df, 'answer_start') + new_df = expand_rows(df, "answer_start") # Split the DataFrame into train and test sets train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42) @@ -206,7 +206,7 @@ def preprocess_function(examples, max_length): ["question", "context", "annotation_answer", "answer_start"] ) - data_collator = DefaultDataCollator() + data_collator = DefaultDataCollator() saved_model_path = os.path.join(output_dir, "saved_model") os.makedirs(saved_model_path, exist_ok=True)