From 1640d395ca7867ecb47adf7745372a294355be33 Mon Sep 17 00:00:00 2001
From: tanishq-ids <tanishq.more@incestmentdataservices.com>
Date: Tue, 1 Oct 2024 16:01:10 +0200
Subject: [PATCH 1/3] added function to handle multiple values in answer_start

Signed-off-by: tanishq-ids <tanishq.more@incestmentdataservices.com>
---
 .../kpi_detection/train_kpi_detection.py      | 39 ++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
index c2604e5..ab46f1f 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
@@ -105,12 +105,31 @@ def train_kpi_detection(
         output_dir (str): Directory where the model will be saved during training.
         save_steps (int): Number of steps before saving the model during training.
     """
+    # Load the data
     df = pd.read_csv(data_path)
     df["annotation_answer"] = df["annotation_answer"].astype(str)
-    df = df[["question", "context", "annotation_answer"]]
+    df = df[["question", "context", "annotation_answer", "answer_start"]]
+
+    def expand_rows(df, column):
+        # Give each element of a list-valued cell its own row (NOTE(review): pd.read_csv cells are str/number, not list -- verify this branch can trigger)
+        rows = []
+        for _, row in df.iterrows():
+            if isinstance(row[column], list):
+                for value in row[column]:
+                    new_row = row.copy()
+                    new_row[column] = value
+                    rows.append(new_row)
+            else:
+                rows.append(row)
+        
+        # Convert the list of rows back to a DataFrame
+        return pd.DataFrame(rows)
+
+    # Apply the function to the DataFrame
+    new_df = expand_rows(df, 'answer_start')
 
     # Split the DataFrame into train and test sets
-    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+    train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)
     train_df = train_df.reset_index(drop=True)
     test_df = test_df.reset_index(drop=True)
 
@@ -121,6 +140,7 @@ def train_kpi_detection(
     # Create a DatasetDict
     data = DatasetDict({"train": train_dataset, "test": test_dataset})
 
+    # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
 
@@ -128,6 +148,7 @@ def preprocess_function(examples, max_length):
         questions = examples["question"]
         contexts = examples["context"]
         answers = examples["annotation_answer"]
+        answer_starts = examples["answer_start"]
 
         # Tokenize questions and contexts
         tokenized_inputs = tokenizer(
@@ -144,9 +165,9 @@ def preprocess_function(examples, max_length):
 
         # Loop through each example
         for i in range(len(questions)):
-            # Get the answer text
+            # Get the answer start index
+            answer_start = answer_starts[i]
             answer = answers[i]
-            answer_start = contexts[i].find(answer)
 
             if answer_start == -1:
                 start_positions.append(0)
@@ -179,13 +200,13 @@ def preprocess_function(examples, max_length):
 
     # Apply the preprocessing function to the dataset
     processed_datasets = data.map(preprocess_function_with_max_length, batched=True)
-    # Remove columns that are not needed
-    """processed_datasets = processed_datasets.remove_columns(
-        ["question", "context", "answer"]
-    )"""
 
-    data_collator = DefaultDataCollator()
+    # Remove columns that are not needed
+    processed_datasets = processed_datasets.remove_columns(
+        ["question", "context", "annotation_answer", "answer_start"]
+    )
 
+    data_collator = DefaultDataCollator()    
     saved_model_path = os.path.join(output_dir, "saved_model")
     os.makedirs(saved_model_path, exist_ok=True)
 

From 25d03af3584e31bf21e5c9ababf27313c5b23d44 Mon Sep 17 00:00:00 2001
From: tanishq-ids <tanishq.more@incestmentdataservices.com>
Date: Tue, 1 Oct 2024 16:01:29 +0200
Subject: [PATCH 2/3] whitespace-only change to blank line in expand_rows

Signed-off-by: tanishq-ids <tanishq.more@incestmentdataservices.com>
---
 .../kpi_detection/train_kpi_detection.py                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
index ab46f1f..4e8312b 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
@@ -121,7 +121,7 @@ def expand_rows(df, column):
                     rows.append(new_row)
             else:
                 rows.append(row)
-        
+                
         # Convert the list of rows back to a DataFrame
         return pd.DataFrame(rows)
 

From 52c9db2171f340771bebdc02be735c045d9c6bc8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 08:02:25 +0000
Subject: [PATCH 3/3] Chore: pre-commit autoupdate

---
 .../kpi_detection/train_kpi_detection.py                    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
index 4e8312b..8686946 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
@@ -121,12 +121,12 @@ def expand_rows(df, column):
                     rows.append(new_row)
             else:
                 rows.append(row)
-                
+
         # Convert the list of rows back to a DataFrame
         return pd.DataFrame(rows)
 
     # Apply the function to the DataFrame
-    new_df = expand_rows(df, 'answer_start')
+    new_df = expand_rows(df, "answer_start")
 
     # Split the DataFrame into train and test sets
     train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)
@@ -206,7 +206,7 @@ def preprocess_function(examples, max_length):
         ["question", "context", "annotation_answer", "answer_start"]
     )
 
-    data_collator = DefaultDataCollator()    
+    data_collator = DefaultDataCollator()
     saved_model_path = os.path.join(output_dir, "saved_model")
     os.makedirs(saved_model_path, exist_ok=True)