diff --git a/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py b/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py deleted file mode 100644 index 4ddb066..0000000 --- a/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py +++ /dev/null @@ -1,140 +0,0 @@ -import openai -import os -import argparse -import warnings -import json -import ast -from multiprocessing.pool import Pool - -# Disable warnings. -warnings.filterwarnings('ignore') - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--gt_caption_folder", required=True, help="The path to captions") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(gt_file, caption_files, output_dir): - """ - Generate questions and answers for each caption file using GPT-3. - """ - for file in caption_files: - key = file[:-5] # Strip file extension. - caption = gt_file[key] - try: - # Generate GPT-3 response. - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "Your primary task is to formulate two distinct but conceptually similar questions, such that when asked about the same video-information, they correspond to the same answer. " - "------" - "##TASK:" - "When given details about a video, your task is to generate two questions asked in different ways. The crucial aspect is to frame these questions so that they are conceptually alike but phrased differently, leading to the exact same answer. " - "The questions should be cleverly designed to extract the same information directly from the video details given, so that the provided information or parts of it can serve as the answer. It's important that both questions yield the SAME answer. " - "- Generate TWO questions and ONE answer. The purpose is to extract identical information from both questions. Therefore, formulate your questions in a way that the given details can serve directly as the answer. " - "------" - "##SAMPLE QUESTIONS:" - "- {'Q1': 'What is the colour of the cycle the boy rides?', 'Q2': 'Can you describe the cycle the boy is riding?', 'A': 'The boy is riding a red bicycle with a basket.'}" - "- {'Q1': 'What is the baby girl doing in the video?', 'Q2': 'Can you see the baby girl engaged in an activity in the video?', 'A': 'The baby girl is reading a book in the video.'}" - }, - { - "role": "user", - "content": - f"The user input is: {caption}. " - f"Please generate the response in the form of a Python dictionary string with keys 'Q1', 'Q2', and 'A', where value of 'Q1' is first question, 'Q2' for second question and 'A' is the answer to both questions. Each corresponding value should be the question or answer text respectively. " - "For example, your response should look like this: {'Q1': 'Your first question here...', 'Q2': 'Your second question here...', 'A': 'Your answer to both questions here...'}. 
" - "Remember, it's critical to ensure that both questions are designed to extract the same details from the video, leading to the same answer." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(response_dict, f) - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - # Read ground truth captions. - gt_captions = {} - gt_files = os.listdir(args.gt_caption_folder) - for file in gt_files: - # Read human-assisted annotations from individual text files. - with open(os.path.join(args.gt_caption_folder, file), mode='r', encoding='utf-8-sig') as f: - caption = f.read().replace('\n', '').replace('‘', "'").replace('’', "'") - video_id = file[:-4] - gt_captions[video_id] = caption - - caption_files = [f"{video_id}.json" for video_id in gt_captions.keys()] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have already been completed. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(gt_captions, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. 
- with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine qa pairs into single file when individual qa generation completes - all_data = {} - for filename in os.listdir(output_dir): - if filename.endswith(".json"): - with open(os.path.join(output_dir, filename)) as f: - key = filename[:-5] - all_data[key] = json.load(f) - - with open(args.output_json, 'w') as f: - json.dump(all_data, f, indent=4) - - -if __name__ == "__main__": - main() diff --git a/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py b/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py deleted file mode 100644 index 12c01a8..0000000 --- a/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py +++ /dev/null @@ -1,134 +0,0 @@ -import openai -import os -import argparse -import warnings -import json -import ast -from multiprocessing.pool import Pool - -warnings.filterwarnings('ignore') - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--gt_caption_folder", required=True, help="The path to captions") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(gt_file, caption_files, output_dir): - """ - Generate generic descriptive type questions and answers for each caption file using GPT-3. - """ - for file in caption_files: - key = file[:-5] # Strip file extension. - caption = gt_file[key] - try: - # Generate GPT-3 response. - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You will play two roles: a human asking questions related to describing a video and an intelligent chatbot designed for video description and dense captioning. " - "Your task is to generate a detailed and descriptive paragraph based on the provided fragmented information about a video. " - "------" - "##TASK:" - "Users will provide a descriptions of a video, and you will generate ONE conversation-like question and answer related to describing the video in detail. " - "The question should ask to describe the video content in detail. " - "The answer should be a paraphrased and well-structured paragraph based on the provided description, as detailed as possible. " - }, - { - "role": "user", - "content": - f"The user input is: {caption}. " - f"Please generate the response in the form of a Python dictionary string with keys 'Q' for question and 'A' for answer. Each corresponding value should be the question and answer text respectively. " - "For example, your response should look like this: {'Q': 'Your question here...', 'A': 'Your answer here...'}. " - f"Emphasize that the answer should focus on describing the video content as detailed as possible." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - - # Save the question-answer pairs to a json file. 
- with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(response_dict, f) - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - # Read ground truth captions. - gt_captions = {} - gt_files = os.listdir(args.gt_caption_folder) - for file in gt_files: - with open(os.path.join(args.gt_caption_folder, file), mode='r', encoding='utf-8-sig') as f: - caption = f.read().replace('\n', '').replace('‘', "'").replace('’', "'") - video_id = file[:-4] - gt_captions[video_id] = caption - - caption_files = [f"{video_id}.json" for video_id in gt_captions.keys()] - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(gt_captions, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. - with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine qa pairs into single file when individual qa generation completes - all_data = {} - for filename in os.listdir(output_dir): - if filename.endswith(".json"): - with open(os.path.join(output_dir, filename)) as f: - key = filename[:-5] - all_data[key] = json.load(f) - - with open(args.output_json, 'w') as f: - json.dump(all_data, f, indent=4) - - -if __name__ == "__main__": - main() diff --git a/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py b/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py deleted file mode 100644 index 6e0f2ca..0000000 --- a/test_benchmark/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py +++ /dev/null @@ -1,139 +0,0 @@ -import openai -import os -import argparse -import warnings -import json -import ast -from multiprocessing.pool import Pool - -warnings.filterwarnings('ignore') - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--gt_caption_folder", required=True, help="The path to captions") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(gt_file, caption_files, output_dir): - """ - Generate questions and answers for each caption file using GPT-3. 
- """ - for file in caption_files: - key = file[:-5] # Strip file extension. - caption = gt_file[key] - try: - # Generate GPT-3 response. - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You play two roles: a human asking questions related to a video and an intelligent chatbot designed to help people find information from a given video. " - "Your task is to generate a question-answer pair specifically related to temporal understanding from the video content. " - "Your task is to first play the role of a human who asks a question about the temporal sequence or timing of events in the video and then play the role of an AI assistant that provides information based on the video content." - "------" - "##TASK: " - "Users will provide some information about a video, and you will generate a conversation-like question and answers pair specifically focusing on the temporal sequence of events in the video. " - "The question should be designed to extract temporal sequence information directly from the given information, so that the provided information or parts of it can serve as the answer. " - "Generate ONE descriptive and conversational style question and detailed answer based on the given information, specifically related to the temporal understanding in the video." - "------" - "##INSTRUCTIONS:" - "- The question must be like a human conversation and directly related to the temporal sequence of events in the video. " - "- The question should be designed to extract temporal sequence information DIRECTLY from the given information, so that it or parts of it can serve as the answer. " - "- The answer must be detailed and descriptive, and should directly reference the information provided with respect to the temporal sequence of events in the video." - }, - { - "role": "user", - "content": - f"The user input is: {caption}. " - "Please generate the response in the form of a Python dictionary string with keys 'Q' for question and 'A' for answer. Each corresponding value should be the question and answer text respectively. " - "For example, your response should look like this: {'Q': 'Your question here...', 'A': 'Your answer here...'}. " - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(response_dict, f) - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - # Read ground truth captions. - gt_captions = {} - gt_files = os.listdir(args.gt_caption_folder) - for file in gt_files: - with open(os.path.join(args.gt_caption_folder, file), mode='r', encoding='utf-8-sig') as f: - caption = f.read().replace('\n', '').replace('‘', "'").replace('’', "'") - video_id = file[:-4] - gt_captions[video_id] = caption - - caption_files = [f"{video_id}.json" for video_id in gt_captions.keys()] - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. 
- completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(gt_captions, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. - with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine qa pairs into single file when individual qa generation completes - all_data = {} - for filename in os.listdir(output_dir): - if filename.endswith(".json"): - with open(os.path.join(output_dir, filename)) as f: - key = filename[:-5] - all_data[key] = json.load(f) - - with open(args.output_json, 'w') as f: - json.dump(all_data, f, indent=4) - - -if __name__ == "__main__": - main() diff --git a/test_benchmark/quantitative_evaluation/evaluate_activitynet_qa.py b/test_benchmark/quantitative_evaluation/evaluate_activitynet_qa.py deleted file mode 100644 index 581eb7b..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_activitynet_qa.py +++ /dev/null @@ -1,207 +0,0 @@ -import openai -import os -import argparse -import json -import ast -from multiprocessing.pool import Pool - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(prediction_set, caption_files, output_dir): - """ - Evaluates question and answer pairs using GPT-3 - Returns a score for correctness. - """ - for file in caption_files: - key = file[:-5] # Strip file extension - qa_set = prediction_set[key] - question = qa_set['q'] - answer = qa_set['a'] - pred = qa_set['pred'] - try: - # Compute the correctness score - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. " - "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:" - "------" - "##INSTRUCTIONS: " - "- Focus on the meaningful match between the predicted answer and the correct answer.\n" - "- Consider synonyms or paraphrases as valid matches.\n" - "- Evaluate the correctness of the prediction compared to the answer." 
- }, - { - "role": "user", - "content": - "Please evaluate the following video-based question-answer pair:\n\n" - f"Question: {question}\n" - f"Correct Answer: {answer}\n" - f"Predicted Answer: {pred}\n\n" - "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. " - "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING." - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " - "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - result_qa_pair = [response_dict, qa_set] - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(result_qa_pair, f) - - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - file = open(args.pred_path) - pred_contents = json.load(file) - - # Dictionary to store the count of occurrences for each video_id - video_id_counts = {} - new_pred_contents = [] - - # Iterate through each sample in pred_contents - for sample in pred_contents: - video_id = sample['video_name'] - if video_id in video_id_counts: - video_id_counts[video_id] += 1 - else: - video_id_counts[video_id] = 0 - - # Create a new sample with the modified key - new_sample = sample - new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" - new_pred_contents.append(new_sample) - - # Generating list of id's and corresponding files - id_list = [x['video_name'] for x in new_pred_contents] - caption_files = [f"{id}.json" for id in id_list] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Preparing dictionary of question-answer sets - prediction_set = {} - for sample in new_pred_contents: - id = sample['video_name'] - question = sample['Q'] - answer = sample['A'] - pred = sample['pred'] - qa_set = {"q": question, "a": answer, "pred": pred} - prediction_set[id] = qa_set - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - # Break the loop when there are no incomplete files - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(prediction_set, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. 
- with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine all the processed files into one - combined_contents = {} - json_path = args.output_json - - # Iterate through json files - for file_name in os.listdir(output_dir): - if file_name.endswith(".json"): - file_path = os.path.join(output_dir, file_name) - with open(file_path, "r") as json_file: - content = json.load(json_file) - combined_contents[file_name[:-5]] = content - - # Write combined content to a json file - with open(json_path, "w") as json_file: - json.dump(combined_contents, json_file) - print("All evaluation completed!") - - # Calculate average score and accuracy - score_sum = 0 - count = 0 - yes_count = 0 - no_count = 0 - for key, result in combined_contents.items(): - # Computing score - count += 1 - try : - score_match = result[0]['score'] - score = int(score_match) - score_sum += score - except: - print("Score not found for", key) - continue - - # Computing accuracy - try: - pred = result[0]['pred'] - if "yes" in pred.lower(): - yes_count += 1 - elif "no" in pred.lower(): - no_count += 1 - except: - print("Prediction not found for", key) - continue - - average_score = score_sum / count - accuracy = yes_count / (yes_count + no_count) - print("Yes count:", yes_count) - print("No count:", no_count) - print("Accuracy:", accuracy) - print("Average score:", average_score) - - -if __name__ == "__main__": - main() - diff --git a/test_benchmark/quantitative_evaluation/evaluate_benchmark.sh b/test_benchmark/quantitative_evaluation/evaluate_benchmark.sh deleted file mode 100644 index 2151e67..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_benchmark.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Define common arguments for all scripts -# Without subtitles -PRED_GENERIC="" -PRED_TEMPORAL="test_benchmark/Other models/video_llava_temporal.json" -PRED_CONSISTENCY="test_benchmark/Other models/video_llava_consistency.json" -OUTPUT_DIR="/ibex/ai/home/ataallka/minigpt_video_results/quantitative_evaluation/ckpt_52_without_subtitles" -rm -rf $OUTPUT_DIR -# # With subtitles -# PRED_GENERIC="results/ckpt_52_video_chatgpt_generic_subtitles.json" -# PRED_TEMPORAL="results/ckpt_52_video_chatgpt_temporal_subtitles.json" -# PRED_CONSISTENCY="results/ckpt_52_video_chatgpt_consistency_subtitles.json" -# OUTPUT_DIR="/ibex/ai/home/ataallka/minigpt_video_results/quantitative_evaluation/ckpt_52_with_subtitles" -# rm -rf $OUTPUT_DIR - -API_KEY="open_ai_key" -NUM_TASKS=64 - -# Run the "correctness" evaluation script -python evaluate_benchmark_1_correctness.py \ - --pred_path "${PRED_GENERIC}" \ - --output_dir "${OUTPUT_DIR}/correctness_eval" \ - --output_json "${OUTPUT_DIR}/correctness_results.json" \ - --api_key $API_KEY \ - --num_tasks $NUM_TASKS - -# Run the "detailed orientation" evaluation script -python evaluate_benchmark_2_detailed_orientation.py \ - --pred_path "${PRED_GENERIC}" \ - --output_dir "${OUTPUT_DIR}/detailed_eval" \ - --output_json "${OUTPUT_DIR}/detailed_orientation_results.json" \ - --api_key $API_KEY \ - --num_tasks $NUM_TASKS - -# Run the "contextual understanding" evaluation script -python evaluate_benchmark_3_context.py \ - --pred_path "${PRED_GENERIC}" \ - --output_dir "${OUTPUT_DIR}/context_eval" \ - --output_json "${OUTPUT_DIR}/contextual_understanding_results.json" \ - --api_key $API_KEY \ - --num_tasks $NUM_TASKS - -# Run the "temporal understanding" evaluation script -python evaluate_benchmark_4_temporal.py \ - --pred_path "${PRED_TEMPORAL}" \ 
- --output_dir "${OUTPUT_DIR}/temporal_eval" \ - --output_json "${OUTPUT_DIR}/temporal_understanding_results.json" \ - --api_key $API_KEY \ - --num_tasks $NUM_TASKS - -# Run the "consistency" evaluation script -python evaluate_benchmark_5_consistency.py \ - --pred_path "${PRED_CONSISTENCY}" \ - --output_dir "${OUTPUT_DIR}/consistency_eval" \ - --output_json "${OUTPUT_DIR}/consistency_results.json" \ - --api_key $API_KEY \ - --num_tasks $NUM_TASKS - - -echo "All evaluations completed!" diff --git a/test_benchmark/quantitative_evaluation/evaluate_benchmark_1_correctness.py b/test_benchmark/quantitative_evaluation/evaluate_benchmark_1_correctness.py deleted file mode 100644 index 6ebae90..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_benchmark_1_correctness.py +++ /dev/null @@ -1,186 +0,0 @@ -import openai -import os -import argparse -import json -import ast -from multiprocessing.pool import Pool - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(prediction_set, caption_files, output_dir): - """ - Evaluates question and answer pairs using GPT-3 - Returns a score for correctness. - """ - for file in caption_files: - key = file[:-5] # Strip file extension - qa_set = prediction_set[key] - question = qa_set['q'] - answer = qa_set['a'] - pred = qa_set['pred'] - try: - # Compute the correctness score - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. " - "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:" - "------" - "##INSTRUCTIONS: " - "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n" - "- The predicted answer must be factually accurate and align with the video content.\n" - "- Consider synonyms or paraphrases as valid matches.\n" - "- Evaluate the factual accuracy of the prediction compared to the answer." - }, - { - "role": "user", - "content": - "Please evaluate the following video-based question-answer pair:\n\n" - f"Question: {question}\n" - f"Correct Answer: {answer}\n" - f"Predicted Answer: {pred}\n\n" - "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. " - "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING." - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " - "For example, your response should look like this: {''score': 4.8}." 
- } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - result_qa_pair = [response_dict, qa_set] - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(result_qa_pair, f) - - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - file = open(args.pred_path) - pred_contents = json.load(file) - - # Dictionary to store the count of occurrences for each video_id - video_id_counts = {} - new_pred_contents = [] - - # Iterate through each sample in pred_contents - for sample in pred_contents: - video_id = sample['video_name'] - if video_id in video_id_counts: - video_id_counts[video_id] += 1 - else: - video_id_counts[video_id] = 0 - - # Create a new sample with the modified key - new_sample = sample - new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" - new_pred_contents.append(new_sample) - - # Generating list of id's and corresponding files - id_list = [x['video_name'] for x in new_pred_contents] - caption_files = [f"{id}.json" for id in id_list] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Preparing dictionary of question-answer sets - prediction_set = {} - for sample in new_pred_contents: - id = sample['video_name'] - question = sample['Q'] - answer = sample['A'] - pred = sample['pred'] - qa_set = {"q": question, "a": answer, "pred": pred} - prediction_set[id] = qa_set - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - # Break the loop when there are no incomplete files - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(prediction_set, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. 
- with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine all the processed files into one - combined_contents = {} - json_path = args.output_json - - # Iterate through json files - for file_name in os.listdir(output_dir): - if file_name.endswith(".json"): - file_path = os.path.join(output_dir, file_name) - with open(file_path, "r") as json_file: - content = json.load(json_file) - combined_contents[file_name[:-5]] = content - - # Write combined content to a json file - with open(json_path, "w") as json_file: - json.dump(combined_contents, json_file) - print("All evaluation completed!") - - # Calculate average score - score_sum = 0 - count = 0 - for key, result in combined_contents.items(): - count += 1 - score_match = result[0]['score'] - score = int(score_match) - score_sum += score - average_score = score_sum / count - - print("Average score for correctness:", average_score) - - -if __name__ == "__main__": - main() - diff --git a/test_benchmark/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py b/test_benchmark/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py deleted file mode 100644 index 634bda0..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py +++ /dev/null @@ -1,186 +0,0 @@ -import openai -import os -import argparse -import json -import ast -from multiprocessing.pool import Pool - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(prediction_set, caption_files, output_dir): - """ - Evaluates question and answer pairs using GPT-3 and - returns a score for detailed orientation. - """ - for file in caption_files: - key = file[:-5] # Strip file extension - qa_set = prediction_set[key] - question = qa_set['q'] - answer = qa_set['a'] - pred = qa_set['pred'] - try: - # Compute the detailed-orientation score - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. " - "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:" - "------" - "##INSTRUCTIONS: " - "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n" - "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n" - "- Consider synonyms or paraphrases as valid matches.\n" - "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity." 
- }, - { - "role": "user", - "content": - "Please evaluate the following video-based question-answer pair:\n\n" - f"Question: {question}\n" - f"Correct Answer: {answer}\n" - f"Predicted Answer: {pred}\n\n" - "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. " - "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING." - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " - "For example, your response should look like this: {''score': 4.8}." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - result_qa_pair = [response_dict, qa_set] - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(result_qa_pair, f) - - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - file = open(args.pred_path) - pred_contents = json.load(file) - - # Dictionary to store the count of occurrences for each video_id - video_id_counts = {} - new_pred_contents = [] - - # Iterate through each sample in pred_contents - for sample in pred_contents: - video_id = sample['video_name'] - if video_id in video_id_counts: - video_id_counts[video_id] += 1 - else: - video_id_counts[video_id] = 0 - - # Create a new sample with the modified key - new_sample = sample - new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" - new_pred_contents.append(new_sample) - - # Generating list of id's and corresponding files - id_list = [x['video_name'] for x in new_pred_contents] - caption_files = [f"{id}.json" for id in id_list] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Preparing dictionary of question-answer sets - prediction_set = {} - for sample in new_pred_contents: - id = sample['video_name'] - question = sample['Q'] - answer = sample['A'] - pred = sample['pred'] - qa_set = {"q": question, "a": answer, "pred": pred} - prediction_set[id] = qa_set - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - # Break the loop when there are no incomplete files - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(prediction_set, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. 
- with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine all the processed files into one - combined_contents = {} - json_path = args.output_json - - # Iterate through json files - for file_name in os.listdir(output_dir): - if file_name.endswith(".json"): - file_path = os.path.join(output_dir, file_name) - with open(file_path, "r") as json_file: - content = json.load(json_file) - combined_contents[file_name[:-5]] = content - - # Write combined content to a json file - with open(json_path, "w") as json_file: - json.dump(combined_contents, json_file) - print("All evaluation completed!") - - # Calculate average score - score_sum = 0 - count = 0 - for key, result in combined_contents.items(): - count += 1 - score_match = result[0]['score'] - score = int(score_match) - score_sum += score - average_score = score_sum / count - - print("Average score for detailed orientation:", average_score) - - -if __name__ == "__main__": - main() - diff --git a/test_benchmark/quantitative_evaluation/evaluate_benchmark_3_context.py b/test_benchmark/quantitative_evaluation/evaluate_benchmark_3_context.py deleted file mode 100644 index 0058f75..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_benchmark_3_context.py +++ /dev/null @@ -1,186 +0,0 @@ -import openai -import os -import argparse -import json -import ast -from multiprocessing.pool import Pool - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(prediction_set, caption_files, output_dir): - """ - Evaluates question and answer pairs using GPT-3 and - returns a score for contextual understanding. - """ - for file in caption_files: - key = file[:-5] # Strip file extension - qa_set = prediction_set[key] - question = qa_set['q'] - answer = qa_set['a'] - pred = qa_set['pred'] - try: - # Compute the contextual understanding score - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. " - "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:" - "------" - "##INSTRUCTIONS: " - "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n" - "- The predicted answer must capture the main themes and sentiments of the video.\n" - "- Consider synonyms or paraphrases as valid matches.\n" - "- Provide your evaluation of the contextual understanding of the prediction compared to the answer." 
- }, - { - "role": "user", - "content": - "Please evaluate the following video-based question-answer pair:\n\n" - f"Question: {question}\n" - f"Correct Answer: {answer}\n" - f"Predicted Answer: {pred}\n\n" - "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. " - "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING." - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " - "For example, your response should look like this: {''score': 4.8}." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - result_qa_pair = [response_dict, qa_set] - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(result_qa_pair, f) - - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - file = open(args.pred_path) - pred_contents = json.load(file) - - # Dictionary to store the count of occurrences for each video_id - video_id_counts = {} - new_pred_contents = [] - - # Iterate through each sample in pred_contents - for sample in pred_contents: - video_id = sample['video_name'] - if video_id in video_id_counts: - video_id_counts[video_id] += 1 - else: - video_id_counts[video_id] = 0 - - # Create a new sample with the modified key - new_sample = sample - new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" - new_pred_contents.append(new_sample) - - # Generating list of id's and corresponding files - id_list = [x['video_name'] for x in new_pred_contents] - caption_files = [f"{id}.json" for id in id_list] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Preparing dictionary of question-answer sets - prediction_set = {} - for sample in new_pred_contents: - id = sample['video_name'] - question = sample['Q'] - answer = sample['A'] - pred = sample['pred'] - qa_set = {"q": question, "a": answer, "pred": pred} - prediction_set[id] = qa_set - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - # Break the loop when there are no incomplete files - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(prediction_set, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. 
- with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine all the processed files into one - combined_contents = {} - json_path = args.output_json - - # Iterate through json files - for file_name in os.listdir(output_dir): - if file_name.endswith(".json"): - file_path = os.path.join(output_dir, file_name) - with open(file_path, "r") as json_file: - content = json.load(json_file) - combined_contents[file_name[:-5]] = content - - # Write combined content to a json file - with open(json_path, "w") as json_file: - json.dump(combined_contents, json_file) - print("All evaluation completed!") - - # Calculate average score - score_sum = 0 - count = 0 - for key, result in combined_contents.items(): - count += 1 - score_match = result[0]['score'] - score = int(score_match) - score_sum += score - average_score = score_sum / count - - print("Average score for contextual understanding:", average_score) - - -if __name__ == "__main__": - main() - diff --git a/test_benchmark/quantitative_evaluation/evaluate_benchmark_4_temporal.py b/test_benchmark/quantitative_evaluation/evaluate_benchmark_4_temporal.py deleted file mode 100644 index 33e8db0..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_benchmark_4_temporal.py +++ /dev/null @@ -1,185 +0,0 @@ -import openai -import os -import argparse -import json -import ast -from multiprocessing.pool import Pool - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(prediction_set, caption_files, output_dir): - """ - Evaluates question and answer pairs using GPT-3 and - returns a score for temporal understanding. - """ - for file in caption_files: - key = file[:-5] # Strip file extension - qa_set = prediction_set[key] - question = qa_set['q'] - answer = qa_set['a'] - pred = qa_set['pred'] - try: - # Compute the temporal understanding score - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. " - "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:" - "------" - "##INSTRUCTIONS: " - "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n" - "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n" - "- Evaluate the temporal accuracy of the prediction compared to the answer." 
- }, - { - "role": "user", - "content": - "Please evaluate the following video-based question-answer pair:\n\n" - f"Question: {question}\n" - f"Correct Answer: {answer}\n" - f"Predicted Answer: {pred}\n\n" - "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. " - "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING." - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " - "For example, your response should look like this: {''score': 4.8}." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - result_qa_pair = [response_dict, qa_set] - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(result_qa_pair, f) - - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - file = open(args.pred_path) - pred_contents = json.load(file) - - # Dictionary to store the count of occurrences for each video_id - video_id_counts = {} - new_pred_contents = [] - - # Iterate through each sample in pred_contents - for sample in pred_contents: - video_id = sample['video_name'] - if video_id in video_id_counts: - video_id_counts[video_id] += 1 - else: - video_id_counts[video_id] = 0 - - # Create a new sample with the modified key - new_sample = sample - new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" - new_pred_contents.append(new_sample) - - # Generating list of id's and corresponding files - id_list = [x['video_name'] for x in new_pred_contents] - caption_files = [f"{id}.json" for id in id_list] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Preparing dictionary of question-answer sets - prediction_set = {} - for sample in new_pred_contents: - id = sample['video_name'] - question = sample['Q'] - answer = sample['A'] - pred = sample['pred'] - qa_set = {"q": question, "a": answer, "pred": pred} - prediction_set[id] = qa_set - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. - incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - # Break the loop when there are no incomplete files - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(prediction_set, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. 
- with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine all the processed files into one - combined_contents = {} - json_path = args.output_json - - # Iterate through json files - for file_name in os.listdir(output_dir): - if file_name.endswith(".json"): - file_path = os.path.join(output_dir, file_name) - with open(file_path, "r") as json_file: - content = json.load(json_file) - combined_contents[file_name[:-5]] = content - - # Write combined content to a json file - with open(json_path, "w") as json_file: - json.dump(combined_contents, json_file) - print("All evaluation completed!") - - # Calculate average score - score_sum = 0 - count = 0 - for key, result in combined_contents.items(): - count += 1 - score_match = result[0]['score'] - score = int(score_match) - score_sum += score - average_score = score_sum / count - - print("Average score temporal understanding:", average_score) - - -if __name__ == "__main__": - main() - diff --git a/test_benchmark/quantitative_evaluation/evaluate_benchmark_5_consistency.py b/test_benchmark/quantitative_evaluation/evaluate_benchmark_5_consistency.py deleted file mode 100644 index 3352c42..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_benchmark_5_consistency.py +++ /dev/null @@ -1,193 +0,0 @@ -import openai -import os -import argparse -import json -import ast -from multiprocessing.pool import Pool - - -def parse_args(): - parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3") - parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.") - parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.") - parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.") - parser.add_argument("--api_key", required=True, help="OpenAI API key.") - parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.") - args = parser.parse_args() - return args - - -def annotate(prediction_set, caption_files, output_dir): - """ - Evaluates question and answer pairs using GPT-3 and - returns a score for consistency. - """ - for file in caption_files: - key = file[:-5] # Strip file extension - qa_set = prediction_set[key] - question1 = qa_set['q1'] - question2 = qa_set['q2'] - answer = qa_set['a'] - pred1 = qa_set['pred1'] - pred2 = qa_set['pred2'] - try: - # Compute the consistency score - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": - "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. " - "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ." - "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:" - "------" - "##INSTRUCTIONS: " - "- Focus on the consistency between the two predicted answers and the correct answer. 
Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n" - "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n" - "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n" - "- Evaluate the consistency of the two predicted answers compared to the correct answer." - }, - { - "role": "user", - "content": - "Please evaluate the following video-based question-answer pair:\n\n" - f"Question 1: {question1}\n" - f"Question 2: {question2}\n" - f"Correct Answer: {answer}\n" - f"Predicted Answer to Question 1: {pred1}\n" - f"Predicted Answer to Question 2: {pred2}\n\n" - "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. " - "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING." - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " - "For example, your response should look like this: {''score': 4.8}." - } - ] - ) - # Convert response to a Python dictionary. - response_message = completion["choices"][0]["message"]["content"] - response_dict = ast.literal_eval(response_message) - result_qa_pair = [response_dict, qa_set] - - # Save the question-answer pairs to a json file. - with open(f"{output_dir}/{key}.json", "w") as f: - json.dump(result_qa_pair, f) - - except Exception as e: - print(f"Error processing file '{key}': {e}") - - -def main(): - """ - Main function to control the flow of the program. - """ - # Parse arguments. - args = parse_args() - - file = open(args.pred_path) - pred_contents = json.load(file) - - # Dictionary to store the count of occurrences for each video_id - video_id_counts = {} - new_pred_contents = [] - - # Iterate through each sample in pred_contents - for sample in pred_contents: - video_id = sample['video_name'] - if video_id in video_id_counts: - video_id_counts[video_id] += 1 - else: - video_id_counts[video_id] = 0 - - # Create a new sample with the modified key - new_sample = sample - new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" - new_pred_contents.append(new_sample) - - # Generating list of id's and corresponding files - id_list = [x['video_name'] for x in new_pred_contents] - caption_files = [f"{id}.json" for id in id_list] - - output_dir = args.output_dir - # Generate output directory if not exists. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Preparing dictionary of question-answer sets - prediction_set = {} - for sample in new_pred_contents: - id = sample['video_name'] - question1 = sample['Q1'] - question2 = sample['Q1'] - answer = sample['A'] - pred1 = sample['pred1'] - pred2 = sample['pred2'] - qa_set = {"q1": question1, "q2": question2, "a": answer, "pred1": pred1, "pred2": pred2} - prediction_set[id] = qa_set - - # Set the OpenAI API key. - openai.api_key = args.api_key - num_tasks = args.num_tasks - - # While loop to ensure that all captions are processed. - while True: - try: - # Files that have not been processed yet. - completed_files = os.listdir(output_dir) - print(f"completed_files: {len(completed_files)}") - - # Files that have not been processed yet. 
- incomplete_files = [f for f in caption_files if f not in completed_files] - print(f"incomplete_files: {len(incomplete_files)}") - - # Break the loop when there are no incomplete files - if len(incomplete_files) == 0: - break - if len(incomplete_files) <= num_tasks: - num_tasks = 1 - - # Split tasks into parts. - part_len = len(incomplete_files) // num_tasks - all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)] - task_args = [(prediction_set, part, args.output_dir) for part in all_parts] - - # Use a pool of workers to process the files in parallel. - with Pool() as pool: - pool.starmap(annotate, task_args) - - except Exception as e: - print(f"Error: {e}") - - # Combine all the processed files into one - combined_contents = {} - json_path = args.output_json - - # Iterate through json files - for file_name in os.listdir(output_dir): - if file_name.endswith(".json"): - file_path = os.path.join(output_dir, file_name) - with open(file_path, "r") as json_file: - content = json.load(json_file) - combined_contents[file_name[:-5]] = content - - # Write combined content to a json file - with open(json_path, "w") as json_file: - json.dump(combined_contents, json_file) - print("All evaluation completed!") - - # Calculate average score - score_sum = 0 - count = 0 - for key, result in combined_contents.items(): - count += 1 - score_match = result[0]['score'] - score = int(score_match) - score_sum += score - average_score = score_sum / count - - print("Average score for consistency:", average_score) - - -if __name__ == "__main__": - main() - diff --git a/test_benchmark/quantitative_evaluation/evaluate_zeroshot.sh b/test_benchmark/quantitative_evaluation/evaluate_zeroshot.sh deleted file mode 100644 index bb79f0b..0000000 --- a/test_benchmark/quantitative_evaluation/evaluate_zeroshot.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -#SBATCH --partition=batch - - -#SBATCH --job-name=mistral_all%j -#SBATCH --output=mistral_all%j.out -#SBATCH --error=mistral_all%j.err -#SBATCH --time=0-10:00:00 -#SBATCH --mem=64G -#SBATCH --nodes=1 -## run the application: - -# PRED="./../../results/cmd_webvid_video_instruct_checkpoint_0_Video_validation_Dataset_subtitles.json" -# OUTPUT_DIR="./../output/Video_validation_Dataset/cmd_webvid_video_instruct_checkpoint_0_Video_validation_Dataset_subtitles" -# # rm -rf $OUTPUT_DIR -# API_KEY="api_key" -# NUM_TASKS=128 - - - -PRED="pred_path" -OUTPUT_DIR="output_dir" -API_KEY="api_key" -NUM_TASKS=128 - - -python evaluate_activitynet_qa.py \ - --pred_path ${PRED} \ - --output_dir "${OUTPUT_DIR}/fewshot_accuracy" \ - --output_json "${OUTPUT_DIR}/fewshot_accuracy_results.json"\ - --api_key $API_KEY \ - --num_tasks $NUM_TASKS - -echo pred_path: $PRED \ No newline at end of file
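
All of the removed generation and evaluation scripts target the pre-1.0 openai Python SDK (openai.api_key = ... plus openai.ChatCompletion.create, with the response indexed like a dict) and parse the model's dictionary-string reply with ast.literal_eval. For anyone reconstructing them against the current SDK, a minimal sketch of the per-file grading call, assuming openai>=1.0 and the same reply format, could look like the following; the evaluate_one helper and the OPENAI_API_KEY environment variable are illustrative and not part of the deleted code:

import ast
import os

from openai import OpenAI  # requires openai>=1.0

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def evaluate_one(question: str, answer: str, pred: str) -> dict:
    """Grade one QA pair with GPT-3.5; mirrors the loop body of the removed annotate() functions."""
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are an intelligent chatbot designed for evaluating the "
                           "correctness of generative outputs for question-answer pairs.",
            },
            {
                "role": "user",
                "content": f"Question: {question}\n"
                           f"Correct Answer: {answer}\n"
                           f"Predicted Answer: {pred}\n\n"
                           "Reply only with a Python dictionary string like "
                           "{'pred': 'yes', 'score': 4}.",
            },
        ],
    )
    # The removed scripts convert the dictionary-string reply the same way.
    return ast.literal_eval(completion.choices[0].message.content)

The deleted scripts run the equivalent loop inside a multiprocessing Pool, retry until a JSON file exists in output_dir for every video, and then merge the per-video results into the single output_json file.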