diff --git a/.gitignore b/.gitignore
index c899eeb..58b99ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -170,3 +170,4 @@ demotask.md
 # benchmark reports
 benchmark/reports/
+!benchmark/reports/*_reference.md
diff --git a/README.md b/README.md
index bb367e5..0c5480a 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ AI-Powered End-to-End Task Implementation & blazingly fast Codebase-to-LLM Conte
 [Templates](#-templates) •
 [Configuration](#-configuration) •
 [API](#-api) •
-[Benchmarking](#-benchmarking) •
+[Benchmarking](#benchmarking) •
 [Contributing](#-contributing) •
 [Roadmap](#-roadmap) •
 [FAQ](#-faq)
@@ -28,7 +28,7 @@ AI-Powered End-to-End Task Implementation & blazingly fast Codebase-to-LLM Conte
 CodeWhisper is a powerful tool that bridges the gap between your codebase and Large Language Models (LLMs). It serves two primary functions:
 
-1. **AI-Powered End-to-End Task Implementation**: Tackle complex, codebase-spanning tasks with ease. CodeWhisper doesn't just suggest snippets; it plans, generates, and applies comprehensive code changes across your entire project, from backend logic to frontend integration.
+1. **AI-Powered End-to-End Task Implementation**: Tackle complex, codebase-spanning tasks with ease. CodeWhisper doesn't just suggest snippets; it plans, generates, and applies comprehensive code changes across your entire project, from backend logic to frontend integration. CodeWhisper's generations are state of the art (SOTA) and outperform other AI code-generation tools in benchmarks. See [Benchmarking](#benchmarking) for more details.
 2. **Precision-Guided Context Curation for LLMs**: Harness the power of human insight to feed AI exactly what it needs. Quickly transform carefully selected parts of your codebase into rich, relevant context for LLMs, ensuring more accurate and project-aligned results.
@@ -112,26 +112,27 @@ While CodeWhisper excels at performing individual coding tasks and even large fe
 
 ## ✨ Key Features
 
-| Feature | Description |
-| ----------------------------------------------- | ----------------------------------------------------------------- |
-| 🧠 AI-powered task planning and code generation | Leverage AI to plan and implement complex coding tasks |
-| 🔄 Full git integration | Version control of AI-generated changes |
-| 🔄 Diff-based code modifications | Handle larger edits within output token limits |
-| 🌍 Support for various LLM providers | Compatible with Anthropic, OpenAI, Ollama and Groq |
-| 🔐 Support for local models | Use local models via Ollama |
-| 🚀 Blazingly fast code processing | Concurrent workers for improved performance |
-| 🎯 Customizable file filtering and exclusion | Fine-tune which files to include in the context |
-| 📊 Intelligent caching | Improved performance through smart caching |
-| 🔧 Extensible template system | Interactive variable prompts for flexible output |
-| 🖊️ Custom variables in templates | Support for single-line and multi-line custom variables |
-| 💾 Value caching | Quick template reuse with cached values |
-| 🖥️ CLI and programmatic API | Use CodeWhisper in scripts or as a library |
-| 🔒 Respect for .gitignore | Option to use custom include and exclude globs |
-| 🌈 Full language support | Compatible with all text-based file types |
-| 🤖 Interactive mode | Granular file selection and template customization |
-| ⚡ Optimized for large repositories | Efficient processing of extensive codebases |
-| 📝 Detailed logging | Log AI prompts, responses, and parsing results |
-| 🔗 GitHub integration | Fetch and work with issues (see [Configuration](#-configuration)) |
+| Feature | Description |
+| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
+| 🧠 AI-powered task planning and code generation | Leverage AI to plan and implement complex coding tasks |
+| 🚀 SOTA generations | CodeWhisper's generations are state of the art (SOTA) and outperform other AI code-generation tools in benchmarks. See [Benchmarking](#benchmarking) for more details. |
+| 🔄 Full git integration | Version control of AI-generated changes |
+| 🔄 Diff-based code modifications | Handle larger edits within output token limits |
+| 🌍 Support for various LLM providers | Compatible with Anthropic, OpenAI, Ollama and Groq |
+| 🔐 Support for local models | Use local models via Ollama |
+| 🚀 Blazingly fast code processing | Concurrent workers for improved performance |
+| 🎯 Customizable file filtering and exclusion | Fine-tune which files to include in the context |
+| 📊 Intelligent caching | Improved performance through smart caching |
+| 🔧 Extensible template system | Interactive variable prompts for flexible output |
+| 🖊️ Custom variables in templates | Support for single-line and multi-line custom variables |
+| 💾 Value caching | Quick template reuse with cached values |
+| 🖥️ CLI and programmatic API | Use CodeWhisper in scripts or as a library |
+| 🔒 Respect for .gitignore | Option to use custom include and exclude globs |
+| 🌈 Full language support | Compatible with all text-based file types |
+| 🤖 Interactive mode | Granular file selection and template customization |
+| ⚡ Optimized for large repositories | Efficient processing of extensive codebases |
+| 📝 Detailed logging | Log AI prompts, responses, and parsing results |
+| 🔗 GitHub integration | Fetch and work with issues (see [Configuration](#-configuration)) |
 
 ## 📺 Video
 
@@ -387,7 +388,7 @@ For more detailed instructions on using the GitHub integration and other CodeWhi
 
 CodeWhisper can be used programmatically in your Node.js projects. For detailed API documentation and examples, please refer to [USAGE.md](USAGE.md).
 
-## 🏋️ Benchmarking
+## Benchmarking
 
 CodeWhisper includes a benchmarking tool to evaluate its performance on Exercism Python exercises. This tool allows you to assess the capabilities of different AI models and configurations.
 
@@ -424,6 +425,18 @@ Reports are saved in `benchmark/reports/` with timestamped filenames.
 
 For full details on running benchmarks, interpreting results, and available options, please refer to the [Benchmark README](./benchmark/README.md).
 
+### Results
+
+CodeWhisper's performance has been evaluated across different models using the Exercism Python exercises. Below is a summary of the benchmark results:
+
+| Model | Tests Passed | Time (s) | Cost ($) | Command |
+| -------------------------- | ------------ | -------- | -------- | ---------------------------------------------------- |
+| claude-3-5-sonnet-20240620 | 80.27% | 1619.49 | 3.4000 | `./benchmark/run_benchmark.sh --workers 5 --no-plan` |
+
+These results provide insights into the efficiency and accuracy of different models when used with CodeWhisper. The "Tests Passed" percentage indicates the proportion of Exercism tests successfully completed, while the time and cost metrics offer a view of the resource requirements for each model.
+
+As we continue to run benchmarks with various models and configurations, this table will be updated to provide a comprehensive comparison, helping users make informed decisions about which model might best suit their needs.
+
 ## 🤝 Contributing
 
 We welcome contributions to CodeWhisper! Please read our [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests.
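The "Tests Passed" percentage in the README results above is aggregated over individual Exercism test cases rather than over whole exercises. As a minimal TypeScript sketch of how such a figure is derived from per-exercise results — the field names mirror `BenchmarkResult`/`SummaryStats` from `benchmark/types.ts` in this patch, but the helper itself is illustrative and not part of the diff:

```ts
interface ExerciseResult {
  test_passed: boolean; // true when every test in the exercise passed
  passed_tests: number; // test cases that passed for this exercise
  total_tests: number; // test cases discovered for this exercise
}

// Aggregate per-exercise counts into the summary percentages reported above.
function summarize(results: ExerciseResult[]) {
  const totals = results.reduce(
    (acc, r) => ({
      passedExercises: acc.passedExercises + (r.test_passed ? 1 : 0),
      passedTests: acc.passedTests + r.passed_tests,
      totalTests: acc.totalTests + r.total_tests,
    }),
    { passedExercises: 0, passedTests: 0, totalTests: 0 },
  );

  const testPassRate =
    totals.totalTests > 0
      ? ((totals.passedTests / totals.totalTests) * 100).toFixed(2)
      : '0.00';

  return { ...totals, testPassRate }; // e.g. "80.27" for the Claude run above
}
```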
diff --git a/benchmark/README.md b/benchmark/README.md
index 811cccd..4a72721 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -7,6 +7,21 @@ This benchmark tool is designed to evaluate the performance of CodeWhisper on Ex
 - Running the full benchmark will use a significant amount of tokens.
 - Too many concurrent workers is likely to cause rate limiting issues.
 
+## Results
+
+CodeWhisper's performance has been evaluated across different models using the Exercism Python exercises. Below is a summary of the benchmark results:
+
+| Model | Tests Passed | Time (s) | Cost ($) | Command |
+| -------------------------- | ------------ | -------- | -------- | ------------------------------------------------------------------------------ |
+| claude-3-5-sonnet-20240620 | 80.27% | 1619.49 | 3.4000 | `./benchmark/run_benchmark.sh --workers 5 --no-plan` |
+| gpt-4o-2024-08-06 | 81.51% | 986.68 | 1.6800 | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model gpt-4o-2024-08-06` |
+
+The full reports used to generate these results are available in the `benchmark/reports/` directory.
+
+These results provide insights into the efficiency and accuracy of different models when used with CodeWhisper. The "Tests Passed" percentage indicates the proportion of Exercism tests successfully completed, while the time and cost metrics offer a view of the resource requirements for each model.
+
+As we continue to run benchmarks with various models and configurations, this table will be updated to provide a comprehensive comparison, helping users make informed decisions about which model might best suit their needs.
+
 ## Usage
 
 1. Build the Docker image:
diff --git a/benchmark/benchmark.ts b/benchmark/benchmark.ts
index 83862f5..016cbb4 100644
--- a/benchmark/benchmark.ts
+++ b/benchmark/benchmark.ts
@@ -1,18 +1,25 @@
 import * as fs from 'node:fs';
 import * as path from 'node:path';
+import { setTimeout } from 'node:timers/promises';
 import ora from 'ora';
 import pLimit from 'p-limit';
 import type { BenchmarkResult, SummaryStats } from './types';
 import { cloneRepo, runExercise } from './utils';
 
 const DEBUG_MODE = process.env.DEBUG_MODE === 'true';
-const DEBUG_LIMIT = 10;
+const DEBUG_SKIP = DEBUG_MODE ? 113 : 0; // Skip the first 113 exercises in debug mode
 const EXERCISM_REPO = 'https://github.com/exercism/python.git';
 const REPO_DIR = '/tmp/exercism-python';
 const EXERCISES_DIR = path.join(REPO_DIR, 'exercises', 'practice');
 
 async function main(): Promise<void> {
+  console.log('Main function started');
+  console.log('Process ID:', process.pid);
+  console.log('Node version:', process.version);
+  console.log('Current working directory:', process.cwd());
+  console.log('Debug mode:', DEBUG_MODE ? 'ON' : 'OFF');
+
   try {
     const model = process.env.MODEL || 'claude-3-5-sonnet-20240620';
     const concurrentWorkers = Number.parseInt(
@@ -36,11 +43,12 @@ async function main(): Promise<void> {
     // Get list of exercises
     let exercises = fs
       .readdirSync(EXERCISES_DIR)
-      .map((dir) => path.join(EXERCISES_DIR, dir));
+      .map((dir) => path.join(EXERCISES_DIR, dir))
+      .sort();
 
     if (DEBUG_MODE) {
-      console.log(`DEBUG: Limiting tests to ${DEBUG_LIMIT} for testing`);
-      exercises = exercises.slice(0, DEBUG_LIMIT);
+      console.log(`DEBUG: Skipping the first ${DEBUG_SKIP} exercises`);
+      exercises = exercises.slice(DEBUG_SKIP);
     } else if (numTests !== 'all') {
       const numTestsInt = Number.parseInt(numTests, 10);
       if (Number.isNaN(numTestsInt) || numTestsInt <= 0) {
@@ -53,57 +61,107 @@
     console.log(`Total exercises to run: ${exercises.length}`);
 
+    const reportDir = '/app/benchmark/reports';
+    if (!fs.existsSync(reportDir)) {
+      fs.mkdirSync(reportDir, { recursive: true });
+    }
+
+    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+    const reportFileName = `benchmark_report_${timestamp}.md`;
+    const reportPath = path.join(reportDir, reportFileName);
+
+    console.log(`Benchmark report will be saved as ${reportFileName}`);
+
     // Set up concurrent limit
     const limit = pLimit(concurrentWorkers);
 
-    // Run exercises concurrently
+    // Run exercises concurrently and write results incrementally
     spinner.text = 'Running exercises';
-    let completedExercises = 0;
-    const results: BenchmarkResult[] = await Promise.all(
-      exercises.map((exerciseDir) =>
-        limit(async () => {
-          const exerciseName = path.basename(exerciseDir);
-          spinner.text = `Running exercise: ${exerciseName}`;
-          const result = await runExercise(
-            exerciseDir,
-            model,
-            noPlan,
-            diffMode,
-          );
-          completedExercises++;
-          console.log(
-            `Completed ${completedExercises}/${exercises.length}: ${exerciseName}`,
-          );
-          return result;
-        }),
-      ),
+    const resultPromises = exercises.map((exerciseDir, index) =>
+      limit(async () => {
+        const exerciseName = path.basename(exerciseDir);
+        spinner.text = `Running exercise: ${exerciseName}`;
+        console.log(
+          `Starting exercise ${index + 1}/${exercises.length}: ${exerciseName}`,
+        );
+
+        const exercisePromise = runExercise(
+          exerciseDir,
+          model,
+          noPlan,
+          diffMode,
+        );
+        const timeoutPromise = setTimeout(
+          60000,
+          'Exercise execution timed out',
+        );
+
+        let result: BenchmarkResult;
+        try {
+          const raceResult = await Promise.race([
+            exercisePromise,
+            timeoutPromise,
+          ]);
+          if (typeof raceResult === 'string') {
+            throw new Error(raceResult);
+          }
+          result = raceResult;
+        } catch (error) {
+          console.error(`Error in exercise ${exerciseName}:`, error);
+          result = {
+            exercise: exerciseName,
+            time_taken: 60, // 1 minute timeout
+            total_cost: 0,
+            mode_used: diffMode ? 'diff' : 'whole',
+            model_used: model,
+            test_passed: false,
+            test_output: 'Exercise execution timed out or errored',
+            total_tests: 0,
+            passed_tests: 0,
+            failed_tests: [],
+            errors: [
+              error instanceof Error ? error.message : 'Unknown error occurred',
+            ],
+          };
+        }
+
+        // Write result to report file (use a lock here if necessary)
+        writeResultToReport(result, reportPath, index);
+
+        console.log(
+          `Completed ${index + 1}/${exercises.length}: ${exerciseName}`,
+        );
+        return result;
+      }),
     );
 
+    const results = await Promise.all(resultPromises);
+
     spinner.succeed('Benchmark completed');
 
-    // Calculate summary statistics
+    // Calculate summary
     const summary: SummaryStats = results.reduce(
       (acc, result) => {
         acc.totalTime += result.time_taken;
         acc.totalCost += result.total_cost;
         acc.passedTests += result.test_passed ? 1 : 0;
+        acc.totalTests += result.total_tests;
+        acc.totalPassedTests += result.passed_tests;
         return acc;
       },
-      { totalTime: 0, totalCost: 0, passedTests: 0 },
+      {
+        totalTime: 0,
+        totalCost: 0,
+        passedTests: 0,
+        totalTests: 0,
+        totalPassedTests: 0,
+      },
     );
 
-    // Generate and save markdown report with timestamp
-    const reportDir = '/app/benchmark/reports';
-    if (!fs.existsSync(reportDir)) {
-      fs.mkdirSync(reportDir, { recursive: true });
-    }
-
-    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
-    const reportFileName = `benchmark_report_${timestamp}.md`;
-    const reportPath = path.join(reportDir, reportFileName);
-
-    const markdownReport = generateMarkdownReport(results, summary);
-    fs.writeFileSync(reportPath, markdownReport);
+    // Generate and prepend summary to the report
+    const summaryMarkdown = generateSummaryMarkdown(results, summary);
+    const existingReport = fs.readFileSync(reportPath, 'utf8');
+    fs.writeFileSync(reportPath, summaryMarkdown + existingReport);
 
     console.log(`Benchmark report saved as ${reportFileName}`);
 
@@ -112,7 +170,10 @@
     console.log(`Total time: ${summary.totalTime.toFixed(2)} seconds`);
     console.log(`Total cost: $${summary.totalCost.toFixed(4)}`);
     console.log(
-      `Passed tests: ${summary.passedTests}/${results.length} (${((summary.passedTests / results.length) * 100).toFixed(2)}%)`,
+      `Passed exercises: ${summary.passedTests}/${results.length} (${((summary.passedTests / results.length) * 100).toFixed(2)}%)`,
+    );
+    console.log(
+      `Total tests passed: ${summary.totalPassedTests}/${summary.totalTests} (${((summary.totalPassedTests / summary.totalTests) * 100).toFixed(2)}%)`,
     );
 
     console.log('Benchmark process finished. Exiting.');
@@ -124,7 +185,48 @@
   }
 }
 
-function generateMarkdownReport(
+function writeResultToReport(
+  result: BenchmarkResult,
+  reportPath: string,
+  index: number,
+): void {
+  let markdown = '';
+  if (index === 0) {
+    markdown += '# CodeWhisper Benchmark Report\n\n';
+    markdown += '## Detailed Results\n\n';
+  }
+
+  markdown += `### ${index + 1}. ${result.exercise}\n\n`;
+  markdown += `- **Time taken:** ${result.time_taken.toFixed(2)} seconds\n`;
+  markdown += `- **Cost:** $${result.total_cost.toFixed(4)}\n`;
+  markdown += `- **Mode used:** ${result.mode_used}\n`;
+  markdown += `- **Model used:** ${result.model_used}\n`;
+  const exerciseTestPassPercentage =
+    result.total_tests > 0
+      ? ((result.passed_tests / result.total_tests) * 100).toFixed(2)
+      : '0.00';
+  markdown += `- **Tests passed:** ${result.passed_tests}/${result.total_tests} (${exerciseTestPassPercentage}%)\n`;
+
+  if (result.failed_tests.length > 0) {
+    markdown += '- **Failed tests:**\n';
+    for (const test of result.failed_tests) {
+      markdown += ` - ${test}\n`;
+    }
+  }
+
+  if (result.errors.length > 0) {
+    markdown += '- **Errors:**\n';
+    for (const error of result.errors) {
+      markdown += ` - ${error}\n`;
+    }
+  }
+
+  markdown += '\n';
+
+  fs.appendFileSync(reportPath, markdown);
+}
+
+function generateSummaryMarkdown(
   results: BenchmarkResult[],
   summary: SummaryStats,
 ): string {
@@ -133,34 +235,12 @@
   markdown += '## Summary\n\n';
   markdown += `- **Total time:** ${summary.totalTime.toFixed(2)} seconds\n`;
   markdown += `- **Total cost:** $${summary.totalCost.toFixed(4)}\n`;
-  markdown += `- **Passed exercises:** ${summary.passedTests}/${results.length} (${((summary.passedTests / results.length) * 100).toFixed(2)}%)\n\n`;
-
-  markdown += '## Detailed Results\n\n';
-
-  results.forEach((result, index) => {
-    markdown += `### ${index + 1}. ${result.exercise}\n\n`;
-    markdown += `- **Time taken:** ${result.time_taken.toFixed(2)} seconds\n`;
-    markdown += `- **Cost:** $${result.total_cost.toFixed(4)}\n`;
-    markdown += `- **Mode used:** ${result.mode_used}\n`;
-    markdown += `- **Model used:** ${result.model_used}\n`;
-    markdown += `- **Tests passed:** ${result.passed_tests}/${result.total_tests}\n`;
-
-    if (result.failed_tests.length > 0) {
-      markdown += '- **Failed tests:**\n';
-      for (const test of result.failed_tests) {
-        markdown += ` - ${test}\n`;
-      }
-    }
-
-    if (result.errors.length > 0) {
-      markdown += '- **Errors:**\n';
-      for (const error of result.errors) {
-        markdown += ` - ${error}\n`;
-      }
-    }
-
-    markdown += '\n';
-  });
+  markdown += `- **Passed exercises:** ${summary.passedTests}/${results.length} (${((summary.passedTests / results.length) * 100).toFixed(2)}%)\n`;
+  const testPassPercentage =
+    summary.totalTests > 0
+      ? ((summary.totalPassedTests / summary.totalTests) * 100).toFixed(2)
+      : '0.00';
+  markdown += `- **Total tests passed:** ${summary.totalPassedTests}/${summary.totalTests} (${testPassPercentage}%)\n\n`;
 
   return markdown;
 }
@@ -170,3 +250,9 @@ main().catch((error) => {
   console.error('Unhandled error in main:', error);
   process.exit(1);
 });
+
+process.on('unhandledRejection', (reason, promise) => {
+  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
+  // Optionally exit the process
+  // process.exit(1);
+});
diff --git a/benchmark/types.ts b/benchmark/types.ts
index 73ae160..e1350ea 100644
--- a/benchmark/types.ts
+++ b/benchmark/types.ts
@@ -19,8 +19,10 @@ export interface BenchmarkResult {
   errors: string[];
 }
 
-export interface SummaryStats {
+export type SummaryStats = {
   totalTime: number;
   totalCost: number;
   passedTests: number;
-}
+  totalTests: number;
+  totalPassedTests: number;
+};
diff --git a/benchmark/utils.ts b/benchmark/utils.ts
index 11da78f..533a4e5 100644
--- a/benchmark/utils.ts
+++ b/benchmark/utils.ts
@@ -1,6 +1,7 @@
 import { exec } from 'node:child_process';
 import * as fs from 'node:fs';
 import * as path from 'node:path';
+import { setTimeout } from 'node:timers/promises';
 import { promisify } from 'node:util';
 import type { BenchmarkResult, CodeWhisperResult } from './types';
 
@@ -38,15 +39,17 @@ export async function runCodeWhisper(
   const solutionFile = path.join(exerciseDir, config.files.solution[0]);
   const testFile = path.join(exerciseDir, config.files.test[0]);
+  // const introductionFile = path.join(exerciseDir, '.docs', 'introduction.md');
   const instructionsFile = path.join(exerciseDir, '.docs', 'instructions.md');
 
   // Use relative paths for the -f option
   const relSolutionFile = path.relative(exerciseDir, solutionFile);
   const relTestFile = path.relative(exerciseDir, testFile);
+  // const relIntroductionFile = path.relative(exerciseDir, introductionFile);
   const relInstructionsFile = path.relative(exerciseDir, instructionsFile);
 
   const planFlag = noPlan ? '--no-plan' : '--accept-plan';
-  const cmd = `node /app/dist/cli/index.js task -t "Solve the following problem" --description "Solve the problem described in the instructions.md file by editing the file ${relSolutionFile}. Ensure the solution passes the tests in ${relTestFile}." -i " " --skip-files ${planFlag} --model "${model}" --path "${exerciseDir}" ${diffMode} -f "${relSolutionFile}" "${relTestFile}" "${relInstructionsFile}"`;
+  const cmd = `node /app/dist/cli/index.js task -t "Complete the following task" --description "Complete the task described in the instructions.md file by modifying the file ${relSolutionFile}. Ensure the solution passes the tests in ${relTestFile}." -i "Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc. Only use standard python libraries, don't suggest installing any packages. The test file that is provided is 100% correct and will pass if the solution is correct." --skip-files ${planFlag} --model "${model}" --path "${exerciseDir}" ${diffMode} -f "${relSolutionFile}" "${relTestFile}" "${relInstructionsFile}" --log-ai-interactions`;
 
   const startTime = Date.now();
   const { stdout } = await execAsync(cmd);
@@ -90,54 +93,68 @@ export async function runTests(testFile: string): Promise<{
   passed_tests: number;
   failed_tests: string[];
 }> {
+  const testFileName = path.basename(testFile);
+  console.log(`Running tests for ${testFileName}`);
+
   try {
     const testDir = path.dirname(testFile);
     const { stdout, stderr } = await execAsync(
-      `python3 -m unittest ${path.basename(testFile)}`,
+      `python3 -m unittest ${testFileName}`,
       { cwd: testDir },
     );
     const output = stdout + stderr;
-
-    const totalTests = Number.parseInt(
-      output.match(/Ran (\d+) test/)?.[1] || '0',
-    );
-    const failedTests =
-      output.match(/FAIL: (test_\w+)/g)?.map((match) => match.split(': ')[1]) ||
-      [];
-    const passedTests = totalTests - failedTests.length;
-
-    // Create a concise output
-    const conciseOutput =
-      '.'.repeat(passedTests) + 'F'.repeat(failedTests.length);
-
-    return {
-      passed: failedTests.length === 0,
-      output: conciseOutput,
-      total_tests: totalTests,
-      passed_tests: passedTests,
-      failed_tests: failedTests,
-    };
+    return parseTestOutput(output, testFileName);
   } catch (error) {
+    console.error(`Error running tests for ${testFileName}:`, error);
     if (error instanceof Error && 'stdout' in error && 'stderr' in error) {
       const output = (error.stdout as string) + (error.stderr as string);
-      return {
-        passed: false,
-        output: 'E',
-        total_tests: 0,
-        passed_tests: 0,
-        failed_tests: [],
-      };
+      return parseTestOutput(output, testFileName);
     }
     return {
       passed: false,
-      output: 'E',
+      output: error instanceof Error ? error.message : 'Unknown error occurred',
      total_tests: 0,
      passed_tests: 0,
-      failed_tests: [],
+      failed_tests: ['Error running tests'],
     };
   }
 }
 
+function parseTestOutput(
+  output: string,
+  testFileName: string,
+): {
+  passed: boolean;
+  output: string;
+  total_tests: number;
+  passed_tests: number;
+  failed_tests: string[];
+} {
+  // Count dots for passed tests and F's for failed tests
+  const passedCount = (output.match(/\./g) || []).length;
+  const failedCount = (output.match(/F/g) || []).length;
+  const totalTests = passedCount + failedCount;
+
+  // Extract failed test names
+  const failedTests = (output.match(/FAIL: (test_\w+)/g) || []).map(
+    (match) => match.split(': ')[1],
+  );
+
+  const passed = failedCount === 0;
+
+  console.log(
+    `Tests completed for ${testFileName}. Total: ${totalTests}, Passed: ${passedCount}, Failed: ${failedCount}`,
+  );
+
+  return {
+    passed,
+    output,
+    total_tests: totalTests,
+    passed_tests: passedCount,
+    failed_tests: failedTests,
+  };
+}
+
 export async function runExercise(
   exerciseDir: string,
   model: string,
@@ -147,40 +164,73 @@
   const exerciseName = path.basename(exerciseDir);
   console.log(`Starting exercise: ${exerciseName}`);
 
-  const configFile = path.join(exerciseDir, '.meta', 'config.json');
-  const config = JSON.parse(fs.readFileSync(configFile, 'utf-8'));
-
-  const testFile = path.join(exerciseDir, config.files.test[0]);
-
-  let codewhisperResult: CodeWhisperResult | undefined;
-  let codewhisperError: string | null = null;
-  try {
+  try {
+    // Read the config file to get the correct test file name
+    const configFile = path.join(exerciseDir, '.meta', 'config.json');
+    const config = JSON.parse(fs.readFileSync(configFile, 'utf-8'));
+    const testFileName = config.files.test[0]; // Get the first test file name
+
+    // Run CodeWhisper with a timeout
     const codewhisperPromise = runCodeWhisper(
       exerciseDir,
       model,
       noPlan,
       diffMode,
     );
-    const timeoutPromise = new Promise((_, reject) =>
-      setTimeout(
-        () => reject(new Error('CodeWhisper execution timed out')),
-        600000,
-      ), // 10 minutes timeout
-    );
-    codewhisperResult = await Promise.race([
+    const timeoutPromise = setTimeout(60000, 'CodeWhisper execution timed out');
+
+    const codewhisperResult = await Promise.race([
      codewhisperPromise,
      timeoutPromise,
     ]);
-  } catch (error) {
-    codewhisperError =
-      error instanceof Error ? error.message : 'Unknown error occurred';
-    console.error(`Error in runCodeWhisper for ${exerciseName}:`, error);
-  }
 
-  if (codewhisperError || !codewhisperResult) {
-    console.log(`Exercise ${exerciseName} failed during CodeWhisper execution`);
+    if (codewhisperResult === 'CodeWhisper execution timed out') {
+      console.log(
+        `CodeWhisper execution for ${exerciseName} timed out after 1 minute`,
+      );
+      return {
+        exercise: exerciseName,
+        time_taken: 60,
+        total_cost: 0,
+        mode_used: diffMode ? 'diff' : 'whole',
+        model_used: model,
+        test_passed: false,
+        test_output: 'CodeWhisper execution timed out',
+        total_tests: 0,
+        passed_tests: 0,
+        failed_tests: [],
+        errors: ['CodeWhisper execution timed out after 1 minute'],
+      };
+    }
+
+    console.log(
+      `CodeWhisper execution for ${exerciseName} completed. Running tests.`,
+    );
+
+    // Run tests
+    const testFile = path.join(exerciseDir, testFileName);
+    const testResult = await runTests(testFile);
+
+    console.log(
+      `Tests for ${exerciseName} completed. Result: ${testResult.passed ? 'PASSED' : 'FAILED'}`,
+    );
+
+    // Calculate metrics
+    return {
+      exercise: exerciseName,
+      time_taken: (codewhisperResult as CodeWhisperResult).time / 1000, // Convert to seconds
+      total_cost: (codewhisperResult as CodeWhisperResult).totalCost,
+      mode_used: (codewhisperResult as CodeWhisperResult).modeUsed,
+      model_used: model,
+      test_passed: testResult.passed,
+      test_output: testResult.output,
+      total_tests: testResult.total_tests,
+      passed_tests: testResult.passed_tests,
+      failed_tests: testResult.failed_tests,
+      errors: [],
+    };
+  } catch (error) {
+    console.error(`Error in exercise ${exerciseName}:`, error);
     return {
       exercise: exerciseName,
       time_taken: 0,
@@ -192,30 +242,9 @@ export async function runExercise(
       total_tests: 0,
       passed_tests: 0,
       failed_tests: [],
-      errors: codewhisperError
-        ? [codewhisperError]
-        : ['CodeWhisper execution failed'],
+      errors: [
+        error instanceof Error ? error.message : 'Unknown error occurred',
+      ],
     };
   }
-
-  // Run tests
-  console.log(`Running tests for ${exerciseName}`);
-  const testResult = await runTests(testFile);
-
-  console.log(`Completed exercise: ${exerciseName}`);
-
-  // Calculate metrics
-  return {
-    exercise: exerciseName,
-    time_taken: codewhisperResult.time / 1000, // Convert to seconds
-    total_cost: codewhisperResult.totalCost,
-    mode_used: codewhisperResult.modeUsed,
-    model_used: model,
-    test_passed: testResult.passed,
-    test_output: testResult.output,
-    total_tests: testResult.total_tests,
-    passed_tests: testResult.passed_tests,
-    failed_tests: testResult.failed_tests,
-    errors: [],
-  };
 }
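Both `benchmark.ts` and `utils.ts` above lean on the same timeout idiom: the promisified `setTimeout` from `node:timers/promises` resolves to whatever value is passed as its second argument, so racing it against the real work and checking for a string tells the caller whether the timer won. A self-contained sketch of that pattern, with a hypothetical `doWork` standing in for `runExercise`/`runCodeWhisper`:

```ts
import { setTimeout as delay } from 'node:timers/promises';

// Hypothetical stand-in for the real work (runExercise / runCodeWhisper).
async function doWork(): Promise<{ ok: boolean }> {
  await delay(500);
  return { ok: true };
}

async function runWithTimeout(timeoutMs: number): Promise<{ ok: boolean }> {
  // delay(ms, value) resolves to `value` after `ms` milliseconds,
  // so a string race result means the timer finished first.
  const raceResult = await Promise.race([
    doWork(),
    delay(timeoutMs, 'execution timed out'),
  ]);

  if (typeof raceResult === 'string') {
    throw new Error(raceResult); // timed out
  }
  return raceResult;
}

runWithTimeout(60_000).then(console.log).catch(console.error);
```

Note that, as in the patch, the losing promise is not cancelled: the timer keeps running (and keeps the event loop alive) until it fires, unless an `AbortSignal` or `{ ref: false }` option is passed to the promisified `setTimeout`.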