From 22362a797cbf938d5b823476a9304d95009e0d9e Mon Sep 17 00:00:00 2001 From: Zheng0428 <1219979760@qq.com> Date: Sat, 4 May 2024 22:20:33 +0800 Subject: [PATCH] add page --- index.html | 1478 ++ static/.DS_Store | Bin 0 -> 6148 bytes static/css/bulma-carousel.min.css | 1 + static/css/bulma-slider.min.css | 1 + static/css/bulma.css.map.txt | 1 + static/css/bulma.min.css | 1 + static/css/fontawesome.all.min.css | 5 + static/css/index.css | 146 + static/css/jquery.dataTables.css | 459 + static/images/Figure_1.jpg | Bin 0 -> 217663 bytes static/images/ablation_results.png | Bin 0 -> 469932 bytes static/images/combined.jpg | Bin 0 -> 258108 bytes static/images/example.jpg | Bin 0 -> 474100 bytes static/images/mammoth_github.png | Bin 0 -> 552574 bytes static/images/mammoth_icon.png | Bin 0 -> 4057 bytes static/images/overall_results.png | Bin 0 -> 167475 bytes static/images/presentation.pptx | Bin 0 -> 845541 bytes static/images/teaser.jpg | Bin 0 -> 299670 bytes static/images/teaser_front.jpg | Bin 0 -> 234377 bytes static/images/url.jpg | Bin 0 -> 318121 bytes static/images/xiang.pptx | Bin 0 -> 1117684 bytes .../Figure_1.jpg" | Bin 0 -> 217663 bytes .../combined.jpg" | Bin 0 -> 258108 bytes .../example.jpg" | Bin 0 -> 474100 bytes .../teaser.jpg" | Bin 0 -> 299670 bytes .../teaser_front.jpg" | Bin 0 -> 234377 bytes .../url.jpg" | Bin 0 -> 318121 bytes static/js/bulma-carousel.js | 2371 +++ static/js/bulma-carousel.min.js | 1 + static/js/bulma-slider.js | 461 + static/js/bulma-slider.min.js | 1 + static/js/fontawesome.all.min.js | 5 + static/js/index.js | 49 + static/js/jquery-3.5.1.js | 10872 +++++++++++ static/js/jquery.dataTables.js | 15381 ++++++++++++++++ static/js/jquery.min.js | 2 + static/js/main.js | 2 + 37 files changed, 31237 insertions(+) create mode 100644 index.html create mode 100644 static/.DS_Store create mode 100644 static/css/bulma-carousel.min.css create mode 100644 static/css/bulma-slider.min.css create mode 100644 static/css/bulma.css.map.txt create mode 100644 static/css/bulma.min.css create mode 100644 static/css/fontawesome.all.min.css create mode 100644 static/css/index.css create mode 100644 static/css/jquery.dataTables.css create mode 100644 static/images/Figure_1.jpg create mode 100644 static/images/ablation_results.png create mode 100644 static/images/combined.jpg create mode 100644 static/images/example.jpg create mode 100644 static/images/mammoth_github.png create mode 100644 static/images/mammoth_icon.png create mode 100644 static/images/overall_results.png create mode 100644 static/images/presentation.pptx create mode 100644 static/images/teaser.jpg create mode 100644 static/images/teaser_front.jpg create mode 100644 static/images/url.jpg create mode 100644 static/images/xiang.pptx create mode 100644 "static/images/\350\275\254\346\215\242\345\233\276\345\203\217_6_1.8MB_imagesTool.com_82165/Figure_1.jpg" create mode 100644 "static/images/\350\275\254\346\215\242\345\233\276\345\203\217_6_1.8MB_imagesTool.com_82165/combined.jpg" create mode 100644 "static/images/\350\275\254\346\215\242\345\233\276\345\203\217_6_1.8MB_imagesTool.com_82165/example.jpg" create mode 100644 "static/images/\350\275\254\346\215\242\345\233\276\345\203\217_6_1.8MB_imagesTool.com_82165/teaser.jpg" create mode 100644 "static/images/\350\275\254\346\215\242\345\233\276\345\203\217_6_1.8MB_imagesTool.com_82165/teaser_front.jpg" create mode 100644 "static/images/\350\275\254\346\215\242\345\233\276\345\203\217_6_1.8MB_imagesTool.com_82165/url.jpg" create mode 100644 
static/js/bulma-carousel.js create mode 100644 static/js/bulma-carousel.min.js create mode 100644 static/js/bulma-slider.js create mode 100644 static/js/bulma-slider.min.js create mode 100644 static/js/fontawesome.all.min.js create mode 100644 static/js/index.js create mode 100644 static/js/jquery-3.5.1.js create mode 100644 static/js/jquery.dataTables.js create mode 100644 static/js/jquery.min.js create mode 100644 static/js/main.js diff --git a/index.html b/index.html new file mode 100644 index 0000000..cfb9dc8 --- /dev/null +++ b/index.html @@ -0,0 +1,1478 @@ + + + +
+ Instruction tuning improves the reasoning abilities of large language models (LLMs), with data quality and scalability being the crucial factors. Most instruction tuning data come from human crowd-sourcing or GPT-4 distillation. We propose a paradigm to efficiently harvest 10 million naturally existing instruction data from the pre-training web corpus to enhance LLM reasoning. Our approach involves (1) recalling relevant documents, (2) extracting instruction-response pairs, and (3) refining the extracted pairs using open-source LLMs. Fine-tuning base LLMs on this dataset, we build MAmmoTH2 models, which significantly boost performance on reasoning benchmarks. Notably, MAmmoTH2-7B’s (Mistral) performance increases from 11% to 34% on MATH and from 36% to 67% on GSM8K without training on any in-domain data. Further training MAmmoTH2 on public instruction tuning datasets yields MAmmoTH2-Plus, achieving state-of-the-art performance on several reasoning and chatbot benchmarks. Our work demonstrates how to harvest large-scale, high-quality instruction data without costly human annotation or GPT-4 distillation, providing a new perspective on building better instruction tuning data.
+ +
+
+
+
+ Figure 1: Overview of MAmmoTH2-Plus results. The MAmmoTH2-8x7B-Plus variant outperforms Mixtral-Instruct on reasoning benchmarks, matching Qwen-1.5-110B with only 13% active parameters. It also surpasses Mixtral-Instruct by around 10 points on general code and chatbot benchmarks.
+
+
+
+ Reasoning is a fundamental aspect of human cognition and problem-solving [Clark et al., 2018, +Hendrycks et al., 2021a, Cobbe et al., 2021, Hendrycks et al., 2021b, Rein et al., 2023]. Proficiency in +reasoning is essential for advancing scientific knowledge, developing new technologies, and making +informed decisions in various contexts. Recently, large language models (LLMs) [Brown et al., 2020, +Ouyang et al., 2022, Touvron et al., 2023a,b, Achiam et al., 2023, Team et al., 2023] have shown +remarkable progress in various NLP tasks. However, their ability to perform complex reasoning +tasks [Lin et al., 2024] in the domains of mathematics, science, and engineering is still limited. + +
+
+
+
+
+ Figure 2: Comparison between our dataset curation method and previous studies.
+
+
+
+ Recent studies have extensively explored different approaches to enhance LLMs’ reasoning abilities after pre-training. The two main approaches are continued training and instruction tuning. Continued training focuses on training LLMs with language model loss on large-scale math or science-centric documents [Lewkowycz et al., 2022, Taylor et al., 2022, Azerbayev et al., 2023, Shao et al., 2024, Ying et al., 2024] recalled from the general web corpus. Instruction tuning, on the other hand, seeks to employ supervised fine-tuning loss on curated high-quality instruction-response pairs [Ouyang et al., 2022, Chung et al., 2024]. While human-annotated instruction datasets [Cobbe et al., 2021, Hendrycks et al., 2021b, Amini et al., 2019] are often limited in scale, recent studies [Yu et al., 2023, Yue et al., 2023, Tosh et al., 2024, Li et al., 2024, Tang et al., 2024] attempt to prompt state-of-the-art models like GPT-4 with seed data to synthesize large-scale instruction datasets. However, the synthesized instruction data becomes highly biased towards the seed data distribution and is prone to a high degree of hallucination.
+
+To address these limitations, we propose to discover naturally existing instruction data from +the web. We argue that the web corpus (e.g., Common Crawl) already contains a vast amount of +high-quality instruction data that can enhance LLM reasoning. For example, textbooks, exams, +educational resources, and forums on the web contain a wealth of instruction-following pairs across +various domains like math, science, engineering, and humanities. We believe such readily available +instruction data is not only diverse but also of high quality. However, such instruction data is highly +dispersed across the corpus, which makes it particularly challenging to discover.
+
+In this paper, we aim to mine these instruction-response pairs from the web using a three-step pipeline. (1) Recall step: We create a diverse seed dataset by crawling several quiz websites. We use this seed data to train a fastText model [Joulin et al., 2016] and employ it to recall documents from Common Crawl [Computer, 2023]. GPT-4 is used to trim down the recalled documents by their root URL. We obtain 18M documents through this step. (2) Extract step: We utilize open-source LLMs like Mixtral [Jiang et al., 2024] to extract Q-A pairs from these documents, producing roughly 5M candidate Q-A pairs. (3) Refine step: After extraction, we further employ Mixtral-8×7B [Jiang et al., 2024] and Qwen-72B [Bai et al., 2023] to refine [Zheng et al., 2024b] these candidate Q-A pairs. This refinement operation aims to remove unrelated content, fix formatting, and add missing explanations to the candidate Q-A pairs, and it is pivotal to maintaining the quality of the mined Q-A pairs. Eventually, we harvest a total of 10M instruction-response pairs through these steps. Unlike existing instruction-tuning datasets, our dataset WEBINSTRUCT is purely mined from the web without any human crowdsourcing or GPT-4 distillation. A high-level sketch of this pipeline is shown below.
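+ The sketch below summarizes the recall, extract, and refine flow. It is a hypothetical illustration: the wrapper objects and their methods (fasttext_model.score, extractor_llm.extract_qa, refiner_llm.refine) and the relevance threshold are assumptions, not the released pipeline code.

```python
# A minimal, hypothetical sketch of the recall -> extract -> refine pipeline.
def build_webinstruct(common_crawl_docs, fasttext_model, extractor_llm, refiner_llm):
    qa_pairs = []
    for doc in common_crawl_docs:
        # (1) Recall: keep only documents the fastText classifier scores as relevant.
        if fasttext_model.score(doc) < 0.5:  # hypothetical relevance threshold
            continue
        # (2) Extract: an open-source LLM (e.g. Mixtral) returns candidate Q-A pairs,
        #     or an empty list if the document has no naturally occurring Q-A content.
        for question, answer in extractor_llm.extract_qa(doc):
            # (3) Refine: remove unrelated content, fix formatting, and add missing
            #     explanations (Mixtral-8x7B / Qwen-72B in the described setup).
            qa_pairs.append(refiner_llm.refine(question, answer))
    return qa_pairs
```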
+
+To validate the effectiveness of our WEBINSTRUCT, we train MAmmoTH2, a family of open-source language models trained from a variety of base models, including Mistral-7B, Llama3-8B, Mixtral-8×7B, and Yi-34B. Our experiments demonstrate the effectiveness of scaling up instruction tuning on science reasoning tasks. On a range of held-out reasoning benchmarks including TheoremQA, GSM8K, MATH, ARC-C, MMLU-STEM, GPQA, and BBH, MAmmoTH2 exhibits remarkably better reasoning abilities than the base models. For example, MAmmoTH2-7B boosts the performance of Mistral-7B by 5-31% across different datasets, while MAmmoTH2-34B boosts the Yi-34B base model by 6-24% across different datasets. Notably, WEBINSTRUCT does not contain any data from any of the evaluation benchmarks, highlighting the strong generalization of the instruction-tuned models. These findings reveal new perspectives on how to construct high-quality instruction datasets at scale without requiring crowdsourcing or distillation.
+
+
+
+
+ Figure 3: Step 1: Recall relevant documents from Common Crawl. Step 2: Extract Q-A pairs.
+Step 3: Refine the extracted Q-A pairs.
+
+
+
+
+
+
+ Figure 4: An illustrative example from WEBINSTRUCT for the extraction and refinement steps.
+
+
+
+ To further enhance the performance of MAmmoTH2 on more general code generation, math reasoning, and instruction-following tasks, we continue to tune it on several open-source instruction datasets, including OpenHermes2.5, Code-Feedback, and Math-plus. The further enhanced model is named MAmmoTH2-Plus, which exhibits strong performance across these general tasks. For example, MAmmoTH2-7B-Plus achieves top performance on the HumanEval and MBPP code generation benchmarks, and MAmmoTH2-8×7B-Plus leads the AlpacaEval 2.0 and Arena Hard leaderboards for general language understanding and instruction-following. The consistent gain of MAmmoTH2-Plus over the official instruct models (e.g., Mixtral-Instruct-v0.1) further validates the effectiveness of our dataset and training approach.
+ + +
+ In this section, we outline the process of constructing WEBINSTRUCT from the web corpus. Specifically, we divide the data collection pipeline into three stages: (1) high-quality data recall from the web corpus, (2) Q-A pair extraction, and (3) Q-A pair refinement. We depict the full pipeline in Figure 3 and provide an example for extraction and refinement in Figure 4.
+ Unlike previous math-centric approaches [Paster et al., 2023, Wang et al., 2023c, Shao et al., 2024], we aim for broad coverage of disciplines such as math, science, and engineering. Therefore, we need to carefully balance the seed data to ensure diversity. However, the publicly available training datasets are mostly limited to mathematics. To address this issue, we propose to crawl new exam problems from several educational websites like stemez.com, homeworkstudy.com, and khanacademy.org. These sites contain diverse problems from different disciplines to help us ensure diversity. We crawled 100K seed documents as positive training examples and randomly selected 100K negative documents from CC [Computer, 2023] to train a fastText model [Joulin et al., 2016], which we use to recall relevant documents. We employ the open-source fastText library with a vector dimension of 256 and train the model for 3 epochs, where the learning rate is 0.1, the maximum length of n-grams is 3, and the maximum number of word occurrences is 3. In the first stage, the trained fastText model recalls the top 100B documents from CC. The documents are grouped by their domains (root URLs), and we only retain the domains with enough documents. We then prompt GPT-4 to scan through the domains and automatically select the ones that might contain instruction data; we use in-context learning and find that GPT-4 achieves an adequate success rate. Then, we sample documents from the selected domains as positives and documents from the non-selected domains and general Common Crawl as negatives to re-train a better fastText classifier. The newly trained fastText classifier is used to recall the top 18M documents. A minimal sketch of the classifier training step is shown below.
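+ The following is a minimal sketch of training the recall classifier with the hyperparameters stated above, assuming the standard open-source fasttext Python bindings; the file path and the mapping of the word-occurrence setting to minCount are assumptions.

```python
import fasttext

# seed_train.txt (hypothetical path): one document per line, prefixed with
# __label__positive (crawled quiz/exam pages) or __label__negative (random CC docs).
model = fasttext.train_supervised(
    input="seed_train.txt",
    dim=256,        # vector dimension of 256
    epoch=3,        # train for 3 epochs
    lr=0.1,         # learning rate 0.1
    wordNgrams=3,   # maximum n-gram length of 3
    minCount=3,     # word-occurrence cutoff of 3 (our reading of the stated setting)
)

# Score a candidate Common Crawl document; high-scoring documents are recalled.
labels, probs = model.predict("Question: What is the derivative of x^2? Answer: 2x.")
print(labels[0], probs[0])
```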
+
+
+
+
+ Figure 5: The distribution of the top 25 URLs among the 81 total URLs in our instruction dataset.
+Most of the instruction data comes from tutoring websites, forums, and homework websites.
+
+
+
+ The recalled documents come from very diverse domains, including forums and homework, quiz, and exam websites. We observe that these documents contain a significant number of naturally existing Q-A pairs. However, these Q-A pairs are interwoven with a high volume of noise such as ads, markup, and boilerplate.
+ We first write a pre-processing function to pre-extract useful content from the recalled documents by parsing their HTML and stripping off unrelated content such as site information, ads, and HTML boilerplate. This step significantly shortens the documents for the next step. We then prompt Mixtral-8×7B [Jiang et al., 2024] to identify the text spans (beginning and end) of question-answer pairs. Specifically, we provide five in-context examples to guide the model through the extraction, and we allow the model to return void if a document contains no natural question-answer pairs. In this stage, only 30% of the recalled documents were identified as containing naturally existing Q-A pairs, which leads to roughly 5M candidate Q-A pairs. However, these candidates still contain unrelated content and formatting issues, and a large portion of the extracted Q-A pairs do not contain any explanation of how the answer is derived. Therefore, we perform another round of refinement to increase the data quality. A sketch of the HTML cleaning step is shown below.
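+ Below is a minimal sketch of the HTML pre-cleaning step, assuming a standard BeautifulSoup-based approach; the exact pre-processing functions used for WEBINSTRUCT are not spelled out here, so the tag list and helper name are illustrative.

```python
from bs4 import BeautifulSoup

def pre_extract(html: str) -> str:
    """Strip scripts, styles, navigation and other boilerplate, keeping visible text."""
    soup = BeautifulSoup(html, "html.parser")
    # Remove tags that rarely contain question-answer content (illustrative list).
    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
        tag.decompose()
    # Collapse the remaining visible text into non-empty, newline-separated lines.
    lines = (line.strip() for line in soup.get_text("\n").splitlines())
    return "\n".join(line for line in lines if line)

# The cleaned text is then passed to the extractor LLM (Mixtral-8x7B with five
# in-context examples) to mark the spans of naturally occurring Q-A pairs.
```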
+
+ To avoid contamination, we follow previous work [Shao et al., 2024] and filter out web pages containing questions or answers from any of our evaluation benchmarks. Specifically, we filter out all web pages that contain an n-gram (n = 10) string match with either the benchmark questions or answers. A minimal sketch of this check is shown below.
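+ The check can be implemented as a simple 10-gram overlap test, as sketched below; whitespace tokenization and the way benchmark strings are loaded are assumptions made for illustration.

```python
def ngrams(text: str, n: int = 10) -> set:
    """All n-grams of a text under simple whitespace tokenization."""
    tokens = text.lower().split()
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def is_contaminated(page_text: str, benchmark_strings: list, n: int = 10) -> bool:
    """True if the page shares any 10-gram with a benchmark question or answer."""
    page_grams = ngrams(page_text, n)
    return any(page_grams & ngrams(s, n) for s in benchmark_strings)

# Pages flagged by is_contaminated(...) are removed from the dataset.
```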
+
+ Overall, we can see that MAmmoTH and MAmmoTH-Coder outperform the SoTA models at different scales. In general, the performance gain on OOD datasets is more significant than on IND datasets. These results show the potential of our models as mathematical generalists. On several datasets, MAmmoTH-Coder-34B and MAmmoTH-70B even surpass closed-source LLMs (see more detailed breakdown results below).
+ To better understand which factors contribute to the large gain of 🦣MAmmoTH over existing baselines, we set up a group of controlled experiments in Figure 3. We study the following setups:
Dataset | +#Pairs | +Domain | +Format | +Dataset Source | +
---|---|---|---|---|
FLAN V2 | +100K | +General | +SFT | +NLP data + Human CoT | +
Self-Instruct | +82K | +General | +SFT | +Generated by GPT3 | +
GPT4-Alpaca | +52K | +General | +SFT | +Generated by GPT4 | +
SuperNI | +96K | +General | +SFT | +NLP Datasets | +
Tora | +16K | +Math | +SFT | +GPT4 GSM+MATH Synthesis | +
WizardMath | +96K | +Math | +SFT | +GPT4 GSM+MATH Synthesis | +
MathInstruct | +262K | +Math | +SFT | +GPT4 Math datasets Synthesis | +
MetaMathQA | +395K | +Math | +SFT | +GPT4 GSM+MATH Synthesis | +
XwinMath | +1.4M | +Math | +SFT | +GPT4 GSM+MATH Synthesis | +
OpenMathInstruct | +1.8M | +Math | +SFT | +GPT4 GSM+MATH Synthesis | +
Dataset | +#Tokens | +Domain | +Format | +Dataset Source | +
---|---|---|---|---|
OpenWebMath | +12B | +Math | +LM | +Filtered from Web | +
MathPile | +10B | +Math | +LM | +Filtered from Web | +
Cosmopedia | +25B | +General | +LM | +Synthesized by Mixtral | +
MINERVA | +38B | +Math | +LM | +Filtered from Web | +
Proof-Pile-2 | +55B | +Math | +LM | +OWM+Arxiv+Code | +
Galactica | +106B | +Math & Sci. | +LM | +Filtered from Web | +
DeepseekMath | +120B | +Math | +LM | +Recalled from Web | +
WEBINSTRUCT | +(10M) 5B | +Math & Sci. | +SFT | +Recalled and Extracted from Web | +
Model | +TheoremQA | +MATH | +GSM8K | +GPQA | +MMLU-ST | +BBH | +ARC-C | +Avg | +
---|---|---|---|---|---|---|---|---|
GPT-4-Turbo-0409 | +48.4 | +69.2 | +94.5 | +46.2 | +76.5 | +86.7 | +93.6 | +73.6 | +
Parameter Size between 20B and 110B | +||||||||
Qwen-1.5-110B | +34.9 | +49.6 | +85.4 | +35.9 | +73.4 | +74.8 | +91.6 | +63.6 | +
Qwen-1.5-72B | +29.3 | +46.8 | +77.6 | +36.3 | +68.5 | +68.0 | +92.2 | +59.8 | +
Deepseek-LM-67B | +25.3 | +15.9 | +66.5 | +31.8 | +57.4 | +71.7 | +86.8 | +50.7 | +
Yi-34B | +23.2 | +15.9 | +67.9 | +29.7 | +62.6 | +66.4 | +89.5 | +50.7 | +
Llemma-34B | +21.1 | +25.0 | +71.9 | +29.2 | +54.7 | +48.4 | +69.5 | +45.7 | +
Mixtral-8×7B | +23.2 | +28.4 | +74.4 | +29.7 | +59.7 | +66.8 | +84.7 | +52.4 | +
Mixtral-8×7B-Instruct | +25.3 | +22.1 | +71.7 | +32.4 | +61.4 | +57.3 | +84.7 | +50.7 | +
Intern-Math-20B | +17.1 | +37.7 | +82.9 | +28.9 | +50.1 | +39.3 | +68.6 | +46.4 | +
Trained only with WEBINSTRUCT (All evaluations are held-out) | +||||||||
MAmmoTH2-34B | +30.4 | +35.0 | +75.6 | +31.8 | +64.5 | +68.0 | +90.0 | +56.4 | +
Δ over Yi-34B | ++7.2 | ++19.1 | ++7.7 | ++2.1 | ++2.9 | ++1.2 | ++0.5 | ++5.8 | +
MAmmoTH2-8x7B | +32.2 | +39.0 | +75.4 | +36.8 | +67.4 | +71.1 | +87.5 | +58.9 | +
Δ over Mixtral-8×7B | ++9.2 | ++10.6 | ++1.0 | ++7.1 | ++7.4 | ++3.3 | ++2.8 | ++6.5 | +
Continued training with additional instruction datasets (All held-out except MATH and GSM8K) | +||||||||
MAmmoTH2-8x7B-Plus | +34.1 | +47.0 | +86.4 | +37.4 | +72.4 | +74.1 | +88.4 | +62.8 | +
Δ over Qwen-1.5-110B | +-0.8 | +-2.6 | ++1.0 | ++2.5 | +-1.0 | +-0.7 | +-4.0 | +-0.8 | +
Parameter Size = 7B or 8B | +||||||||
Deepseek-7B | +15.7 | +6.4 | +17.4 | +25.7 | +43.1 | +42.8 | +47.8 | +28.4 | +
Qwen-1.5-7B | +14.2 | +13.3 | +54.1 | +26.7 | +45.4 | +45.2 | +75.6 | +39.2 | +
Mistral-7B | +19.2 | +11.2 | +36.2 | +24.7 | +50.1 | +55.7 | +74.2 | +38.8 | +
Gemma-7B | +21.5 | +24.3 | +46.4 | +25.7 | +53.3 | +57.4 | +72.5 | +43.0 | +
Llemma-7B | +17.2 | +18.0 | +36.4 | +23.2 | +45.2 | +44.9 | +50.5 | +33.6 | +
WizardMath-7B-1.1 | +11.7 | +33.0 | +83.2 | +28.7 | +52.7 | +56.7 | +76.9 | +49.0 | +
OpenMath-Mistral | +13.1 | +9.1 | +24.5 | +26.5 | +43.7 | +49.5 | +69.4 | +33.7 | +
Abel-7B-002 | +19.3 | +29.5 | +83.2 | +30.3 | +29.7 | +32.7 | +72.5 | +42.5 | +
Intern-Math-7B | +13.2 | +34.6 | +78.1 | +22.7 | +41.1 | +48.1 | +59.8 | +42.5 | +
Rho-1-Math-7B | +21.0 | +31.0 | +66.9 | +29.2 | +53.1 | +57.7 | +72.7 | +47.3 | +
Deepseek-Math-7B | +25.3 | +34.0 | +64.2 | +29.2 | +56.4 | +59.5 | +67.8 | +48.0 | +
Deepseek-Math-Instruct | +23.7 | +44.3 | +82.9 | +31.8 | +59.3 | +55.4 | +70.1 | +52.5 | +
Llama-3-8B | +20.1 | +21.3 | +54.8 | +27.2 | +55.6 | +61.1 | +78.6 | +45.5 | +
Llama-3-8B-Instruct | +22.8 | +30.0 | +79.5 | +34.5 | +60.2 | +66.0 | +80.8 | +53.4 | +
Trained only with WEBINSTRUCT (All evaluations are held-out) | +||||||||
MAmmoTH2-7B | +26.7 | +34.2 | +67.4 | +34.8 | +60.6 | +60.0 | +81.8 | +52.2 | +
Δ over Mistral-7B | ++7.5 | ++23.0 | ++31.2 | ++10.1 | ++10.5 | ++4.3 | ++7.6 | ++13.4 | +
MAmmoTH2-8B | +29.7 | +33.4 | +67.9 | +38.4 | +61.0 | +60.8 | +81.0 | +53.1 | +
Δ over Llama-3-8B | ++9.6 | ++12.1 | ++13.1 | ++11.2 | ++5.4 | +-0.3 | ++2.4 | ++7.6 | +
Continued training with additional instruction datasets (All held-out except MATH and GSM8K) | +||||||||
MAmmoTH2-7B-Plus | +29.2 | +45.0 | +84.7 | +30.3 | +64.5 | +63.1 | +83.0 | +57.1 | +
MAmmoTH2-8B-Plus | +32.5 | +42.8 | +84.1 | +37.3 | +65.7 | +67.8 | +83.4 | +59.1 | +
Δ over best baseline | ++7.2 | ++0.7 | ++1.5 | ++2.8 | ++5.5 | ++1.8 | ++2.6 | ++5.7 | +
+ | HumanEval | +HumanEval+ | +MBPP | +MBPP+ | +Average | +Average+ | +
---|---|---|---|---|---|---|
Mistral-7B | +28.7 | +23.8 | +51.9 | +42.1 | +40.3 | +33.0 | +
Gemma-7B | +26.8 | +20.1 | +52.6 | +43.4 | +39.7 | +31.8 | +
Llama-3-8B | +33.5 | +29.3 | +61.4 | +51.6 | +47.5 | +40.5 | +
Gemma-1.1-7B-Instruct | +42.7 | +35.4 | +57.1 | +45.0 | +49.9 | +40.2 | +
Mistral-7B-Instruct-v0.2 | +75.0 | +70.1 | +44.7 | +37.0 | +59.9 | +53.6 | +
Llama-3-8B-Instruct | +61.6 | +56.7 | +70.1 | +59.3 | +65.9 | +58.0 | +
Mixtral-8×7B-Instruct-v0.1 | +45.1 | +39.6 | +59.5 | +49.7 | +52.3 | +44.7 | +
MAmmoTH2-7B-Plus | +72.1 | +65.9 | +60.1 | +50.4 | +66.1 | +58.2 | +
MAmmoTH2-8B-Plus | +63.4 | +57.9 | +60.4 | +48.6 | +61.9 | +53.3 | +
MAmmoTH2-8×7B-Plus | +57.9 | +53.7 | +68.7 | +56.9 | +63.3 | +55.3 | +
+ | MT-Bench | +AlpacaEval 2.0 | +Arena Hard | +MMLU | +
---|---|---|---|---|
GPT-4-1106-preview | +9.32 | +50.0 | +- | +- | +
GPT-3.5-Turbo-1106 | +8.32 | +19.3 | +18.9 | +- | +
GPT-3.5-Turbo-0301 | +7.94 | +18.1 | +18.1 | +70.0 | +
Tulu-2-DPO-70B | +7.89 | +21.2 | +15.0 | +67.8 | +
Llama-2-70b-chat | +6.86 | +14.7 | +11.6 | +63.0 | +
Yi-34B-Chat | +7.86 | +27.2 | +23.1 | +73.5 | +
Gemma-1.1-7B-Instruct | +- | +10.4 | +7.5 | +64.3 | +
Mistral-7B-Instruct-v0.2 | +7.60 | +17.1 | +12.6 | +60.8 | +
Llama-3-8B-Instruct | +8.02 | +22.9 | +20.6 | +67.2 | +
Mixtral-8×7B-Instruct-v0.1 | +8.30 | +23.7 | +23.4 | +70.6 | +
MAmmoTH2-7B-Plus | +7.88 | +23.4 | +14.6 | +63.3 | +
MAmmoTH2-8B-Plus | +7.95 | +18.5 | +16.6 | +64.6 | +
MAmmoTH2-8x7B-Plus | +8.20 | +33.8 | +32.6 | +68.3 | +