diff --git a/.github/workflows/validate-mcq-tags.yml b/.github/workflows/validate-mcq-tags.yml
new file mode 100644
index 00000000..e8dd845c
--- /dev/null
+++ b/.github/workflows/validate-mcq-tags.yml
@@ -0,0 +1,92 @@
+name: Validate MCQ Tags
+
+on:
+  pull_request:
+    paths:
+      - 'src/openbench/config.py'
+      - 'src/openbench/evals/**'
+      - 'src/openbench/utils/mcq.py'
+      - 'src/openbench/scorers/mcq.py'
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  validate-and-fix:
+    name: Validate and Fix MCQ Tags
+    runs-on: ubuntu-latest
+    # Only run on PRs from the same repository (not forks)
+    if: github.event.pull_request.head.repo.full_name == github.repository
+    steps:
+      - name: Checkout PR branch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          version: "latest"
+          enable-cache: true
+          cache-dependency-glob: |
+            **/pyproject.toml
+            **/uv.lock
+
+      - name: Set up Python
+        run: uv python install 3.12
+
+      - name: Install dependencies
+        run: uv sync
+
+      - name: Validate and auto-fix MCQ tags
+        id: validate
+        run: |
+          echo "Running MCQ tag validation with auto-fix..."
+          uv run python3 scripts/validate_mcq_tags.py --fix
+          echo "result=completed" >> "$GITHUB_OUTPUT"
+
+      - name: Check for changes
+        id: check_changes
+        run: |
+          if git diff --quiet src/openbench/config.py; then
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+            echo "No changes needed"
+          else
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+            echo "Changes applied to config.py"
+          fi
+
+      - name: Commit and push changes
+        if: steps.check_changes.outputs.changed == 'true'
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git add src/openbench/config.py
+          git commit -m "fix: auto-update 'mcq' tags [skip ci]"
+          git push
+
+      - name: Comment on PR with success
+        if: steps.check_changes.outputs.changed == 'true'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '✅ **MCQ Tags Auto-Fixed**\n\nThe `mcq` tags have been automatically updated in `config.py` to match the actual MCQ benchmark implementations detected by `is_mcq_task()`.\n\nThe changes have been committed to this PR.'
+            })
+
+      - name: Comment on PR with validation
+        if: steps.check_changes.outputs.changed == 'false'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '✅ **MCQ Tags Validated**\n\nAll MCQ benchmarks have correct `mcq` tags. No changes needed.'
+            })
diff --git a/docs/snippets/benchmarks.data.mdx b/docs/snippets/benchmarks.data.mdx
index 1914d424..55803cb7 100644
--- a/docs/snippets/benchmarks.data.mdx
+++ b/docs/snippets/benchmarks.data.mdx
@@ -4,7 +4,7 @@ export const benchmarksData = [
     "description": "Human-centric benchmark with 17 official qualifying exam questions testing general cognitive abilities",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "academic-exams",
       "reasoning",
       "cognitive-abilities"
@@ -17,7 +17,7 @@ export const benchmarksData = [
     "description": "Algebraic question answering and reasoning",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "algebra",
       "reasoning",
       "math",
@@ -31,7 +31,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - Biology",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "biology",
       "science",
@@ -45,7 +45,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - Chemistry",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "chemistry",
       "science",
@@ -59,7 +59,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - Chinese language",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "chinese",
       "language",
@@ -73,7 +73,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - English",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "english",
       "language",
@@ -87,7 +87,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - Geography",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "geography",
       "social-studies",
@@ -101,7 +101,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - History",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "history",
       "social-studies",
@@ -115,7 +115,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - Mathematics",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "mathematics",
       "problem-solving",
@@ -129,7 +129,7 @@ export const benchmarksData = [
     "description": "Chinese national college entrance exam - Physics",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "gaokao",
       "physics",
       "science",
@@ -143,7 +143,7 @@ export const benchmarksData = [
     "description": "Law School Admission Test - Analytical Reasoning section",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "law",
       "analytical-reasoning",
       "lsat",
@@ -157,7 +157,7 @@ export const benchmarksData = [
     "description": "Law School Admission Test - Logical Reasoning section",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "law",
       "logical-reasoning",
       "lsat",
@@ -171,7 +171,7 @@ export const benchmarksData = [
     "description": "Law School Admission Test - Reading Comprehension section",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "law",
       "reading-comprehension",
       "lsat",
@@ -185,7 +185,7 @@ export const benchmarksData = [
     "description": "Logical reasoning questions in Chinese",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "logic",
       "reasoning",
       "chinese",
@@ -199,7 +199,7 @@ export const benchmarksData = [
     "description": "Logical reasoning questions in English",
     "category": "agieval",
     "tags": [
-      "multiple-choice",
+      "mcq",
       "logic",
       "reasoning",
       "english",
@@ -213,7 +213,7 @@
export const benchmarksData = [ "description": "Scholastic Assessment Test - English section", "category": "agieval", "tags": [ - "multiple-choice", + "mcq", "sat", "english", "reading", @@ -227,7 +227,7 @@ export const benchmarksData = [ "description": "SAT English questions without reading passages", "category": "agieval", "tags": [ - "multiple-choice", + "mcq", "sat", "english", "grammar", @@ -241,7 +241,7 @@ export const benchmarksData = [ "description": "Scholastic Assessment Test - Math section", "category": "agieval", "tags": [ - "multiple-choice", + "mcq", "sat", "mathematics", "problem-solving", @@ -347,7 +347,7 @@ export const benchmarksData = [ "description": "Adversarial Natural Language Inference - challenging NLI benchmark", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "nli", "adversarial", "reasoning" @@ -360,7 +360,7 @@ export const benchmarksData = [ "description": "Adversarial NLI Round 1", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "nli", "adversarial", "reasoning" @@ -373,7 +373,7 @@ export const benchmarksData = [ "description": "Adversarial NLI Round 2", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "nli", "adversarial", "reasoning" @@ -386,7 +386,7 @@ export const benchmarksData = [ "description": "Adversarial NLI Round 3", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "nli", "adversarial", "reasoning" @@ -444,7 +444,7 @@ export const benchmarksData = [ "description": "AI2 Reasoning Challenge - Challenging questions from grade-school science exams", "category": "core", "tags": [ - "multiple-choice", + "mcq", "science", "commonsense-reasoning" ], @@ -456,7 +456,7 @@ export const benchmarksData = [ "description": "AI2 Reasoning Challenge - Easy questions from grade-school science exams", "category": "core", "tags": [ - "multiple-choice", + "mcq", "science", "commonsense-reasoning" ], @@ -481,7 +481,7 @@ export const benchmarksData = [ "description": "Multi-task Arabic language understanding benchmark from school exams across North Africa, the Levant, and the Gulf", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "multilingual", "education", @@ -495,7 +495,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Accounting questions from university-level exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "accounting", "university" @@ -508,7 +508,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Arabic language questions from general exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "language", "general" @@ -521,7 +521,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Arabic language grammar questions", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "language", "grammar" @@ -534,7 +534,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Arabic language questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "language", "high-school" @@ -547,7 +547,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Arabic language questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "language", "middle-school" @@ -560,7 +560,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Arabic language questions from primary school exams", "category": "domain-specific", "tags": [ - 
"multiple-choice", + "mcq", "arabic", "language", "primary-school" @@ -573,7 +573,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Biology questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "biology", "high-school" @@ -586,7 +586,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Civics questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "civics", "high-school" @@ -599,7 +599,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Civics questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "civics", "middle-school" @@ -612,7 +612,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Computer science questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "computer-science", "high-school" @@ -625,7 +625,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Computer science questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "computer-science", "middle-school" @@ -638,7 +638,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Computer science questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "computer-science", "primary-school" @@ -651,7 +651,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Computer science questions from university-level exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "computer-science", "university" @@ -664,7 +664,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Driving test questions", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "driving" ], @@ -676,7 +676,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Economics questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "economics", "high-school" @@ -689,7 +689,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Economics questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "economics", "middle-school" @@ -702,7 +702,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Economics questions from university-level exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "economics", "university" @@ -715,7 +715,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - General knowledge questions", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "general-knowledge" ], @@ -727,7 +727,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - General knowledge questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "general-knowledge", "middle-school" @@ -740,7 +740,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - General knowledge questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "general-knowledge", "primary-school" @@ -753,7 +753,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Geography questions from high school exams", "category": 
"domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "geography", "high-school" @@ -766,7 +766,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Geography questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "geography", "middle-school" @@ -779,7 +779,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Geography questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "geography", "primary-school" @@ -792,7 +792,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - History questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "history", "high-school" @@ -805,7 +805,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - History questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "history", "middle-school" @@ -818,7 +818,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - History questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "history", "primary-school" @@ -831,7 +831,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Islamic studies questions from general exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "islamic-studies", "general" @@ -844,7 +844,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Islamic studies questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "islamic-studies", "high-school" @@ -857,7 +857,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Islamic studies questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "islamic-studies", "middle-school" @@ -870,7 +870,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Islamic studies questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "islamic-studies", "primary-school" @@ -883,7 +883,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Law questions from professional exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "law", "professional" @@ -896,7 +896,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Management questions from university-level exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "management", "university" @@ -904,25 +904,12 @@ export const benchmarksData = [ "function_name": "arabic_exams_management_university", "is_alpha": false }, - { - "name": "Arabic Exams: Math (High School)", - "description": "Arabic MMLU - Math questions from high school exams", - "category": "domain-specific", - "tags": [ - "multiple-choice", - "arabic", - "math", - "high-school" - ], - "function_name": "arabic_exams_math_high_school", - "is_alpha": false - }, { "name": "Arabic Exams: Math (Primary School)", "description": "Arabic MMLU - Math questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "math", "primary-school" @@ -935,7 +922,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Natural science questions from middle school exams", "category": 
"domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "natural-science", "middle-school" @@ -948,7 +935,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Natural science questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "natural-science", "primary-school" @@ -961,7 +948,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Philosophy questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "philosophy", "high-school" @@ -974,7 +961,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Physics questions from high school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "physics", "high-school" @@ -987,7 +974,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Political science questions from university-level exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "political-science", "university" @@ -1000,7 +987,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Social science questions from middle school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "social-science", "middle-school" @@ -1013,7 +1000,7 @@ export const benchmarksData = [ "description": "Arabic MMLU - Social science questions from primary school exams", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "arabic", "social-science", "primary-school" @@ -1026,7 +1013,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Causal judgment reasoning", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought" @@ -1039,7 +1026,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Understanding and reasoning about dates", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought" @@ -1052,7 +1039,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Pronoun disambiguation in questions", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought" @@ -1065,7 +1052,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Reasoning about geometric shapes", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1079,7 +1066,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Logical deduction with three objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1093,7 +1080,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Logical deduction with five objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1107,7 +1094,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Logical deduction with seven objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1121,7 +1108,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Movie recommendation reasoning", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought" @@ -1134,7 +1121,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Spatial navigation reasoning", "category": "core", "tags": [ - 
"multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1148,7 +1135,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Reasoning about colored objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought" @@ -1161,7 +1148,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Word manipulation and reasoning", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1175,7 +1162,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Detecting translation errors", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1189,7 +1176,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Understanding sarcasm and irony", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1203,7 +1190,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Sports knowledge and reasoning", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1217,7 +1204,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Understanding temporal sequences", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1231,7 +1218,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Tracking three shuffled objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1245,7 +1232,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Tracking five shuffled objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1259,7 +1246,7 @@ export const benchmarksData = [ "description": "BigBench Hard - Tracking seven shuffled objects", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -1273,7 +1260,7 @@ export const benchmarksData = [ "description": "BBQ bias evaluation for a specific category - use individual category tasks instead", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1287,7 +1274,7 @@ export const benchmarksData = [ "description": "Evaluate age-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1302,7 +1289,7 @@ export const benchmarksData = [ "description": "Evaluate disability-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1317,7 +1304,7 @@ export const benchmarksData = [ "description": "Evaluate gender identity-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1332,7 +1319,7 @@ export const benchmarksData = [ "description": "Evaluate nationality-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1347,7 +1334,7 @@ export const benchmarksData = [ "description": "Evaluate physical appearance-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1362,7 +1349,7 @@ export 
const benchmarksData = [ "description": "Evaluate intersectional race and gender biases", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1378,7 +1365,7 @@ export const benchmarksData = [ "description": "Evaluate intersectional race and socioeconomic status biases", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1394,7 +1381,7 @@ export const benchmarksData = [ "description": "Evaluate race and ethnicity-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1409,7 +1396,7 @@ export const benchmarksData = [ "description": "Evaluate religion-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1424,7 +1411,7 @@ export const benchmarksData = [ "description": "Evaluate sexual orientation-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1439,7 +1426,7 @@ export const benchmarksData = [ "description": "Evaluate socioeconomic status-related biases in question-answering", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -1454,7 +1441,7 @@ export const benchmarksData = [ "description": "Benchmark of Linguistic Minimal Pairs testing grammatical knowledge through minimal pair comparisons", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1468,7 +1455,7 @@ export const benchmarksData = [ "description": "BLiMP 'Only' NPI scope", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1482,7 +1469,7 @@ export const benchmarksData = [ "description": "BLiMP 'Only' as NPI licensor", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1496,7 +1483,7 @@ export const benchmarksData = [ "description": "BLiMP Adjunct island effects", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1510,7 +1497,7 @@ export const benchmarksData = [ "description": "BLiMP Anaphor gender agreement", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1524,7 +1511,7 @@ export const benchmarksData = [ "description": "BLiMP Anaphor number agreement", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1538,7 +1525,7 @@ export const benchmarksData = [ "description": "BLiMP Animate subject in passive constructions", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1552,7 +1539,7 @@ export const benchmarksData = [ "description": "BLiMP Animate subject in transitive constructions", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1566,7 +1553,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - c-command", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1580,7 +1567,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - case (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1594,7 
+1581,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - case (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1608,7 +1595,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - domain (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1622,7 +1609,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - domain (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1636,7 +1623,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - domain (3)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1650,7 +1637,7 @@ export const benchmarksData = [ "description": "BLiMP Binding Principle A - reconstruction", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1664,7 +1651,7 @@ export const benchmarksData = [ "description": "BLiMP Causative constructions", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1678,7 +1665,7 @@ export const benchmarksData = [ "description": "BLiMP Complex NP island effects", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1692,7 +1679,7 @@ export const benchmarksData = [ "description": "BLiMP Coordinate structure constraint - complex left branch", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1706,7 +1693,7 @@ export const benchmarksData = [ "description": "BLiMP Coordinate structure constraint - object extraction", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1720,7 +1707,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1734,7 +1721,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1748,7 +1735,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement with adjective (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1762,7 +1749,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement with adjective (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1776,7 +1763,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement with adjective and irregular nouns (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1790,7 +1777,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement with adjective and irregular nouns (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1804,7 +1791,7 @@ export const benchmarksData = [ "description": "BLiMP Determiner-noun agreement with irregular nouns (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1818,7 +1805,7 @@ export 
const benchmarksData = [ "description": "BLiMP Determiner-noun agreement with irregular nouns (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1832,7 +1819,7 @@ export const benchmarksData = [ "description": "BLiMP Distractor agreement in relative clauses", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1846,7 +1833,7 @@ export const benchmarksData = [ "description": "BLiMP Distractor agreement with relational nouns", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1860,7 +1847,7 @@ export const benchmarksData = [ "description": "BLiMP Dropped argument", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1874,7 +1861,7 @@ export const benchmarksData = [ "description": "BLiMP Existential 'there' with object raising", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1888,7 +1875,7 @@ export const benchmarksData = [ "description": "BLiMP Existential 'there' with quantifiers (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1902,7 +1889,7 @@ export const benchmarksData = [ "description": "BLiMP Existential 'there' with quantifiers (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1916,7 +1903,7 @@ export const benchmarksData = [ "description": "BLiMP Existential 'there' with subject raising", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1930,7 +1917,7 @@ export const benchmarksData = [ "description": "BLiMP Expletive 'it' with object raising", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1944,7 +1931,7 @@ export const benchmarksData = [ "description": "BLiMP Inchoative constructions", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1958,7 +1945,7 @@ export const benchmarksData = [ "description": "BLiMP Intransitive verbs", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1972,7 +1959,7 @@ export const benchmarksData = [ "description": "BLiMP Irregular past participles as adjectives", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -1986,7 +1973,7 @@ export const benchmarksData = [ "description": "BLiMP Irregular past participles in verbs", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2000,7 +1987,7 @@ export const benchmarksData = [ "description": "BLiMP Left branch island effects in echo questions", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2014,7 +2001,7 @@ export const benchmarksData = [ "description": "BLiMP Left branch island effects in simple questions", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2028,7 +2015,7 @@ export const benchmarksData = [ "description": "BLiMP Matrix question NPI licensor present", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2042,7 +2029,7 @@ export const benchmarksData = [ "description": "BLiMP N-bar ellipsis (1)", 
"category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2056,7 +2043,7 @@ export const benchmarksData = [ "description": "BLiMP N-bar ellipsis (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2070,7 +2057,7 @@ export const benchmarksData = [ "description": "BLiMP Negative polarity items present (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2084,7 +2071,7 @@ export const benchmarksData = [ "description": "BLiMP Negative polarity items present (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2098,7 +2085,7 @@ export const benchmarksData = [ "description": "BLiMP Passive constructions (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2112,7 +2099,7 @@ export const benchmarksData = [ "description": "BLiMP Passive constructions (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2126,7 +2113,7 @@ export const benchmarksData = [ "description": "BLiMP Sentential negation NPI scope", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2140,7 +2127,7 @@ export const benchmarksData = [ "description": "BLiMP Sentential negation as NPI licensor", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2154,7 +2141,7 @@ export const benchmarksData = [ "description": "BLiMP Sentential subject island effects", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2168,7 +2155,7 @@ export const benchmarksData = [ "description": "BLiMP Subject-verb agreement with irregular plurals (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2182,7 +2169,7 @@ export const benchmarksData = [ "description": "BLiMP Subject-verb agreement with irregular plurals (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2196,7 +2183,7 @@ export const benchmarksData = [ "description": "BLiMP Subject-verb agreement with regular plurals (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2210,7 +2197,7 @@ export const benchmarksData = [ "description": "BLiMP Subject-verb agreement with regular plurals (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2224,7 +2211,7 @@ export const benchmarksData = [ "description": "BLiMP Superlative quantifiers (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2238,7 +2225,7 @@ export const benchmarksData = [ "description": "BLiMP Superlative quantifiers (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2252,7 +2239,7 @@ export const benchmarksData = [ "description": "BLiMP Tough vs raising constructions (1)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2266,7 +2253,7 @@ export const benchmarksData = [ "description": "BLiMP Tough vs raising constructions (2)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ 
-2280,7 +2267,7 @@ export const benchmarksData = [ "description": "BLiMP Transitive verbs", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2294,7 +2281,7 @@ export const benchmarksData = [ "description": "BLiMP Wh vs that complementizers with gap", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2308,7 +2295,7 @@ export const benchmarksData = [ "description": "BLiMP Wh vs that complementizers with gap (long-distance)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2322,7 +2309,7 @@ export const benchmarksData = [ "description": "BLiMP Wh vs that complementizers without gap", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2336,7 +2323,7 @@ export const benchmarksData = [ "description": "BLiMP Wh vs that complementizers without gap (long-distance)", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2350,7 +2337,7 @@ export const benchmarksData = [ "description": "BLiMP Wh-island effects", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2364,7 +2351,7 @@ export const benchmarksData = [ "description": "BLiMP Wh-questions with long-distance subject gap", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2378,7 +2365,7 @@ export const benchmarksData = [ "description": "BLiMP Wh-questions with object gap", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2392,7 +2379,7 @@ export const benchmarksData = [ "description": "BLiMP Wh-questions with subject gap", "category": "linguistic", "tags": [ - "multiple-choice", + "mcq", "linguistics", "grammar", "syntax", @@ -2419,7 +2406,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: anachronisms", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2431,7 +2418,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: analogical_similarity", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2443,7 +2430,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: analytic_entailment", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2455,7 +2442,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: arithmetic", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2467,7 +2454,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: authorship_verification", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2479,7 +2466,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: bbq_lite_json", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2491,7 +2478,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: causal_judgment", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2503,7 +2490,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: cause_and_effect", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2515,7 +2502,7 @@ 
export const benchmarksData = [ "description": "BigBench MCQ task: checkmate_in_one", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2527,7 +2514,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: cifar10_classification", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2539,7 +2526,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: code_line_description", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2551,7 +2538,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: color", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2563,7 +2550,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: common_morpheme", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2575,7 +2562,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: conceptual_combinations", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2587,7 +2574,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: contextual_parametric_knowledge_conflicts", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2599,7 +2586,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: crash_blossom", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2611,7 +2598,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: crass_ai", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2623,7 +2610,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: cryobiology_spanish", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2635,7 +2622,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: cs_algorithms", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2647,7 +2634,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: dark_humor_detection", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2659,7 +2646,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: date_understanding", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2671,7 +2658,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: disambiguation_qa", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2683,7 +2670,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: discourse_marker_prediction", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2695,7 +2682,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: dyck_languages", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2707,7 +2694,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: elementary_math_qa", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2719,7 +2706,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: emoji_movie", "category": "bigbench", "tags": [ - 
"multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2731,7 +2718,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: emojis_emotion_prediction", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2743,7 +2730,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: empirical_judgments", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2755,7 +2742,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: english_proverbs", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2767,7 +2754,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: english_russian_proverbs", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2779,7 +2766,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: entailed_polarity", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2791,7 +2778,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: entailed_polarity_hindi", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2803,7 +2790,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: epistemic_reasoning", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2815,7 +2802,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: evaluating_information_essentiality", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2827,7 +2814,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: fact_checker", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2839,7 +2826,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: fantasy_reasoning", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2851,7 +2838,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: figure_of_speech_detection", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2863,7 +2850,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: formal_fallacies_syllogisms_negation", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2875,7 +2862,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: general_knowledge", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2887,7 +2874,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: geometric_shapes", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2899,7 +2886,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: goal_step_wikihow", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2911,7 +2898,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: gre_reading_comprehension", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2923,7 +2910,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: hhh_alignment", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2935,7 +2922,7 @@ export 
const benchmarksData = [ "description": "BigBench MCQ task: hindu_knowledge", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2947,7 +2934,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: hinglish_toxicity", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2959,7 +2946,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: human_organs_senses", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2971,7 +2958,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: hyperbaton", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2983,7 +2970,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: identify_math_theorems", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -2995,7 +2982,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: identify_odd_metaphor", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3007,7 +2994,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: implicatures", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3019,7 +3006,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: implicit_relations", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3031,7 +3018,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: indic_cause_and_effect", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3043,7 +3030,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: intent_recognition", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3055,7 +3042,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: international_phonetic_alphabet_nli", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3067,7 +3054,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: intersect_geometry", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3079,7 +3066,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: irony_identification", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3091,7 +3078,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: kanji_ascii", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3103,7 +3090,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: kannada", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3115,7 +3102,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: key_value_maps", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3127,7 +3114,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: known_unknowns", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3139,7 +3126,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: language_identification", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", 
"reasoning", "bigbench" ], @@ -3151,7 +3138,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: logic_grid_puzzle", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3163,7 +3150,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: logical_args", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3175,7 +3162,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: logical_deduction", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3187,7 +3174,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: logical_fallacy_detection", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3199,7 +3186,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: logical_sequence", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3211,7 +3198,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: mathematical_induction", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3223,7 +3210,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: medical_questions_russian", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3235,7 +3222,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: metaphor_boolean", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3247,7 +3234,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: metaphor_understanding", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3259,7 +3246,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: minute_mysteries_qa", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3271,7 +3258,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: misconceptions", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3283,7 +3270,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: misconceptions_russian", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3295,7 +3282,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: mnist_ascii", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3307,7 +3294,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: moral_permissibility", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3319,7 +3306,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: movie_dialog_same_or_different", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3331,7 +3318,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: movie_recommendation", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3343,7 +3330,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: navigate", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3355,7 +3342,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: 
nonsense_words_grammar", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3367,7 +3354,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: novel_concepts", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3379,7 +3366,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: odd_one_out", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3391,7 +3378,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: parsinlu_qa", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3403,7 +3390,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: penguins_in_a_table", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3415,7 +3402,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: periodic_elements", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3427,7 +3414,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: persian_idioms", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3439,7 +3426,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: phrase_relatedness", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3451,7 +3438,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: physical_intuition", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3463,7 +3450,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: physics", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3475,7 +3462,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: play_dialog_same_or_different", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3487,7 +3474,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: presuppositions_as_nli", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3499,7 +3486,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: question_selection", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3511,7 +3498,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: real_or_fake_text", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3523,7 +3510,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: reasoning_about_colored_objects", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3535,7 +3522,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: rhyming", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3547,7 +3534,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: riddle_sense", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3559,7 +3546,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: ruin_names", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3571,7 +3558,7 @@ export const benchmarksData = 
[ "description": "BigBench MCQ task: salient_translation_error_detection", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3583,7 +3570,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: sentence_ambiguity", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3595,7 +3582,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: similarities_abstraction", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3607,7 +3594,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: simple_ethical_questions", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3619,7 +3606,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: snarks", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3631,7 +3618,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: social_iqa", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3643,7 +3630,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: social_support", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3655,7 +3642,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: sports_understanding", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3667,7 +3654,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: strange_stories", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3679,7 +3666,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: strategyqa", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3691,7 +3678,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: suicide_risk", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3703,7 +3690,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: swahili_english_proverbs", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3715,7 +3702,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: swedish_to_german_proverbs", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3727,7 +3714,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: symbol_interpretation", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3739,7 +3726,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: temporal_sequences", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3751,7 +3738,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: timedial", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3763,7 +3750,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: tracking_shuffled_objects", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3775,7 +3762,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: understanding_fables", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", 
"reasoning", "bigbench" ], @@ -3787,7 +3774,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: undo_permutation", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3799,7 +3786,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: unit_conversion", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3811,7 +3798,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: unit_interpretation", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3823,7 +3810,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: vitaminc_fact_verification", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3835,7 +3822,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: what_is_the_tao", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3847,7 +3834,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: which_wiki_edit", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3859,7 +3846,7 @@ export const benchmarksData = [ "description": "BigBench MCQ task: winowhy", "category": "bigbench", "tags": [ - "multiple-choice", + "mcq", "reasoning", "bigbench" ], @@ -3872,7 +3859,8 @@ export const benchmarksData = [ "category": "core", "tags": [ "boolean-reasoning", - "question-answering" + "question-answering", + "mcq" ], "function_name": "boolq", "is_alpha": false @@ -3895,7 +3883,7 @@ export const benchmarksData = [ "description": "Choice of Plausible Alternatives for causal reasoning", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "reasoning" @@ -3947,7 +3935,7 @@ export const benchmarksData = [ "description": "Natural language inference with commitment", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "reasoning" @@ -4024,7 +4012,7 @@ export const benchmarksData = [ "description": "Aligning AI With Shared Human Values - tests moral reasoning across 5 fundamental dimensions", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "philosophy" @@ -4037,7 +4025,7 @@ export const benchmarksData = [ "description": "Tests everyday moral reasoning and common ethical intuitions", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "commonsense", @@ -4051,7 +4039,7 @@ export const benchmarksData = [ "description": "Tests duty-based ethics and understanding of moral rules", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "deontology", @@ -4065,7 +4053,7 @@ export const benchmarksData = [ "description": "Tests fairness and impartiality in ethical decision-making", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "justice", @@ -4079,7 +4067,7 @@ export const benchmarksData = [ "description": "Tests consequence-based ethics and utility maximization", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "utilitarianism", @@ -4093,7 +4081,7 @@ export const benchmarksData = [ "description": "Tests character-based ethics and recognition of virtuous behavior", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "virtue", @@ 
-4198,7 +4186,7 @@ export const benchmarksData = [ "description": "General Language Understanding Evaluation benchmark suite", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli", "sentiment", @@ -4212,7 +4200,7 @@ export const benchmarksData = [ "description": "Corpus of Linguistic Acceptability", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4224,7 +4212,7 @@ export const benchmarksData = [ "description": "Multi-Genre Natural Language Inference", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4236,7 +4224,7 @@ export const benchmarksData = [ "description": "MNLI Mismatched", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4248,7 +4236,7 @@ export const benchmarksData = [ "description": "Microsoft Research Paraphrase Corpus", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4260,7 +4248,7 @@ export const benchmarksData = [ "description": "Question Natural Language Inference", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4272,7 +4260,7 @@ export const benchmarksData = [ "description": "Quora Question Pairs", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4284,7 +4272,7 @@ export const benchmarksData = [ "description": "Recognizing Textual Entailment", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4296,7 +4284,7 @@ export const benchmarksData = [ "description": "Stanford Sentiment Treebank", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4308,7 +4296,7 @@ export const benchmarksData = [ "description": "Semantic Textual Similarity Benchmark", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4320,7 +4308,7 @@ export const benchmarksData = [ "description": "Winograd Natural Language Inference", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "glue", "nli" ], @@ -4332,7 +4320,8 @@ export const benchmarksData = [ "description": "GitHub Multiple Choice Questions", "category": "core", "tags": [ - "code-understanding" + "code-understanding", + "mcq" ], "function_name": "rootly_gmcq", "is_alpha": false @@ -4342,7 +4331,7 @@ export const benchmarksData = [ "description": "Graduate-level science questions (multiple choice) across physics, chemistry, and biology", "category": "core", "tags": [ - "multiple-choice", + "mcq", "science", "graduate-level", "reasoning" @@ -4355,7 +4344,7 @@ export const benchmarksData = [ "description": "Graduate-level Google-Proof Q&A in biology, chemistry, and physics", "category": "core", "tags": [ - "multiple-choice", + "mcq", "science", "graduate-level" ], @@ -4419,7 +4408,7 @@ export const benchmarksData = [ "description": "Culturally adapted multilingual MMLU with 42 languages", "category": "core", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-sensitivity", "mmlu" @@ -4432,7 +4421,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Amharic (am)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4445,7 +4434,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Arabic (ar)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4458,7 +4447,7 @@ export const benchmarksData = [ "description": 
"Global-MMLU culturally adapted MMLU for Bengali (bn)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4471,7 +4460,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Chichewa (ny)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4484,7 +4473,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Chinese (zh)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4497,7 +4486,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Czech (cs)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4510,7 +4499,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Dutch (nl)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4523,7 +4512,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for English (en)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4536,7 +4525,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Filipino (fil)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4549,7 +4538,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for French (fr)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4562,7 +4551,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for German (de)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4575,7 +4564,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Greek (el)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4588,7 +4577,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Hausa (ha)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4601,7 +4590,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Hebrew (he)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4614,7 +4603,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Hindi (hi)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4627,7 +4616,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Igbo (ig)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4640,7 +4629,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Indonesian (id)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", 
"multilingual", "cultural-adaptation", "global-mmlu" @@ -4653,7 +4642,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Italian (it)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4666,7 +4655,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Japanese (ja)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4679,7 +4668,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Korean (ko)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4692,7 +4681,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Kyrgyz (ky)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4705,7 +4694,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Lithuanian (lt)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4718,7 +4707,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Malagasy (mg)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4731,7 +4720,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Malay (ms)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4744,7 +4733,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Nepali (ne)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4757,7 +4746,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Persian (fa)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4770,7 +4759,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Polish (pl)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4783,7 +4772,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Portuguese (pt)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4796,7 +4785,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Romanian (ro)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4809,7 +4798,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Russian (ru)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4822,7 +4811,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Serbian (sr)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4835,7 +4824,7 @@ export const benchmarksData = [ "description": 
"Global-MMLU culturally adapted MMLU for Shona (sn)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4848,7 +4837,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Sinhala (si)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4861,7 +4850,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Somali (so)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4874,7 +4863,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Spanish (es)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4887,7 +4876,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Swahili (sw)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4900,7 +4889,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Swedish (sv)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4913,7 +4902,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Telugu (te)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4926,7 +4915,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Turkish (tr)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4939,7 +4928,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Ukrainian (uk)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4952,7 +4941,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Vietnamese (vi)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -4965,7 +4954,7 @@ export const benchmarksData = [ "description": "Global-MMLU culturally adapted MMLU for Yoruba (yo)", "category": "global-mmlu", "tags": [ - "multiple-choice", + "mcq", "multilingual", "cultural-adaptation", "global-mmlu" @@ -5058,7 +5047,7 @@ export const benchmarksData = [ "description": "Spanish healthcare specialization exam questions (Spanish and English)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "medical", "healthcare", "multilingual" @@ -5071,7 +5060,7 @@ export const benchmarksData = [ "description": "Spanish healthcare specialization exam questions in English", "category": "core", "tags": [ - "multiple-choice", + "mcq", "medical", "healthcare", "english" @@ -5084,7 +5073,7 @@ export const benchmarksData = [ "description": "Spanish healthcare specialization exam questions in Spanish", "category": "core", "tags": [ - "multiple-choice", + "mcq", "medical", "healthcare", "spanish" @@ -5138,7 +5127,7 @@ export const benchmarksData = [ "description": "Adversarially-filtered sentence completion benchmark for commonsense reasoning", "category": "core", "tags": [ - "multiple-choice", + "mcq", 
"commonsense-reasoning", "sentence-completion" ], @@ -5228,7 +5217,7 @@ export const benchmarksData = [ "description": "Legal citation support identification - identify which citation provides stronger support for a legal argument", "category": "domain-specific", "tags": [ - "multiple-choice", + "mcq", "legal", "reasoning", "citation-analysis" @@ -5255,7 +5244,7 @@ export const benchmarksData = [ "description": "Logical reasoning dataset from Chinese civil service exam questions - tests deductive reasoning skills", "category": "knowledge-qa", "tags": [ - "multiple-choice", + "mcq", "logical-reasoning", "deduction", "critical-thinking" @@ -5518,7 +5507,7 @@ export const benchmarksData = [ "description": "Massive Multitask Language Understanding - 57 academic subjects from the cais/mmlu dataset. Only supports English (EN-US).", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "reasoning", "multitask" @@ -5531,7 +5520,7 @@ export const benchmarksData = [ "description": "Enhanced version of MMLU with more challenging, reasoning-focused questions.", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "reasoning", "multitask" @@ -5544,7 +5533,7 @@ export const benchmarksData = [ "description": "Manually re-annotated subset of 5,700 MMLU questions addressing annotation errors in the original dataset.", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "reasoning", "multitask" @@ -5557,7 +5546,7 @@ export const benchmarksData = [ "description": "MMLU in Arabic (AR_XY)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5570,7 +5559,7 @@ export const benchmarksData = [ "description": "MMLU in Bengali (BN_BD)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5583,7 +5572,7 @@ export const benchmarksData = [ "description": "MMLU in Chinese (ZH_CN)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5596,7 +5585,7 @@ export const benchmarksData = [ "description": "MMLU in French (FR_FR)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5609,7 +5598,7 @@ export const benchmarksData = [ "description": "MMLU in German (DE_DE)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5622,7 +5611,7 @@ export const benchmarksData = [ "description": "MMLU in Hindi (HI_IN)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5635,7 +5624,7 @@ export const benchmarksData = [ "description": "MMLU in Indonesian (ID_ID)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5648,7 +5637,7 @@ export const benchmarksData = [ "description": "MMLU in Italian (IT_IT)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5661,7 +5650,7 @@ export const benchmarksData = [ "description": "MMLU in Japanese (JA_JP)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5674,7 +5663,7 @@ export const benchmarksData = [ "description": "MMLU in Korean (KO_KR)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5687,7 +5676,7 @@ export const benchmarksData = [ "description": "MMLU in Portuguese Brazil (PT_BR)", "category": "core", "tags": [ - "multiple-choice", + "mcq", 
"knowledge", "multilingual", "mmmlu" @@ -5700,7 +5689,7 @@ export const benchmarksData = [ "description": "MMLU in Spanish Latin America (ES_LA)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5713,7 +5702,7 @@ export const benchmarksData = [ "description": "MMLU in Swahili (SW_KE)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5726,7 +5715,7 @@ export const benchmarksData = [ "description": "MMLU in Yoruba (YO_NG)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "multilingual", "mmmlu" @@ -5739,7 +5728,7 @@ export const benchmarksData = [ "description": "MMLU translated to 15 languages.", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "reasoning", "multitask" @@ -5753,7 +5742,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "reasoning", "college-level", "images" @@ -5767,7 +5756,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "accounting", "business", "images" @@ -5781,7 +5770,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "agriculture", "science", "images" @@ -5795,7 +5784,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "architecture", "engineering", "design", @@ -5810,7 +5799,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "art", "visual-design", "images" @@ -5824,7 +5813,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "art", "theory", "history", @@ -5839,7 +5828,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "medicine", "science", "health", @@ -5854,7 +5843,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "biology", "science", "images" @@ -5868,7 +5857,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "chemistry", "science", "images" @@ -5882,7 +5871,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "medicine", "clinical", "health", @@ -5897,7 +5886,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "design", "visual", "creative", @@ -5912,7 +5901,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "medicine", "diagnostics", "laboratory", @@ -5927,7 +5916,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "electronics", "engineering", "technology", @@ -5942,7 +5931,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "energy", "power", "engineering", @@ -5957,7 +5946,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "finance", "business", "economics", @@ -5972,7 +5961,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "geography", "earth-science", "spatial", @@ -5987,7 +5976,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - 
"multiple-choice", + "mcq", "history", "humanities", "culture", @@ -6002,7 +5991,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "literature", "humanities", "language", @@ -6017,7 +6006,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "images" ], "function_name": "mmmu_mcq", @@ -6029,7 +6018,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "management", "business", "leadership", @@ -6044,7 +6033,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "marketing", "business", "communication", @@ -6059,7 +6048,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "materials", "science", "engineering", @@ -6074,7 +6063,7 @@ export const benchmarksData = [ "category": "math", "tags": [ "multimodal", - "multiple-choice", + "mcq", "mathematics", "reasoning", "images" @@ -6088,7 +6077,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "mechanical", "engineering", "design", @@ -6103,7 +6092,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "music", "arts", "theory", @@ -6119,7 +6108,8 @@ export const benchmarksData = [ "tags": [ "multimodal", "open-ended", - "images" + "images", + "mcq" ], "function_name": "mmmu_open", "is_alpha": false @@ -6130,7 +6120,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "pharmacy", "medicine", "health", @@ -6145,7 +6135,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "physics", "science", "images" @@ -6159,7 +6149,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "public-health", "health", "population", @@ -6174,7 +6164,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "sociology", "social-science", "society", @@ -6189,7 +6179,7 @@ export const benchmarksData = [ "category": "core", "tags": [ "multimodal", - "multiple-choice", + "mcq", "reasoning", "images", "mmmu-pro" @@ -6204,7 +6194,7 @@ export const benchmarksData = [ "tags": [ "multimodal", "vision", - "multiple-choice", + "mcq", "images", "mmmu-pro" ], @@ -6230,7 +6220,7 @@ export const benchmarksData = [ "description": "Mathematical word problems with multiple-choice answers and solution rationales", "category": "knowledge-qa", "tags": [ - "multiple-choice", + "mcq", "mathematics", "word-problems", "reasoning" @@ -6258,7 +6248,7 @@ export const benchmarksData = [ "description": "Medical multiple-choice questions from Indian medical entrance exams (AIIMS & NEET PG)", "category": "core", "tags": [ - "multiple-choice", + "mcq", "medical", "healthcare", "medicine" @@ -6271,7 +6261,7 @@ export const benchmarksData = [ "description": "US Medical Licensing Exam (USMLE) questions for medical reasoning", "category": "core", "tags": [ - "multiple-choice", + "mcq", "medical", "healthcare", "medicine", @@ -6324,7 +6314,7 @@ export const benchmarksData = [ "description": "Testing the Limits of Chain-of-thought with Multistep Soft Reasoning - includes murder mysteries, object placements, and team allocation tasks", "category": "core", "tags": [ - "multiple-choice", + 
"mcq", "reasoning", "commonsense", "chain-of-thought" @@ -6337,7 +6327,7 @@ export const benchmarksData = [ "description": "MuSR murder mystery scenarios - who is the most likely murderer?", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "commonsense", "chain-of-thought", @@ -6351,7 +6341,7 @@ export const benchmarksData = [ "description": "MuSR object placement reasoning - where would someone look for an object?", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "commonsense", "chain-of-thought", @@ -6365,7 +6355,7 @@ export const benchmarksData = [ "description": "MuSR team allocation problems - how to allocate people to tasks efficiently?", "category": "core", "tags": [ - "multiple-choice", + "mcq", "reasoning", "commonsense", "chain-of-thought", @@ -6392,7 +6382,7 @@ export const benchmarksData = [ "description": "Multi-Sentence Reading Comprehension", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "reasoning" @@ -6484,7 +6474,7 @@ export const benchmarksData = [ "description": "Elementary-level science questions probing understanding of core facts", "category": "core", "tags": [ - "multiple-choice", + "mcq", "science", "elementary", "open-book" @@ -6497,7 +6487,7 @@ export const benchmarksData = [ "description": "Physical Interaction Question Answering - commonsense about physical situations", "category": "core", "tags": [ - "multiple-choice", + "mcq", "commonsense-reasoning", "physical-reasoning" ], @@ -6509,7 +6499,7 @@ export const benchmarksData = [ "description": "Physical Reasoning about Objects through Space and Time", "category": "core", "tags": [ - "multiple-choice", + "mcq", "commonsense-reasoning", "physical-reasoning" ], @@ -6859,7 +6849,7 @@ export const benchmarksData = [ "description": "Biomedical question answering from PubMed abstracts", "category": "core", "tags": [ - "multiple-choice", + "mcq", "medical", "biomedical", "research", @@ -6873,7 +6863,7 @@ export const benchmarksData = [ "description": "Question Answering for Machine Reading Evaluation - CLEF shared tasks 2011-2013", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading" @@ -6886,7 +6876,7 @@ export const benchmarksData = [ "description": "Question Answering for Machine Reading Evaluation (English, 2011)", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading", @@ -6900,7 +6890,7 @@ export const benchmarksData = [ "description": "Question Answering for Machine Reading Evaluation (English, 2012)", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading", @@ -6914,7 +6904,7 @@ export const benchmarksData = [ "description": "Question Answering for Machine Reading Evaluation (English, 2013)", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading", @@ -6928,7 +6918,7 @@ export const benchmarksData = [ "description": "Question Answering on Scientific Papers - binary yes/no questions on research paper abstracts", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "scientific-papers", "binary-classification" @@ -6941,7 +6931,7 @@ export const benchmarksData = [ "description": "Reading comprehension from middle and high school English exams (combined)", "category": "reading-comprehension", 
"tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "english-exam" ], @@ -6953,7 +6943,7 @@ export const benchmarksData = [ "description": "High school level reading comprehension from English exams for Chinese students - passages with multiple questions", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "english-exam", "high-school" @@ -6966,7 +6956,7 @@ export const benchmarksData = [ "description": "Middle school level reading comprehension from English exams for Chinese students", "category": "reading-comprehension", "tags": [ - "multiple-choice", + "mcq", "reading-comprehension", "english-exam", "middle-school" @@ -6979,7 +6969,7 @@ export const benchmarksData = [ "description": "Recognizing Textual Entailment from SuperGLUE", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "reasoning" @@ -7115,7 +7105,7 @@ export const benchmarksData = [ "description": "Situations With Adversarial Generations - grounded commonsense inference", "category": "core", "tags": [ - "multiple-choice", + "mcq", "commonsense-reasoning", "video-captions" ], @@ -7151,7 +7141,7 @@ export const benchmarksData = [ "description": "Science exam questions covering Physics, Chemistry, Biology, and other scientific domains", "category": "knowledge-qa", "tags": [ - "multiple-choice", + "mcq", "science", "physics", "chemistry", @@ -7189,7 +7179,7 @@ export const benchmarksData = [ "description": "Social Intelligence Question Answering - tests reasoning about social situations, emotions, and mental states", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "social-intelligence", "emotional-reasoning", "theory-of-mind" @@ -7202,7 +7192,7 @@ export const benchmarksData = [ "description": "SuperGLUE benchmark suite - run any subset by name (boolq, cb, copa, multirc, rte, wic, wsc)", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "reasoning" @@ -7215,7 +7205,7 @@ export const benchmarksData = [ "description": "Scaling LLM Evaluation across 285 Graduate Disciplines - 26,529 multiple-choice questions across science, engineering, medicine, economics, and philosophy", "category": "core", "tags": [ - "multiple-choice", + "mcq", "knowledge", "graduate-level", "multidisciplinary" @@ -7230,7 +7220,6 @@ export const benchmarksData = [ "tags": [ "factuality", "question-answering", - "multiple-choice", "reasoning" ], "function_name": "tumlu", @@ -7277,7 +7266,8 @@ export const benchmarksData = [ "description": "Terraform Multiple Choice Questions", "category": "core", "tags": [ - "code-understanding" + "code-understanding", + "mcq" ], "function_name": "rootly_terraform", "is_alpha": false @@ -7287,7 +7277,7 @@ export const benchmarksData = [ "description": "Toxicity detection benchmark - tests ability to identify toxic and hateful language", "category": "ethics-social", "tags": [ - "multiple-choice", + "mcq", "toxicity-detection", "hate-speech", "safety" @@ -7312,7 +7302,7 @@ export const benchmarksData = [ "description": "Tests if models generate truthful answers to questions that humans often answer falsely due to misconceptions", "category": "knowledge-qa", "tags": [ - "multiple-choice", + "mcq", "truthfulness", "misconceptions", "factuality" @@ -7325,7 +7315,7 @@ export const benchmarksData = [ "description": "Winograd Schema Challenge - coreference resolution", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "reasoning", @@ -7339,7 +7329,7 @@ export 
const benchmarksData = [ "description": "Original Winograd Schema Challenge with 273 expert-crafted questions", "category": "core", "tags": [ - "multiple-choice", + "mcq", "commonsense-reasoning", "pronoun-resolution" ], @@ -7351,7 +7341,7 @@ export const benchmarksData = [ "description": "Word in Context - word sense disambiguation", "category": "glue", "tags": [ - "multiple-choice", + "mcq", "superglue", "nli", "wsd", @@ -7365,7 +7355,7 @@ export const benchmarksData = [ "description": "Large-scale Winograd Schema Challenge for commonsense pronoun resolution", "category": "core", "tags": [ - "multiple-choice", + "mcq", "commonsense-reasoning", "pronoun-resolution" ], @@ -7377,7 +7367,7 @@ export const benchmarksData = [ "description": "Cross-lingual Choice of Plausible Alternatives for causal commonsense reasoning", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual" @@ -7390,7 +7380,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Chinese (zh)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7404,7 +7394,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Estonian (et)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7418,7 +7408,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Haitian Creole (ht)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7432,7 +7422,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Indonesian (id)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7446,7 +7436,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Italian (it)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7460,7 +7450,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Quechua (qu)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7474,7 +7464,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Swahili (sw)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7488,7 +7478,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Tamil (ta)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7502,7 +7492,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Thai (th)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7516,7 +7506,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Turkish (tr)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7530,7 +7520,7 @@ export const benchmarksData = [ "description": "XCOPA causal reasoning for Vietnamese (vi)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -7544,7 +7534,7 @@ export const 
benchmarksData = [ "description": "Cross-lingual story completion for commonsense reasoning", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual" @@ -7557,7 +7547,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Arabic (ar)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7571,7 +7561,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Basque (eu)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7585,7 +7575,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Burmese (my)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7599,7 +7589,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Chinese (zh)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7613,7 +7603,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for English (en)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7627,7 +7617,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Hindi (hi)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7641,7 +7631,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Indonesian (id)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7655,7 +7645,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Russian (ru)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7669,7 +7659,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Spanish (es)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7683,7 +7673,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Swahili (sw)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7697,7 +7687,7 @@ export const benchmarksData = [ "description": "XStoryCloze story completion for Telugu (te)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -7711,7 +7701,7 @@ export const benchmarksData = [ "description": "Cross-lingual Winograd Schema Challenge for pronoun resolution", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual" @@ -7724,7 +7714,7 @@ export const benchmarksData = [ "description": "XWinograd pronoun resolution for Chinese (zh)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -7738,7 +7728,7 @@ export const benchmarksData = [ "description": "XWinograd pronoun resolution for English (en)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", 
"multilingual", @@ -7752,7 +7742,7 @@ export const benchmarksData = [ "description": "XWinograd pronoun resolution for French (fr)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -7766,7 +7756,7 @@ export const benchmarksData = [ "description": "XWinograd pronoun resolution for Japanese (jp)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -7780,7 +7770,7 @@ export const benchmarksData = [ "description": "XWinograd pronoun resolution for Portuguese (pt)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -7794,7 +7784,7 @@ export const benchmarksData = [ "description": "XWinograd pronoun resolution for Russian (ru)", "category": "cross-lingual", "tags": [ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -7864,13 +7854,13 @@ export const evalGroupsData = [ }, { "name": "Arabic Exams", - "description": "Aggregate of 40+ Arabic exam tasks", + "description": "Aggregate of 40 Arabic exam tasks", "category": "eval-group", "tags": [ "eval-group" ], "id": "arabic_exams", - "benchmark_count": 41, + "benchmark_count": 40, "benchmarks": [ "arabic_exams_accounting_university", "arabic_exams_arabic_language_general", @@ -7904,7 +7894,6 @@ export const evalGroupsData = [ "arabic_exams_islamic_studies_primary_school", "arabic_exams_law_professional", "arabic_exams_management_university", - "arabic_exams_math_high_school", "arabic_exams_math_primary_school", "arabic_exams_natural_science_middle_school", "arabic_exams_natural_science_primary_school", diff --git a/scripts/validate_mcq_tags.py b/scripts/validate_mcq_tags.py new file mode 100755 index 00000000..4e8c7152 --- /dev/null +++ b/scripts/validate_mcq_tags.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +""" +Validate and auto-fix "mcq" tags for MCQ benchmarks. + +This script uses the centralized is_mcq_task() function to detect MCQ benchmarks +by inspecting their actual implementations, then ensures they all have the +"mcq" tag in config.py. + +Performance Note: + This script loads all benchmarks (~200+) to check their scorers via + is_mcq_task(). This task is implemented in CI/CD since it takes 2-3 minutes + and runs infrequently (only when config.py or eval files change in PRs). + The detection is comprehensive and catches all inconsistencies. + +Usage: + # Check for missing or incorrect tags + python3 scripts/validate_mcq_tags.py + + # Auto-fix config.py (adds missing tags, removes incorrect ones) + python3 scripts/validate_mcq_tags.py --fix + + # Only check alpha benchmarks + python3 scripts/validate_mcq_tags.py --alpha-only +""" + +from __future__ import annotations +import argparse +import sys +from pathlib import Path +from typing import List, Tuple + +# Add src to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from openbench.config import get_all_benchmarks, BENCHMARKS +from openbench.utils.mcq import get_mcq_benchmarks + + +def validate_mcq_tags(include_alpha: bool = True) -> Tuple[List[str], List[str]]: + """ + Validate that MCQ benchmarks have the "mcq" tag. 
+ + Args: + include_alpha: Whether to check alpha benchmarks + + Returns: + Tuple of (missing_tag_benchmarks, incorrect_tag_benchmarks) + - missing_tag: MCQ benchmarks without "mcq" tag + - incorrect_tag: Non-MCQ benchmarks with "mcq" tag + """ + print("🔍 Detecting MCQ benchmarks by inspecting implementations...") + mcq_benchmarks = set(get_mcq_benchmarks(include_alpha=include_alpha)) + print(f" Found {len(mcq_benchmarks)} MCQ benchmarks") + + print("\n🔍 Checking tags in config.py...") + all_benchmarks = get_all_benchmarks(include_alpha=include_alpha) + + missing_tag = [] + incorrect_tag = [] + + for benchmark_name, metadata in all_benchmarks.items(): + is_mcq = benchmark_name in mcq_benchmarks + has_tag = "mcq" in metadata.tags + + if is_mcq and not has_tag: + missing_tag.append(benchmark_name) + elif not is_mcq and has_tag: + incorrect_tag.append(benchmark_name) + + return missing_tag, incorrect_tag + + +def fix_config_file(missing_tag: List[str], incorrect_tag: List[str]) -> None: + """ + Update config.py to fix missing and incorrect tags. + + Args: + missing_tag: List of benchmarks missing "mcq" tag + incorrect_tag: List of benchmarks incorrectly tagged + """ + config_path = Path(__file__).parent.parent / "src" / "openbench" / "config.py" + + if not config_path.exists(): + print(f"❌ Config file not found: {config_path}") + return + + print(f"\n📝 Reading {config_path}...") + content = config_path.read_text() + original_content = content + + # Get current tags for each benchmark + all_benchmarks = get_all_benchmarks(include_alpha=True) + + # Add missing "mcq" tags + for benchmark in missing_tag: + print(f" ➕ Adding 'mcq' tag to {benchmark}") + + if benchmark not in all_benchmarks: + print(f" ⚠️ Benchmark not found: {benchmark}") + continue + + current_tags = list(all_benchmarks[benchmark].tags) # Copy the list + + # Skip if already has "mcq" (shouldn't happen, but be safe) + if "mcq" in current_tags: + continue + + # Add "mcq" at the beginning + new_tags = current_tags.copy() + new_tags.insert(0, "mcq") + + # Build the old and new tags strings + old_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in current_tags) + ']' + new_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in new_tags) + ']' + + # Replace in content + if old_tags_str in content: + content = content.replace(old_tags_str, new_tags_str, 1) + else: + print(f" ⚠️ Could not find exact tags match for {benchmark}") + + # Remove incorrect "mcq" tags + for benchmark in incorrect_tag: + print(f" ➖ Removing 'mcq' tag from {benchmark}") + + if benchmark not in all_benchmarks: + print(f" ⚠️ Benchmark not found: {benchmark}") + continue + + current_tags = list(all_benchmarks[benchmark].tags) # Copy the list + + # Skip if doesn't have "mcq" (shouldn't happen, but be safe) + if "mcq" not in current_tags: + continue + + # Remove "mcq" from tags + new_tags = current_tags.copy() + new_tags.remove("mcq") + + # Build the old and new tags strings + old_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in current_tags) + ']' + new_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in new_tags) + ']' + + # Replace in content + if old_tags_str in content: + content = content.replace(old_tags_str, new_tags_str, 1) + else: + print(f" ⚠️ Could not find exact tags match for {benchmark}") + + if content != original_content: + print(f"\n💾 Writing changes to {config_path}...") + config_path.write_text(content) + print("✅ Config file updated successfully!") + else: + print("\n✅ No changes needed - config file is already correct") + + +def main() -> None: + parser 
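+
+
+# Example: if "openbookqa" were detected as an MCQ task but missing the tag,
+# while "tumlu" carried the tag but were not detected as MCQ, the function
+# above would return (["openbookqa"], ["tumlu"]). (Names are illustrative.)
+
+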
+def fix_config_file(missing_tag: List[str], incorrect_tag: List[str]) -> None:
+    """
+    Update config.py to fix missing and incorrect tags.
+
+    Args:
+        missing_tag: List of benchmarks missing "mcq" tag
+        incorrect_tag: List of benchmarks incorrectly tagged
+    """
+    config_path = Path(__file__).parent.parent / "src" / "openbench" / "config.py"
+
+    if not config_path.exists():
+        print(f"❌ Config file not found: {config_path}")
+        return
+
+    print(f"\n📝 Reading {config_path}...")
+    content = config_path.read_text()
+    original_content = content
+
+    # Get current tags for each benchmark
+    all_benchmarks = get_all_benchmarks(include_alpha=True)
+
+    # Add missing "mcq" tags
+    for benchmark in missing_tag:
+        print(f"   ➕ Adding 'mcq' tag to {benchmark}")
+
+        if benchmark not in all_benchmarks:
+            print(f"   ⚠️  Benchmark not found: {benchmark}")
+            continue
+
+        current_tags = list(all_benchmarks[benchmark].tags)  # Copy the list
+
+        # Skip if already has "mcq" (shouldn't happen, but be safe)
+        if "mcq" in current_tags:
+            continue
+
+        # Add "mcq" at the beginning
+        new_tags = current_tags.copy()
+        new_tags.insert(0, "mcq")
+
+        # Build the old and new tags strings. The exact-string match assumes the
+        # tags are written on a single line; multi-line tag lists fall through
+        # to the warning below and must be fixed by hand.
+        old_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in current_tags) + ']'
+        new_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in new_tags) + ']'
+
+        # Replace in content
+        if old_tags_str in content:
+            content = content.replace(old_tags_str, new_tags_str, 1)
+        else:
+            print(f"   ⚠️  Could not find exact tags match for {benchmark}")
+
+    # Remove incorrect "mcq" tags
+    for benchmark in incorrect_tag:
+        print(f"   ➖ Removing 'mcq' tag from {benchmark}")
+
+        if benchmark not in all_benchmarks:
+            print(f"   ⚠️  Benchmark not found: {benchmark}")
+            continue
+
+        current_tags = list(all_benchmarks[benchmark].tags)  # Copy the list
+
+        # Skip if doesn't have "mcq" (shouldn't happen, but be safe)
+        if "mcq" not in current_tags:
+            continue
+
+        # Remove "mcq" from tags
+        new_tags = current_tags.copy()
+        new_tags.remove("mcq")
+
+        # Build the old and new tags strings (same single-line assumption)
+        old_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in current_tags) + ']'
+        new_tags_str = 'tags=[' + ', '.join(f'"{t}"' for t in new_tags) + ']'
+
+        # Replace in content
+        if old_tags_str in content:
+            content = content.replace(old_tags_str, new_tags_str, 1)
+        else:
+            print(f"   ⚠️  Could not find exact tags match for {benchmark}")
+
+    if content != original_content:
+        print(f"\n💾 Writing changes to {config_path}...")
+        config_path.write_text(content)
+        print("✅ Config file updated successfully!")
+    else:
+        print("\n✅ No changes needed - config file is already correct")
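+
+
+# For instance (illustrative): fix_config_file(["openbookqa"], []) rewrites a
+# single-line entry like tags=["science", "elementary", "open-book"] into
+# tags=["mcq", "science", "elementary", "open-book"], leaving multi-line
+# entries untouched (a warning is printed instead).
+
+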
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Validate and auto-fix 'mcq' tags for MCQ benchmarks"
+    )
+    parser.add_argument(
+        "--fix",
+        action="store_true",
+        help="Automatically fix config.py by adding/removing tags",
+    )
+    parser.add_argument(
+        "--alpha-only",
+        action="store_true",
+        help="Only check alpha/experimental benchmarks",
+    )
+    parser.add_argument(
+        "--no-alpha",
+        action="store_true",
+        help="Exclude alpha/experimental benchmarks",
+    )
+
+    args = parser.parse_args()
+
+    # Determine alpha inclusion
+    if args.alpha_only and args.no_alpha:
+        print("❌ Cannot use both --alpha-only and --no-alpha")
+        sys.exit(1)
+
+    include_alpha = not args.no_alpha
+
+    print("=" * 70)
+    print("MCQ Tag Validation")
+    print("=" * 70)
+    print(f"Checking: {'All benchmarks' if include_alpha else 'Non-alpha benchmarks only'}")
+    print()
+
+    try:
+        missing_tag, incorrect_tag = validate_mcq_tags(include_alpha=include_alpha)
+
+        # Filter by alpha-only if requested
+        if args.alpha_only:
+            all_benchmarks = get_all_benchmarks(include_alpha=True)
+            missing_tag = [b for b in missing_tag if all_benchmarks[b].is_alpha]
+            incorrect_tag = [b for b in incorrect_tag if all_benchmarks[b].is_alpha]
+
+        # Report results
+        print("\n" + "=" * 70)
+        print("Results")
+        print("=" * 70)
+
+        if missing_tag:
+            print(f"\n❌ Missing 'mcq' tag ({len(missing_tag)} benchmarks):")
+            for benchmark in sorted(missing_tag)[:10]:
+                print(f"   • {benchmark}")
+            if len(missing_tag) > 10:
+                print(f"   ... and {len(missing_tag) - 10} more")
+
+        if incorrect_tag:
+            print(
+                f"\n❌ Incorrect 'mcq' tag ({len(incorrect_tag)} benchmarks):"
+            )
+            print("   (These are not MCQ benchmarks but have the tag)")
+            for benchmark in sorted(incorrect_tag)[:10]:
+                print(f"   • {benchmark}")
+            if len(incorrect_tag) > 10:
+                print(f"   ... and {len(incorrect_tag) - 10} more")
+
+        if not missing_tag and not incorrect_tag:
+            print("\n✅ All MCQ tags are correct!")
+            sys.exit(0)
+
+        # Suggest fix
+        if not args.fix:
+            print("\n" + "-" * 70)
+            print("To automatically fix these issues, run:")
+            print(f"  python {Path(__file__).name} --fix")
+            print("-" * 70)
+            sys.exit(1)
+
+        # Apply fixes
+        print("\n" + "=" * 70)
+        print("Applying Fixes")
+        print("=" * 70)
+        fix_config_file(missing_tag, incorrect_tag)
+
+        # Verify fixes
+        print("\n" + "=" * 70)
+        print("Verifying Fixes")
+        print("=" * 70)
+
+        # Reload the config module so the rewritten tags are re-read. A bare
+        # reload is not enough on its own: the "from ... import" binding at the
+        # top of this file still points at the pre-reload function, so rebind
+        # it explicitly or the re-validation below would see stale data.
+        import importlib
+        import openbench.config
+        importlib.reload(openbench.config)
+        globals()["get_all_benchmarks"] = openbench.config.get_all_benchmarks
+
+        new_missing, new_incorrect = validate_mcq_tags(include_alpha=include_alpha)
+
+        if args.alpha_only:
+            all_benchmarks = get_all_benchmarks(include_alpha=True)
+            new_missing = [b for b in new_missing if all_benchmarks[b].is_alpha]
+            new_incorrect = [b for b in new_incorrect if all_benchmarks[b].is_alpha]
+
+        if new_missing or new_incorrect:
+            print("⚠️  Some issues remain after fixes:")
+            if new_missing:
+                print(f"   Still missing: {len(new_missing)}")
+            if new_incorrect:
+                print(f"   Still incorrect: {len(new_incorrect)}")
+            print("\nYou may need to manually review config.py")
+            sys.exit(1)
+        else:
+            print("✅ All issues fixed successfully!")
+            sys.exit(0)
+
+    except Exception as e:
+        print(f"\n❌ Error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
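+
+
+# Illustrative programmatic use (a sketch; this file is normally run as a CLI
+# script rather than imported):
+#
+#     missing, incorrect = validate_mcq_tags(include_alpha=False)
+#     assert not (missing or incorrect), "MCQ tags out of sync with config.py"
+
+if __name__ == "__main__":
+    main()
diff --git a/src/openbench/config.py b/src/openbench/config.py index c5c69550..56dd84bb 100644 --- a/src/openbench/config.py +++ b/src/openbench/config.py @@ -148,7 +148,7 @@ class EvalGroup: name="MMLU (cais/mmlu)", description="Massive Multitask Language Understanding - 57 academic subjects from the cais/mmlu dataset.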
Only supports English (EN-US).", category="core", - tags=["multiple-choice", "knowledge", "reasoning", "multitask"], + tags=["mcq", "knowledge", "reasoning", "multitask"], module_path="openbench.evals.mmlu", function_name="mmlu", ), @@ -156,7 +156,7 @@ class EvalGroup: name="MMLU Pro (TIGER-Lab)", description="Enhanced version of MMLU with more challenging, reasoning-focused questions.", category="core", - tags=["multiple-choice", "knowledge", "reasoning", "multitask"], + tags=["mcq", "knowledge", "reasoning", "multitask"], module_path="openbench.evals.mmlu_pro", function_name="mmlu_pro", ), @@ -164,7 +164,7 @@ class EvalGroup: name="MMLU-Redux", description="Manually re-annotated subset of 5,700 MMLU questions addressing annotation errors in the original dataset.", category="core", - tags=["multiple-choice", "knowledge", "reasoning", "multitask"], + tags=["mcq", "knowledge", "reasoning", "multitask"], module_path="openbench.evals.mmlu_redux", function_name="mmlu_redux", ), @@ -172,7 +172,7 @@ class EvalGroup: name="MMMLU (openai/MMMLU)", description="MMLU translated to 15 languages.", category="core", - tags=["multiple-choice", "knowledge", "reasoning", "multitask"], + tags=["mcq", "knowledge", "reasoning", "multitask"], module_path="openbench.evals.mmmlu", function_name="mmmlu", ), @@ -181,7 +181,7 @@ class EvalGroup: name="MMMLU (Arabic)", description="MMLU in Arabic (AR_XY)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_ar_xy", subtask=True, @@ -190,7 +190,7 @@ class EvalGroup: name="MMMLU (Bengali)", description="MMLU in Bengali (BN_BD)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_bn_bd", subtask=True, @@ -199,7 +199,7 @@ class EvalGroup: name="MMMLU (German)", description="MMLU in German (DE_DE)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_de_de", subtask=True, @@ -208,7 +208,7 @@ class EvalGroup: name="MMMLU (Spanish)", description="MMLU in Spanish Latin America (ES_LA)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_es_la", subtask=True, @@ -217,7 +217,7 @@ class EvalGroup: name="MMMLU (French)", description="MMLU in French (FR_FR)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_fr_fr", subtask=True, @@ -226,7 +226,7 @@ class EvalGroup: name="MMMLU (Hindi)", description="MMLU in Hindi (HI_IN)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_hi_in", subtask=True, @@ -235,7 +235,7 @@ class EvalGroup: name="MMMLU (Indonesian)", description="MMLU in Indonesian (ID_ID)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_id_id", subtask=True, @@ -244,7 +244,7 @@ 
class EvalGroup: name="MMMLU (Italian)", description="MMLU in Italian (IT_IT)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_it_it", subtask=True, @@ -253,7 +253,7 @@ class EvalGroup: name="MMMLU (Japanese)", description="MMLU in Japanese (JA_JP)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_ja_jp", subtask=True, @@ -262,7 +262,7 @@ class EvalGroup: name="MMMLU (Korean)", description="MMLU in Korean (KO_KR)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_ko_kr", subtask=True, @@ -271,7 +271,7 @@ class EvalGroup: name="MMMLU (Portuguese)", description="MMLU in Portuguese Brazil (PT_BR)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_pt_br", subtask=True, @@ -280,7 +280,7 @@ class EvalGroup: name="MMMLU (Chinese)", description="MMLU in Chinese (ZH_CN)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_zh_cn", subtask=True, @@ -289,7 +289,7 @@ class EvalGroup: name="MMMLU (Swahili)", description="MMLU in Swahili (SW_KE)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_sw_ke", subtask=True, @@ -298,7 +298,7 @@ class EvalGroup: name="MMMLU (Yoruba)", description="MMLU in Yoruba (YO_NG)", category="core", - tags=["multiple-choice", "knowledge", "multilingual", "mmmlu"], + tags=["mcq", "knowledge", "multilingual", "mmmlu"], module_path="openbench.evals.mmmlu", function_name="mmmlu_yo_ng", subtask=True, @@ -342,7 +342,7 @@ class EvalGroup: name="GPQA Diamond", description="Graduate-level Google-Proof Q&A in biology, chemistry, and physics", category="core", - tags=["multiple-choice", "science", "graduate-level"], + tags=["mcq", "science", "graduate-level"], module_path="openbench.evals.gpqa_diamond", function_name="gpqa_diamond", ), @@ -350,7 +350,7 @@ class EvalGroup: name="GPQA", description="Graduate-level science questions (multiple choice) across physics, chemistry, and biology", category="core", - tags=["multiple-choice", "science", "graduate-level", "reasoning"], + tags=["mcq", "science", "graduate-level", "reasoning"], module_path="openbench.evals.gpqa", function_name="gpqa", ), @@ -444,7 +444,7 @@ class EvalGroup: name="OpenBookQA", description="Elementary-level science questions probing understanding of core facts", category="core", - tags=["multiple-choice", "science", "elementary", "open-book"], + tags=["mcq", "science", "elementary", "open-book"], module_path="openbench.evals.openbookqa", function_name="openbookqa", ), @@ -452,7 +452,7 @@ class EvalGroup: name="MuSR", description="Testing the Limits of Chain-of-thought with Multistep Soft Reasoning - includes murder mysteries, object placements, and team allocation tasks", category="core", - tags=["multiple-choice", "reasoning", "commonsense", 
"chain-of-thought"], + tags=["mcq", "reasoning", "commonsense", "chain-of-thought"], module_path="openbench.evals.musr", function_name="musr", ), @@ -461,7 +461,7 @@ class EvalGroup: description="MuSR murder mystery scenarios - who is the most likely murderer?", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "commonsense", "chain-of-thought", @@ -476,7 +476,7 @@ class EvalGroup: description="MuSR object placement reasoning - where would someone look for an object?", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "commonsense", "chain-of-thought", @@ -491,7 +491,7 @@ class EvalGroup: description="MuSR team allocation problems - how to allocate people to tasks efficiently?", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "commonsense", "chain-of-thought", @@ -513,7 +513,7 @@ class EvalGroup: name="SuperGPQA", description="Scaling LLM Evaluation across 285 Graduate Disciplines - 26,529 multiple-choice questions across science, engineering, medicine, economics, and philosophy", category="core", - tags=["multiple-choice", "knowledge", "graduate-level", "multidisciplinary"], + tags=["mcq", "knowledge", "graduate-level", "multidisciplinary"], module_path="openbench.evals.supergpqa", function_name="supergpqa", ), @@ -633,7 +633,7 @@ class EvalGroup: name="TUMLU", description="TUMLU is a comprehensive, multilingual, and natively developed language understanding benchmark specifically designed for Turkic languages.", category="community", - tags=["factuality", "question-answering", "multiple-choice", "reasoning"], + tags=["factuality", "question-answering", "reasoning"], module_path="openbench.evals.tumlu", function_name="tumlu", ), @@ -1036,7 +1036,7 @@ class EvalGroup: name="Global-MMLU (42 Languages)", description="Culturally adapted multilingual MMLU with 42 languages", category="core", - tags=["multiple-choice", "multilingual", "cultural-sensitivity", "mmlu"], + tags=["mcq", "multilingual", "cultural-sensitivity", "mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu", ), @@ -1045,7 +1045,7 @@ class EvalGroup: name="Global-MMLU: Amharic", description="Global-MMLU culturally adapted MMLU for Amharic (am)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_amharic", subtask=True, @@ -1054,7 +1054,7 @@ class EvalGroup: name="Global-MMLU: Arabic", description="Global-MMLU culturally adapted MMLU for Arabic (ar)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_arabic", subtask=True, @@ -1063,7 +1063,7 @@ class EvalGroup: name="Global-MMLU: Bengali", description="Global-MMLU culturally adapted MMLU for Bengali (bn)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_bengali", subtask=True, @@ -1072,7 +1072,7 @@ class EvalGroup: name="Global-MMLU: Czech", description="Global-MMLU culturally adapted MMLU for Czech (cs)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", 
"multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_czech", subtask=True, @@ -1081,7 +1081,7 @@ class EvalGroup: name="Global-MMLU: German", description="Global-MMLU culturally adapted MMLU for German (de)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_german", subtask=True, @@ -1090,7 +1090,7 @@ class EvalGroup: name="Global-MMLU: Greek", description="Global-MMLU culturally adapted MMLU for Greek (el)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_greek", subtask=True, @@ -1099,7 +1099,7 @@ class EvalGroup: name="Global-MMLU: English", description="Global-MMLU culturally adapted MMLU for English (en)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_english", subtask=True, @@ -1108,7 +1108,7 @@ class EvalGroup: name="Global-MMLU: Spanish", description="Global-MMLU culturally adapted MMLU for Spanish (es)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_spanish", subtask=True, @@ -1117,7 +1117,7 @@ class EvalGroup: name="Global-MMLU: Persian", description="Global-MMLU culturally adapted MMLU for Persian (fa)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_persian", subtask=True, @@ -1126,7 +1126,7 @@ class EvalGroup: name="Global-MMLU: Filipino", description="Global-MMLU culturally adapted MMLU for Filipino (fil)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_filipino", subtask=True, @@ -1135,7 +1135,7 @@ class EvalGroup: name="Global-MMLU: French", description="Global-MMLU culturally adapted MMLU for French (fr)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_french", subtask=True, @@ -1144,7 +1144,7 @@ class EvalGroup: name="Global-MMLU: Hausa", description="Global-MMLU culturally adapted MMLU for Hausa (ha)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_hausa", subtask=True, @@ -1153,7 +1153,7 @@ class EvalGroup: name="Global-MMLU: Hebrew", description="Global-MMLU culturally adapted MMLU for Hebrew (he)", category="global-mmlu", - 
tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_hebrew", subtask=True, @@ -1162,7 +1162,7 @@ class EvalGroup: name="Global-MMLU: Hindi", description="Global-MMLU culturally adapted MMLU for Hindi (hi)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_hindi", subtask=True, @@ -1171,7 +1171,7 @@ class EvalGroup: name="Global-MMLU: Indonesian", description="Global-MMLU culturally adapted MMLU for Indonesian (id)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_indonesian", subtask=True, @@ -1180,7 +1180,7 @@ class EvalGroup: name="Global-MMLU: Igbo", description="Global-MMLU culturally adapted MMLU for Igbo (ig)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_igbo", subtask=True, @@ -1189,7 +1189,7 @@ class EvalGroup: name="Global-MMLU: Italian", description="Global-MMLU culturally adapted MMLU for Italian (it)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_italian", subtask=True, @@ -1198,7 +1198,7 @@ class EvalGroup: name="Global-MMLU: Japanese", description="Global-MMLU culturally adapted MMLU for Japanese (ja)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_japanese", subtask=True, @@ -1207,7 +1207,7 @@ class EvalGroup: name="Global-MMLU: Korean", description="Global-MMLU culturally adapted MMLU for Korean (ko)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_korean", subtask=True, @@ -1216,7 +1216,7 @@ class EvalGroup: name="Global-MMLU: Kyrgyz", description="Global-MMLU culturally adapted MMLU for Kyrgyz (ky)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_kyrgyz", subtask=True, @@ -1225,7 +1225,7 @@ class EvalGroup: name="Global-MMLU: Lithuanian", description="Global-MMLU culturally adapted MMLU for Lithuanian (lt)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_lithuanian", subtask=True, @@ -1234,7 +1234,7 @@ class EvalGroup: name="Global-MMLU: 
Malagasy", description="Global-MMLU culturally adapted MMLU for Malagasy (mg)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_malagasy", subtask=True, @@ -1243,7 +1243,7 @@ class EvalGroup: name="Global-MMLU: Malay", description="Global-MMLU culturally adapted MMLU for Malay (ms)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_malay", subtask=True, @@ -1252,7 +1252,7 @@ class EvalGroup: name="Global-MMLU: Nepali", description="Global-MMLU culturally adapted MMLU for Nepali (ne)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_nepali", subtask=True, @@ -1261,7 +1261,7 @@ class EvalGroup: name="Global-MMLU: Dutch", description="Global-MMLU culturally adapted MMLU for Dutch (nl)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_dutch", subtask=True, @@ -1270,7 +1270,7 @@ class EvalGroup: name="Global-MMLU: Chichewa", description="Global-MMLU culturally adapted MMLU for Chichewa (ny)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_chichewa", subtask=True, @@ -1279,7 +1279,7 @@ class EvalGroup: name="Global-MMLU: Polish", description="Global-MMLU culturally adapted MMLU for Polish (pl)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_polish", subtask=True, @@ -1288,7 +1288,7 @@ class EvalGroup: name="Global-MMLU: Portuguese", description="Global-MMLU culturally adapted MMLU for Portuguese (pt)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_portuguese", subtask=True, @@ -1297,7 +1297,7 @@ class EvalGroup: name="Global-MMLU: Romanian", description="Global-MMLU culturally adapted MMLU for Romanian (ro)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_romanian", subtask=True, @@ -1306,7 +1306,7 @@ class EvalGroup: name="Global-MMLU: Russian", description="Global-MMLU culturally adapted MMLU for Russian (ru)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", 
function_name="global_mmlu_russian", subtask=True, @@ -1315,7 +1315,7 @@ class EvalGroup: name="Global-MMLU: Sinhala", description="Global-MMLU culturally adapted MMLU for Sinhala (si)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_sinhala", subtask=True, @@ -1324,7 +1324,7 @@ class EvalGroup: name="Global-MMLU: Shona", description="Global-MMLU culturally adapted MMLU for Shona (sn)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_shona", subtask=True, @@ -1333,7 +1333,7 @@ class EvalGroup: name="Global-MMLU: Somali", description="Global-MMLU culturally adapted MMLU for Somali (so)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_somali", subtask=True, @@ -1342,7 +1342,7 @@ class EvalGroup: name="Global-MMLU: Serbian", description="Global-MMLU culturally adapted MMLU for Serbian (sr)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_serbian", subtask=True, @@ -1351,7 +1351,7 @@ class EvalGroup: name="Global-MMLU: Swedish", description="Global-MMLU culturally adapted MMLU for Swedish (sv)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_swedish", subtask=True, @@ -1360,7 +1360,7 @@ class EvalGroup: name="Global-MMLU: Swahili", description="Global-MMLU culturally adapted MMLU for Swahili (sw)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_swahili", subtask=True, @@ -1369,7 +1369,7 @@ class EvalGroup: name="Global-MMLU: Telugu", description="Global-MMLU culturally adapted MMLU for Telugu (te)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_telugu", subtask=True, @@ -1378,7 +1378,7 @@ class EvalGroup: name="Global-MMLU: Turkish", description="Global-MMLU culturally adapted MMLU for Turkish (tr)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_turkish", subtask=True, @@ -1387,7 +1387,7 @@ class EvalGroup: name="Global-MMLU: Ukrainian", description="Global-MMLU culturally adapted MMLU for Ukrainian (uk)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", 
"multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_ukrainian", subtask=True, @@ -1396,7 +1396,7 @@ class EvalGroup: name="Global-MMLU: Vietnamese", description="Global-MMLU culturally adapted MMLU for Vietnamese (vi)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_vietnamese", subtask=True, @@ -1405,7 +1405,7 @@ class EvalGroup: name="Global-MMLU: Yoruba", description="Global-MMLU culturally adapted MMLU for Yoruba (yo)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_yoruba", subtask=True, @@ -1414,7 +1414,7 @@ class EvalGroup: name="Global-MMLU: Chinese", description="Global-MMLU culturally adapted MMLU for Chinese (zh)", category="global-mmlu", - tags=["multiple-choice", "multilingual", "cultural-adaptation", "global-mmlu"], + tags=["mcq", "multilingual", "cultural-adaptation", "global-mmlu"], module_path="openbench.evals.global_mmlu", function_name="global_mmlu_chinese", subtask=True, @@ -1425,7 +1425,7 @@ class EvalGroup: name="BigBench: Anachronisms", description="BigBench MCQ task: anachronisms", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_anachronisms", subtask=True, @@ -1434,7 +1434,7 @@ class EvalGroup: name="BigBench: Analogical Similarity", description="BigBench MCQ task: analogical_similarity", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_analogical_similarity", subtask=True, @@ -1443,7 +1443,7 @@ class EvalGroup: name="BigBench: Analytic Entailment", description="BigBench MCQ task: analytic_entailment", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_analytic_entailment", subtask=True, @@ -1452,7 +1452,7 @@ class EvalGroup: name="BigBench: Arithmetic", description="BigBench MCQ task: arithmetic", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_arithmetic", subtask=True, @@ -1461,7 +1461,7 @@ class EvalGroup: name="BigBench: Authorship Verification", description="BigBench MCQ task: authorship_verification", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_authorship_verification", subtask=True, @@ -1470,7 +1470,7 @@ class EvalGroup: name="BigBench: Bbq Lite Json", description="BigBench MCQ task: bbq_lite_json", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_bbq_lite_json", subtask=True, @@ -1479,7 +1479,7 @@ class EvalGroup: name="BigBench: Causal Judgment", description="BigBench MCQ task: causal_judgment", category="bigbench", - 
tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_causal_judgment", subtask=True, @@ -1488,7 +1488,7 @@ class EvalGroup: name="BigBench: Cause And Effect", description="BigBench MCQ task: cause_and_effect", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_cause_and_effect", subtask=True, @@ -1497,7 +1497,7 @@ class EvalGroup: name="BigBench: Checkmate In One", description="BigBench MCQ task: checkmate_in_one", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_checkmate_in_one", subtask=True, @@ -1506,7 +1506,7 @@ class EvalGroup: name="BigBench: Cifar10 Classification", description="BigBench MCQ task: cifar10_classification", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_cifar10_classification", subtask=True, @@ -1515,7 +1515,7 @@ class EvalGroup: name="BigBench: Code Line Description", description="BigBench MCQ task: code_line_description", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_code_line_description", subtask=True, @@ -1524,7 +1524,7 @@ class EvalGroup: name="BigBench: Color", description="BigBench MCQ task: color", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_color", subtask=True, @@ -1533,7 +1533,7 @@ class EvalGroup: name="BigBench: Common Morpheme", description="BigBench MCQ task: common_morpheme", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_common_morpheme", subtask=True, @@ -1542,7 +1542,7 @@ class EvalGroup: name="BigBench: Conceptual Combinations", description="BigBench MCQ task: conceptual_combinations", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_conceptual_combinations", subtask=True, @@ -1551,7 +1551,7 @@ class EvalGroup: name="BigBench: Contextual Parametric Knowledge Conflicts", description="BigBench MCQ task: contextual_parametric_knowledge_conflicts", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_contextual_parametric_knowledge_conflicts", subtask=True, @@ -1560,7 +1560,7 @@ class EvalGroup: name="BigBench: Crash Blossom", description="BigBench MCQ task: crash_blossom", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_crash_blossom", subtask=True, @@ -1569,7 +1569,7 @@ class EvalGroup: name="BigBench: Crass Ai", description="BigBench MCQ task: crass_ai", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], 
module_path="openbench.evals.bigbench", function_name="bigbench_crass_ai", subtask=True, @@ -1578,7 +1578,7 @@ class EvalGroup: name="BigBench: Cryobiology Spanish", description="BigBench MCQ task: cryobiology_spanish", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_cryobiology_spanish", subtask=True, @@ -1587,7 +1587,7 @@ class EvalGroup: name="BigBench: Cs Algorithms", description="BigBench MCQ task: cs_algorithms", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_cs_algorithms", subtask=True, @@ -1596,7 +1596,7 @@ class EvalGroup: name="BigBench: Dark Humor Detection", description="BigBench MCQ task: dark_humor_detection", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_dark_humor_detection", subtask=True, @@ -1605,7 +1605,7 @@ class EvalGroup: name="BigBench: Date Understanding", description="BigBench MCQ task: date_understanding", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_date_understanding", subtask=True, @@ -1614,7 +1614,7 @@ class EvalGroup: name="BigBench: Disambiguation Qa", description="BigBench MCQ task: disambiguation_qa", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_disambiguation_qa", subtask=True, @@ -1623,7 +1623,7 @@ class EvalGroup: name="BigBench: Discourse Marker Prediction", description="BigBench MCQ task: discourse_marker_prediction", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_discourse_marker_prediction", subtask=True, @@ -1632,7 +1632,7 @@ class EvalGroup: name="BigBench: Dyck Languages", description="BigBench MCQ task: dyck_languages", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_dyck_languages", subtask=True, @@ -1641,7 +1641,7 @@ class EvalGroup: name="BigBench: Elementary Math Qa", description="BigBench MCQ task: elementary_math_qa", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_elementary_math_qa", subtask=True, @@ -1650,7 +1650,7 @@ class EvalGroup: name="BigBench: Emoji Movie", description="BigBench MCQ task: emoji_movie", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_emoji_movie", subtask=True, @@ -1659,7 +1659,7 @@ class EvalGroup: name="BigBench: Emojis Emotion Prediction", description="BigBench MCQ task: emojis_emotion_prediction", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_emojis_emotion_prediction", subtask=True, @@ -1668,7 +1668,7 @@ class 
EvalGroup: name="BigBench: Empirical Judgments", description="BigBench MCQ task: empirical_judgments", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_empirical_judgments", subtask=True, @@ -1677,7 +1677,7 @@ class EvalGroup: name="BigBench: English Proverbs", description="BigBench MCQ task: english_proverbs", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_english_proverbs", subtask=True, @@ -1686,7 +1686,7 @@ class EvalGroup: name="BigBench: English Russian Proverbs", description="BigBench MCQ task: english_russian_proverbs", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_english_russian_proverbs", subtask=True, @@ -1695,7 +1695,7 @@ class EvalGroup: name="BigBench: Entailed Polarity", description="BigBench MCQ task: entailed_polarity", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_entailed_polarity", subtask=True, @@ -1704,7 +1704,7 @@ class EvalGroup: name="BigBench: Entailed Polarity Hindi", description="BigBench MCQ task: entailed_polarity_hindi", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_entailed_polarity_hindi", subtask=True, @@ -1713,7 +1713,7 @@ class EvalGroup: name="BigBench: Epistemic Reasoning", description="BigBench MCQ task: epistemic_reasoning", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_epistemic_reasoning", subtask=True, @@ -1722,7 +1722,7 @@ class EvalGroup: name="BigBench: Evaluating Information Essentiality", description="BigBench MCQ task: evaluating_information_essentiality", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_evaluating_information_essentiality", subtask=True, @@ -1731,7 +1731,7 @@ class EvalGroup: name="BigBench: Fact Checker", description="BigBench MCQ task: fact_checker", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_fact_checker", subtask=True, @@ -1740,7 +1740,7 @@ class EvalGroup: name="BigBench: Fantasy Reasoning", description="BigBench MCQ task: fantasy_reasoning", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_fantasy_reasoning", subtask=True, @@ -1749,7 +1749,7 @@ class EvalGroup: name="BigBench: Figure Of Speech Detection", description="BigBench MCQ task: figure_of_speech_detection", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_figure_of_speech_detection", subtask=True, @@ -1758,7 +1758,7 @@ class EvalGroup: name="BigBench: Formal 
Fallacies Syllogisms Negation", description="BigBench MCQ task: formal_fallacies_syllogisms_negation", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_formal_fallacies_syllogisms_negation", subtask=True, @@ -1767,7 +1767,7 @@ class EvalGroup: name="BigBench: General Knowledge", description="BigBench MCQ task: general_knowledge", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_general_knowledge", subtask=True, @@ -1776,7 +1776,7 @@ class EvalGroup: name="BigBench: Geometric Shapes", description="BigBench MCQ task: geometric_shapes", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_geometric_shapes", subtask=True, @@ -1785,7 +1785,7 @@ class EvalGroup: name="BigBench: Goal Step Wikihow", description="BigBench MCQ task: goal_step_wikihow", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_goal_step_wikihow", subtask=True, @@ -1794,7 +1794,7 @@ class EvalGroup: name="BigBench: Gre Reading Comprehension", description="BigBench MCQ task: gre_reading_comprehension", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_gre_reading_comprehension", subtask=True, @@ -1803,7 +1803,7 @@ class EvalGroup: name="BigBench: Hhh Alignment", description="BigBench MCQ task: hhh_alignment", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_hhh_alignment", subtask=True, @@ -1812,7 +1812,7 @@ class EvalGroup: name="BigBench: Hindu Knowledge", description="BigBench MCQ task: hindu_knowledge", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_hindu_knowledge", subtask=True, @@ -1821,7 +1821,7 @@ class EvalGroup: name="BigBench: Hinglish Toxicity", description="BigBench MCQ task: hinglish_toxicity", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_hinglish_toxicity", subtask=True, @@ -1830,7 +1830,7 @@ class EvalGroup: name="BigBench: Human Organs Senses", description="BigBench MCQ task: human_organs_senses", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_human_organs_senses", subtask=True, @@ -1839,7 +1839,7 @@ class EvalGroup: name="BigBench: Hyperbaton", description="BigBench MCQ task: hyperbaton", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_hyperbaton", subtask=True, @@ -1848,7 +1848,7 @@ class EvalGroup: name="BigBench: Identify Math Theorems", description="BigBench MCQ task: identify_math_theorems", category="bigbench", - 
tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_identify_math_theorems", subtask=True, @@ -1857,7 +1857,7 @@ class EvalGroup: name="BigBench: Identify Odd Metaphor", description="BigBench MCQ task: identify_odd_metaphor", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_identify_odd_metaphor", subtask=True, @@ -1866,7 +1866,7 @@ class EvalGroup: name="BigBench: Implicatures", description="BigBench MCQ task: implicatures", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_implicatures", subtask=True, @@ -1875,7 +1875,7 @@ class EvalGroup: name="BigBench: Implicit Relations", description="BigBench MCQ task: implicit_relations", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_implicit_relations", subtask=True, @@ -1884,7 +1884,7 @@ class EvalGroup: name="BigBench: Indic Cause And Effect", description="BigBench MCQ task: indic_cause_and_effect", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_indic_cause_and_effect", subtask=True, @@ -1893,7 +1893,7 @@ class EvalGroup: name="BigBench: Intent Recognition", description="BigBench MCQ task: intent_recognition", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_intent_recognition", subtask=True, @@ -1902,7 +1902,7 @@ class EvalGroup: name="BigBench: International Phonetic Alphabet Nli", description="BigBench MCQ task: international_phonetic_alphabet_nli", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_international_phonetic_alphabet_nli", subtask=True, @@ -1911,7 +1911,7 @@ class EvalGroup: name="BigBench: Intersect Geometry", description="BigBench MCQ task: intersect_geometry", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_intersect_geometry", subtask=True, @@ -1920,7 +1920,7 @@ class EvalGroup: name="BigBench: Irony Identification", description="BigBench MCQ task: irony_identification", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_irony_identification", subtask=True, @@ -1929,7 +1929,7 @@ class EvalGroup: name="BigBench: Kanji Ascii", description="BigBench MCQ task: kanji_ascii", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_kanji_ascii", subtask=True, @@ -1938,7 +1938,7 @@ class EvalGroup: name="BigBench: Kannada", description="BigBench MCQ task: kannada", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], 
module_path="openbench.evals.bigbench", function_name="bigbench_kannada", subtask=True, @@ -1947,7 +1947,7 @@ class EvalGroup: name="BigBench: Key Value Maps", description="BigBench MCQ task: key_value_maps", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_key_value_maps", subtask=True, @@ -1956,7 +1956,7 @@ class EvalGroup: name="BigBench: Known Unknowns", description="BigBench MCQ task: known_unknowns", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_known_unknowns", subtask=True, @@ -1965,7 +1965,7 @@ class EvalGroup: name="BigBench: Language Identification", description="BigBench MCQ task: language_identification", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_language_identification", subtask=True, @@ -1974,7 +1974,7 @@ class EvalGroup: name="BigBench: Logic Grid Puzzle", description="BigBench MCQ task: logic_grid_puzzle", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_logic_grid_puzzle", subtask=True, @@ -1983,7 +1983,7 @@ class EvalGroup: name="BigBench: Logical Args", description="BigBench MCQ task: logical_args", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_logical_args", subtask=True, @@ -1992,7 +1992,7 @@ class EvalGroup: name="BigBench: Logical Deduction", description="BigBench MCQ task: logical_deduction", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_logical_deduction", subtask=True, @@ -2001,7 +2001,7 @@ class EvalGroup: name="BigBench: Logical Fallacy Detection", description="BigBench MCQ task: logical_fallacy_detection", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_logical_fallacy_detection", subtask=True, @@ -2010,7 +2010,7 @@ class EvalGroup: name="BigBench: Logical Sequence", description="BigBench MCQ task: logical_sequence", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_logical_sequence", subtask=True, @@ -2019,7 +2019,7 @@ class EvalGroup: name="BigBench: Mathematical Induction", description="BigBench MCQ task: mathematical_induction", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_mathematical_induction", subtask=True, @@ -2028,7 +2028,7 @@ class EvalGroup: name="BigBench: Medical Questions Russian", description="BigBench MCQ task: medical_questions_russian", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_medical_questions_russian", subtask=True, @@ -2037,7 +2037,7 @@ 
class EvalGroup: name="BigBench: Metaphor Boolean", description="BigBench MCQ task: metaphor_boolean", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_metaphor_boolean", subtask=True, @@ -2046,7 +2046,7 @@ class EvalGroup: name="BigBench: Metaphor Understanding", description="BigBench MCQ task: metaphor_understanding", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_metaphor_understanding", subtask=True, @@ -2055,7 +2055,7 @@ class EvalGroup: name="BigBench: Minute Mysteries Qa", description="BigBench MCQ task: minute_mysteries_qa", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_minute_mysteries_qa", subtask=True, @@ -2064,7 +2064,7 @@ class EvalGroup: name="BigBench: Misconceptions", description="BigBench MCQ task: misconceptions", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_misconceptions", subtask=True, @@ -2073,7 +2073,7 @@ class EvalGroup: name="BigBench: Misconceptions Russian", description="BigBench MCQ task: misconceptions_russian", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_misconceptions_russian", subtask=True, @@ -2082,7 +2082,7 @@ class EvalGroup: name="BigBench: Mnist Ascii", description="BigBench MCQ task: mnist_ascii", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_mnist_ascii", subtask=True, @@ -2091,7 +2091,7 @@ class EvalGroup: name="BigBench: Moral Permissibility", description="BigBench MCQ task: moral_permissibility", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_moral_permissibility", subtask=True, @@ -2100,7 +2100,7 @@ class EvalGroup: name="BigBench: Movie Dialog Same Or Different", description="BigBench MCQ task: movie_dialog_same_or_different", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_movie_dialog_same_or_different", subtask=True, @@ -2109,7 +2109,7 @@ class EvalGroup: name="BigBench: Movie Recommendation", description="BigBench MCQ task: movie_recommendation", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_movie_recommendation", subtask=True, @@ -2118,7 +2118,7 @@ class EvalGroup: name="BigBench: Navigate", description="BigBench MCQ task: navigate", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_navigate", subtask=True, @@ -2127,7 +2127,7 @@ class EvalGroup: name="BigBench: Nonsense Words Grammar", description="BigBench MCQ task: nonsense_words_grammar", 
category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_nonsense_words_grammar", subtask=True, @@ -2136,7 +2136,7 @@ class EvalGroup: name="BigBench: Novel Concepts", description="BigBench MCQ task: novel_concepts", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_novel_concepts", subtask=True, @@ -2145,7 +2145,7 @@ class EvalGroup: name="BigBench: Odd One Out", description="BigBench MCQ task: odd_one_out", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_odd_one_out", subtask=True, @@ -2154,7 +2154,7 @@ class EvalGroup: name="BigBench: Parsinlu Qa", description="BigBench MCQ task: parsinlu_qa", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_parsinlu_qa", subtask=True, @@ -2163,7 +2163,7 @@ class EvalGroup: name="BigBench: Penguins In A Table", description="BigBench MCQ task: penguins_in_a_table", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_penguins_in_a_table", subtask=True, @@ -2172,7 +2172,7 @@ class EvalGroup: name="BigBench: Periodic Elements", description="BigBench MCQ task: periodic_elements", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_periodic_elements", subtask=True, @@ -2181,7 +2181,7 @@ class EvalGroup: name="BigBench: Persian Idioms", description="BigBench MCQ task: persian_idioms", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_persian_idioms", subtask=True, @@ -2190,7 +2190,7 @@ class EvalGroup: name="BigBench: Phrase Relatedness", description="BigBench MCQ task: phrase_relatedness", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_phrase_relatedness", subtask=True, @@ -2199,7 +2199,7 @@ class EvalGroup: name="BigBench: Physical Intuition", description="BigBench MCQ task: physical_intuition", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_physical_intuition", subtask=True, @@ -2208,7 +2208,7 @@ class EvalGroup: name="BigBench: Physics", description="BigBench MCQ task: physics", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_physics", subtask=True, @@ -2217,7 +2217,7 @@ class EvalGroup: name="BigBench: Play Dialog Same Or Different", description="BigBench MCQ task: play_dialog_same_or_different", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", 
function_name="bigbench_play_dialog_same_or_different", subtask=True, @@ -2226,7 +2226,7 @@ class EvalGroup: name="BigBench: Presuppositions As Nli", description="BigBench MCQ task: presuppositions_as_nli", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_presuppositions_as_nli", subtask=True, @@ -2235,7 +2235,7 @@ class EvalGroup: name="BigBench: Question Selection", description="BigBench MCQ task: question_selection", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_question_selection", subtask=True, @@ -2244,7 +2244,7 @@ class EvalGroup: name="BigBench: Real Or Fake Text", description="BigBench MCQ task: real_or_fake_text", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_real_or_fake_text", subtask=True, @@ -2253,7 +2253,7 @@ class EvalGroup: name="BigBench: Reasoning About Colored Objects", description="BigBench MCQ task: reasoning_about_colored_objects", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_reasoning_about_colored_objects", subtask=True, @@ -2262,7 +2262,7 @@ class EvalGroup: name="BigBench: Rhyming", description="BigBench MCQ task: rhyming", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_rhyming", subtask=True, @@ -2271,7 +2271,7 @@ class EvalGroup: name="BigBench: Riddle Sense", description="BigBench MCQ task: riddle_sense", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_riddle_sense", subtask=True, @@ -2280,7 +2280,7 @@ class EvalGroup: name="BigBench: Ruin Names", description="BigBench MCQ task: ruin_names", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_ruin_names", subtask=True, @@ -2289,7 +2289,7 @@ class EvalGroup: name="BigBench: Salient Translation Error Detection", description="BigBench MCQ task: salient_translation_error_detection", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_salient_translation_error_detection", subtask=True, @@ -2298,7 +2298,7 @@ class EvalGroup: name="BigBench: Sentence Ambiguity", description="BigBench MCQ task: sentence_ambiguity", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_sentence_ambiguity", subtask=True, @@ -2307,7 +2307,7 @@ class EvalGroup: name="BigBench: Similarities Abstraction", description="BigBench MCQ task: similarities_abstraction", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_similarities_abstraction", subtask=True, @@ -2316,7 
+2316,7 @@ class EvalGroup: name="BigBench: Simple Ethical Questions", description="BigBench MCQ task: simple_ethical_questions", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_simple_ethical_questions", subtask=True, @@ -2325,7 +2325,7 @@ class EvalGroup: name="BigBench: Snarks", description="BigBench MCQ task: snarks", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_snarks", subtask=True, @@ -2334,7 +2334,7 @@ class EvalGroup: name="BigBench: Social Iqa", description="BigBench MCQ task: social_iqa", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_social_iqa", subtask=True, @@ -2343,7 +2343,7 @@ class EvalGroup: name="BigBench: Social Support", description="BigBench MCQ task: social_support", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_social_support", subtask=True, @@ -2352,7 +2352,7 @@ class EvalGroup: name="BigBench: Sports Understanding", description="BigBench MCQ task: sports_understanding", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_sports_understanding", subtask=True, @@ -2361,7 +2361,7 @@ class EvalGroup: name="BigBench: Strange Stories", description="BigBench MCQ task: strange_stories", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_strange_stories", subtask=True, @@ -2370,7 +2370,7 @@ class EvalGroup: name="BigBench: Strategyqa", description="BigBench MCQ task: strategyqa", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_strategyqa", subtask=True, @@ -2379,7 +2379,7 @@ class EvalGroup: name="BigBench: Suicide Risk", description="BigBench MCQ task: suicide_risk", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_suicide_risk", subtask=True, @@ -2388,7 +2388,7 @@ class EvalGroup: name="BigBench: Swahili English Proverbs", description="BigBench MCQ task: swahili_english_proverbs", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_swahili_english_proverbs", subtask=True, @@ -2397,7 +2397,7 @@ class EvalGroup: name="BigBench: Swedish To German Proverbs", description="BigBench MCQ task: swedish_to_german_proverbs", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_swedish_to_german_proverbs", subtask=True, @@ -2406,7 +2406,7 @@ class EvalGroup: name="BigBench: Symbol Interpretation", description="BigBench MCQ task: symbol_interpretation", category="bigbench", - tags=["multiple-choice", 
"reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_symbol_interpretation", subtask=True, @@ -2415,7 +2415,7 @@ class EvalGroup: name="BigBench: Temporal Sequences", description="BigBench MCQ task: temporal_sequences", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_temporal_sequences", subtask=True, @@ -2424,7 +2424,7 @@ class EvalGroup: name="BigBench: Timedial", description="BigBench MCQ task: timedial", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_timedial", subtask=True, @@ -2433,7 +2433,7 @@ class EvalGroup: name="BigBench: Tracking Shuffled Objects", description="BigBench MCQ task: tracking_shuffled_objects", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_tracking_shuffled_objects", subtask=True, @@ -2442,7 +2442,7 @@ class EvalGroup: name="BigBench: Understanding Fables", description="BigBench MCQ task: understanding_fables", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_understanding_fables", subtask=True, @@ -2451,7 +2451,7 @@ class EvalGroup: name="BigBench: Undo Permutation", description="BigBench MCQ task: undo_permutation", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_undo_permutation", subtask=True, @@ -2460,7 +2460,7 @@ class EvalGroup: name="BigBench: Unit Conversion", description="BigBench MCQ task: unit_conversion", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_unit_conversion", subtask=True, @@ -2469,7 +2469,7 @@ class EvalGroup: name="BigBench: Unit Interpretation", description="BigBench MCQ task: unit_interpretation", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_unit_interpretation", subtask=True, @@ -2478,7 +2478,7 @@ class EvalGroup: name="BigBench: Vitaminc Fact Verification", description="BigBench MCQ task: vitaminc_fact_verification", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_vitaminc_fact_verification", subtask=True, @@ -2487,7 +2487,7 @@ class EvalGroup: name="BigBench: What Is The Tao", description="BigBench MCQ task: what_is_the_tao", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_what_is_the_tao", subtask=True, @@ -2496,7 +2496,7 @@ class EvalGroup: name="BigBench: Which Wiki Edit", description="BigBench MCQ task: which_wiki_edit", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", 
function_name="bigbench_which_wiki_edit", subtask=True, @@ -2505,7 +2505,7 @@ class EvalGroup: name="BigBench: Winowhy", description="BigBench MCQ task: winowhy", category="bigbench", - tags=["multiple-choice", "reasoning", "bigbench"], + tags=["mcq", "reasoning", "bigbench"], module_path="openbench.evals.bigbench", function_name="bigbench_winowhy", subtask=True, @@ -2516,7 +2516,7 @@ class EvalGroup: name="BBH: Causal Judgment", description="BigBench Hard - Causal judgment reasoning", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought"], module_path="openbench.evals.bigbench_hard", function_name="bbh_causal_judgment", subtask=True, @@ -2525,7 +2525,7 @@ class EvalGroup: name="BBH: Date Understanding", description="BigBench Hard - Understanding and reasoning about dates", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought"], module_path="openbench.evals.bigbench_hard", function_name="bbh_date_understanding", subtask=True, @@ -2534,7 +2534,7 @@ class EvalGroup: name="BBH: Disambiguation QA", description="BigBench Hard - Pronoun disambiguation in questions", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought"], module_path="openbench.evals.bigbench_hard", function_name="bbh_disambiguation_qa", subtask=True, @@ -2544,7 +2544,7 @@ class EvalGroup: description="BigBench Hard - Reasoning about geometric shapes", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2558,7 +2558,7 @@ class EvalGroup: name="BBH: Logical Deduction (5 Objects)", description="BigBench Hard - Logical deduction with five objects", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought", "logic"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought", "logic"], module_path="openbench.evals.bigbench_hard", function_name="bbh_logical_deduction_five_objects", subtask=True, @@ -2567,7 +2567,7 @@ class EvalGroup: name="BBH: Logical Deduction (7 Objects)", description="BigBench Hard - Logical deduction with seven objects", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought", "logic"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought", "logic"], module_path="openbench.evals.bigbench_hard", function_name="bbh_logical_deduction_seven_objects", subtask=True, @@ -2576,7 +2576,7 @@ class EvalGroup: name="BBH: Logical Deduction (3 Objects)", description="BigBench Hard - Logical deduction with three objects", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought", "logic"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought", "logic"], module_path="openbench.evals.bigbench_hard", function_name="bbh_logical_deduction_three_objects", subtask=True, @@ -2585,7 +2585,7 @@ class EvalGroup: name="BBH: Movie Recommendation", description="BigBench Hard - Movie recommendation reasoning", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought"], module_path="openbench.evals.bigbench_hard", function_name="bbh_movie_recommendation", subtask=True, @@ -2595,7 +2595,7 @@ class EvalGroup: description="BigBench Hard - Spatial navigation reasoning", category="core", tags=[ - "multiple-choice", + "mcq", 
"reasoning", "bigbench", "chain-of-thought", @@ -2609,7 +2609,7 @@ class EvalGroup: name="BBH: Reasoning About Colored Objects", description="BigBench Hard - Reasoning about colored objects", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought"], module_path="openbench.evals.bigbench_hard", function_name="bbh_reasoning_about_colored_objects", subtask=True, @@ -2619,7 +2619,7 @@ class EvalGroup: description="BigBench Hard - Word manipulation and reasoning", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2634,7 +2634,7 @@ class EvalGroup: description="BigBench Hard - Detecting translation errors", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2649,7 +2649,7 @@ class EvalGroup: description="BigBench Hard - Understanding sarcasm and irony", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2663,7 +2663,7 @@ class EvalGroup: name="BBH: Sports Understanding", description="BigBench Hard - Sports knowledge and reasoning", category="core", - tags=["multiple-choice", "reasoning", "bigbench", "chain-of-thought", "sports"], + tags=["mcq", "reasoning", "bigbench", "chain-of-thought", "sports"], module_path="openbench.evals.bigbench_hard", function_name="bbh_sports_understanding", subtask=True, @@ -2673,7 +2673,7 @@ class EvalGroup: description="BigBench Hard - Understanding temporal sequences", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2688,7 +2688,7 @@ class EvalGroup: description="BigBench Hard - Tracking five shuffled objects", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2703,7 +2703,7 @@ class EvalGroup: description="BigBench Hard - Tracking seven shuffled objects", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2718,7 +2718,7 @@ class EvalGroup: description="BigBench Hard - Tracking three shuffled objects", category="core", tags=[ - "multiple-choice", + "mcq", "reasoning", "bigbench", "chain-of-thought", @@ -2733,7 +2733,7 @@ class EvalGroup: name="MedMCQA", description="Medical multiple-choice questions from Indian medical entrance exams (AIIMS & NEET PG)", category="core", - tags=["multiple-choice", "medical", "healthcare", "medicine"], + tags=["mcq", "medical", "healthcare", "medicine"], module_path="openbench.evals.medmcqa", function_name="medmcqa", ), @@ -2741,7 +2741,7 @@ class EvalGroup: name="MedQA", description="US Medical Licensing Exam (USMLE) questions for medical reasoning", category="core", - tags=["multiple-choice", "medical", "healthcare", "medicine", "clinical"], + tags=["mcq", "medical", "healthcare", "medicine", "clinical"], module_path="openbench.evals.medqa", function_name="medqa", ), @@ -2749,7 +2749,7 @@ class EvalGroup: name="PubMedQA", description="Biomedical question answering from PubMed abstracts", category="core", - tags=["multiple-choice", "medical", "biomedical", "research", "literature"], + tags=["mcq", "medical", "biomedical", "research", "literature"], module_path="openbench.evals.pubmedqa", function_name="pubmedqa", ), @@ -2757,7 +2757,7 @@ class EvalGroup: name="HeadQA", description="Spanish healthcare specialization exam questions (Spanish and English)", category="core", - tags=["multiple-choice", "medical", "healthcare", 
"multilingual"], + tags=["mcq", "medical", "healthcare", "multilingual"], module_path="openbench.evals.headqa", function_name="headqa", ), @@ -2765,7 +2765,7 @@ class EvalGroup: name="HeadQA (English)", description="Spanish healthcare specialization exam questions in English", category="core", - tags=["multiple-choice", "medical", "healthcare", "english"], + tags=["mcq", "medical", "healthcare", "english"], module_path="openbench.evals.headqa", function_name="headqa_en", subtask=True, @@ -2774,7 +2774,7 @@ class EvalGroup: name="HeadQA (Spanish)", description="Spanish healthcare specialization exam questions in Spanish", category="core", - tags=["multiple-choice", "medical", "healthcare", "spanish"], + tags=["mcq", "medical", "healthcare", "spanish"], module_path="openbench.evals.headqa", function_name="headqa_es", subtask=True, @@ -2784,7 +2784,7 @@ class EvalGroup: name="ARC-Easy", description="AI2 Reasoning Challenge - Easy questions from grade-school science exams", category="core", - tags=["multiple-choice", "science", "commonsense-reasoning"], + tags=["mcq", "science", "commonsense-reasoning"], module_path="openbench.evals.arc", function_name="arc_easy", subtask=True, @@ -2793,7 +2793,7 @@ class EvalGroup: name="ARC-Challenge", description="AI2 Reasoning Challenge - Challenging questions from grade-school science exams", category="core", - tags=["multiple-choice", "science", "commonsense-reasoning"], + tags=["mcq", "science", "commonsense-reasoning"], module_path="openbench.evals.arc", function_name="arc_challenge", subtask=True, @@ -2802,7 +2802,7 @@ class EvalGroup: name="BoolQ", description="BoolQ: A Question Answering Dataset for Boolean Reasoning", category="core", - tags=["boolean-reasoning", "question-answering"], + tags=["boolean-reasoning", "question-answering", "mcq"], module_path="openbench.evals.boolq", function_name="boolq", ), @@ -2810,7 +2810,7 @@ class EvalGroup: name="HellaSwag", description="Adversarially-filtered sentence completion benchmark for commonsense reasoning", category="core", - tags=["multiple-choice", "commonsense-reasoning", "sentence-completion"], + tags=["mcq", "commonsense-reasoning", "sentence-completion"], module_path="openbench.evals.hellaswag", function_name="hellaswag", ), @@ -2818,7 +2818,7 @@ class EvalGroup: name="PIQA", description="Physical Interaction Question Answering - commonsense about physical situations", category="core", - tags=["multiple-choice", "commonsense-reasoning", "physical-reasoning"], + tags=["mcq", "commonsense-reasoning", "physical-reasoning"], module_path="openbench.evals.piqa", function_name="piqa", ), @@ -2826,7 +2826,7 @@ class EvalGroup: name="PROST", description="Physical Reasoning about Objects through Space and Time", category="core", - tags=["multiple-choice", "commonsense-reasoning", "physical-reasoning"], + tags=["mcq", "commonsense-reasoning", "physical-reasoning"], module_path="openbench.evals.prost", function_name="prost", ), @@ -2834,7 +2834,7 @@ class EvalGroup: name="SWAG", description="Situations With Adversarial Generations - grounded commonsense inference", category="core", - tags=["multiple-choice", "commonsense-reasoning", "video-captions"], + tags=["mcq", "commonsense-reasoning", "video-captions"], module_path="openbench.evals.swag", function_name="swag", ), @@ -2842,7 +2842,7 @@ class EvalGroup: name="WinoGrande", description="Large-scale Winograd Schema Challenge for commonsense pronoun resolution", category="core", - tags=["multiple-choice", "commonsense-reasoning", "pronoun-resolution"], + 
tags=["mcq", "commonsense-reasoning", "pronoun-resolution"], module_path="openbench.evals.winogrande", function_name="winogrande", ), @@ -2850,7 +2850,7 @@ class EvalGroup: name="WSC273", description="Original Winograd Schema Challenge with 273 expert-crafted questions", category="core", - tags=["multiple-choice", "commonsense-reasoning", "pronoun-resolution"], + tags=["mcq", "commonsense-reasoning", "pronoun-resolution"], module_path="openbench.evals.wsc273", function_name="wsc273", ), @@ -2867,7 +2867,7 @@ class EvalGroup: name="GMCQ", description="GitHub Multiple Choice Questions", category="core", - tags=["code-understanding"], + tags=["code-understanding", "mcq"], module_path="openbench.evals.rootly_gmcq", function_name="rootly_gmcq", ), @@ -2875,7 +2875,7 @@ class EvalGroup: name="Terraform", description="Terraform Multiple Choice Questions", category="core", - tags=["code-understanding"], + tags=["code-understanding", "mcq"], module_path="openbench.evals.rootly_terraform", function_name="rootly_terraform", ), @@ -2892,7 +2892,7 @@ class EvalGroup: name="MMMU", description="Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark with 11.5K questions across 30 subjects from college exams, quizzes, and textbooks", category="core", - tags=["multimodal", "multiple-choice", "reasoning", "college-level", "images"], + tags=["multimodal", "mcq", "reasoning", "college-level", "images"], module_path="openbench.evals.mmmu", function_name="mmmu", is_alpha=False, @@ -2901,7 +2901,7 @@ class EvalGroup: name="MMMU Art", description="MMMU Art subset focusing on art and visual design questions", category="core", - tags=["multimodal", "multiple-choice", "art", "visual-design", "images"], + tags=["multimodal", "mcq", "art", "visual-design", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_art", is_alpha=False, @@ -2911,7 +2911,7 @@ class EvalGroup: name="MMMU Biology", description="MMMU Biology subset focusing on biological sciences", category="core", - tags=["multimodal", "multiple-choice", "biology", "science", "images"], + tags=["multimodal", "mcq", "biology", "science", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_biology", is_alpha=False, @@ -2921,7 +2921,7 @@ class EvalGroup: name="MMMU Chemistry", description="MMMU Chemistry subset focusing on chemical sciences", category="core", - tags=["multimodal", "multiple-choice", "chemistry", "science", "images"], + tags=["multimodal", "mcq", "chemistry", "science", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_chemistry", is_alpha=False, @@ -2931,7 +2931,7 @@ class EvalGroup: name="MMMU Math", description="MMMU Mathematics subset focusing on mathematical reasoning", category="math", - tags=["multimodal", "multiple-choice", "mathematics", "reasoning", "images"], + tags=["multimodal", "mcq", "mathematics", "reasoning", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_math", is_alpha=False, @@ -2941,7 +2941,7 @@ class EvalGroup: name="MMMU Physics", description="MMMU Physics subset focusing on physics and physical sciences", category="core", - tags=["multimodal", "multiple-choice", "physics", "science", "images"], + tags=["multimodal", "mcq", "physics", "science", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_physics", is_alpha=False, @@ -2951,7 +2951,7 @@ class EvalGroup: name="MMMU Accounting", description="MMMU Accounting subset focusing on accounting principles and practices", category="core", - tags=["multimodal", "multiple-choice", 
"accounting", "business", "images"], + tags=["multimodal", "mcq", "accounting", "business", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_accounting", is_alpha=False, @@ -2961,7 +2961,7 @@ class EvalGroup: name="MMMU Agriculture", description="MMMU Agriculture subset focusing on agricultural sciences and practices", category="core", - tags=["multimodal", "multiple-choice", "agriculture", "science", "images"], + tags=["multimodal", "mcq", "agriculture", "science", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_agriculture", is_alpha=False, @@ -2973,7 +2973,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "architecture", "engineering", "design", @@ -2988,7 +2988,7 @@ class EvalGroup: name="MMMU Art Theory", description="MMMU Art Theory subset focusing on art history and theoretical concepts", category="core", - tags=["multimodal", "multiple-choice", "art", "theory", "history", "images"], + tags=["multimodal", "mcq", "art", "theory", "history", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_art_theory", is_alpha=False, @@ -3000,7 +3000,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "medicine", "science", "health", @@ -3017,7 +3017,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "medicine", "clinical", "health", @@ -3034,7 +3034,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "design", "visual", "creative", @@ -3051,7 +3051,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "medicine", "diagnostics", "laboratory", @@ -3068,7 +3068,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "electronics", "engineering", "technology", @@ -3085,7 +3085,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "energy", "power", "engineering", @@ -3102,7 +3102,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "finance", "business", "economics", @@ -3119,7 +3119,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "geography", "earth-science", "spatial", @@ -3136,7 +3136,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "history", "humanities", "culture", @@ -3153,7 +3153,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "literature", "humanities", "language", @@ -3170,7 +3170,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "management", "business", "leadership", @@ -3187,7 +3187,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "marketing", "business", "communication", @@ -3204,7 +3204,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "materials", "science", "engineering", @@ -3221,7 +3221,7 @@ class EvalGroup: category="core", tags=[ "multimodal", - "multiple-choice", + "mcq", "mechanical", "engineering", "design", @@ -3236,7 +3236,7 @@ class EvalGroup: name="MMMU Music", description="MMMU Music subset focusing on music theory and analysis", category="core", - tags=["multimodal", "multiple-choice", "music", "arts", "theory", "images"], + tags=["multimodal", "mcq", "music", "arts", "theory", "images"], module_path="openbench.evals.mmmu", function_name="mmmu_music", is_alpha=False, @@ -3248,7 +3248,7 
@@ -3248,7 +3248,7 @@ class EvalGroup:
         category="core",
         tags=[
             "multimodal",
-            "multiple-choice",
+            "mcq",
             "pharmacy",
             "medicine",
             "health",
@@ -3265,7 +3265,7 @@ class EvalGroup:
         category="core",
         tags=[
             "multimodal",
-            "multiple-choice",
+            "mcq",
             "public-health",
             "health",
             "population",
@@ -3282,7 +3282,7 @@ class EvalGroup:
         category="core",
         tags=[
             "multimodal",
-            "multiple-choice",
+            "mcq",
             "sociology",
             "social-science",
             "society",
@@ -3297,7 +3297,7 @@ class EvalGroup:
         name="MMMU MCQ",
         description="MMMU MCQ subset focusing on multiple choice questions",
         category="core",
-        tags=["multimodal", "multiple-choice", "images"],
+        tags=["multimodal", "mcq", "images"],
         module_path="openbench.evals.mmmu",
         function_name="mmmu_mcq",
         is_alpha=False,
@@ -3307,7 +3307,7 @@ class EvalGroup:
         name="MMMU Open",
         description="MMMU Open subset focusing on open-ended questions",
         category="core",
-        tags=["multimodal", "open-ended", "images"],
+        tags=["multimodal", "open-ended", "images", "mcq"],
         module_path="openbench.evals.mmmu",
         function_name="mmmu_open",
         is_alpha=False,
@@ -3319,7 +3319,7 @@ class EvalGroup:
         category="core",
         tags=[
             "multimodal",
-            "multiple-choice",
+            "mcq",
             "reasoning",
             "images",
             "mmmu-pro",
@@ -3332,7 +3332,7 @@ class EvalGroup:
         name="MMMU-Pro (Vision)",
         description="MMMU-Pro vision subset with images and multiple-choice questions",
         category="core",
-        tags=["multimodal", "vision", "multiple-choice", "images", "mmmu-pro"],
+        tags=["multimodal", "vision", "mcq", "images", "mmmu-pro"],
         module_path="openbench.evals.mmmu_pro",
         function_name="mmmu_pro_vision",
         is_alpha=False,
@@ -3398,7 +3398,7 @@ class EvalGroup:
         name="ANLI (All Rounds)",
         description="Adversarial Natural Language Inference - challenging NLI benchmark",
         category="glue",
-        tags=["multiple-choice", "nli", "adversarial", "reasoning"],
+        tags=["mcq", "nli", "adversarial", "reasoning"],
         module_path="openbench.evals.anli",
         function_name="anli",
     ),
@@ -3406,7 +3406,7 @@ class EvalGroup:
         name="ANLI Round 1",
         description="Adversarial NLI Round 1",
         category="glue",
-        tags=["multiple-choice", "nli", "adversarial", "reasoning"],
+        tags=["mcq", "nli", "adversarial", "reasoning"],
         module_path="openbench.evals.anli",
         function_name="anli_r1",
         subtask=True,
@@ -3415,7 +3415,7 @@ class EvalGroup:
         name="ANLI Round 2",
         description="Adversarial NLI Round 2",
         category="glue",
-        tags=["multiple-choice", "nli", "adversarial", "reasoning"],
+        tags=["mcq", "nli", "adversarial", "reasoning"],
         module_path="openbench.evals.anli",
         function_name="anli_r2",
         subtask=True,
@@ -3424,7 +3424,7 @@ class EvalGroup:
         name="ANLI Round 3",
         description="Adversarial NLI Round 3",
         category="glue",
-        tags=["multiple-choice", "nli", "adversarial", "reasoning"],
+        tags=["mcq", "nli", "adversarial", "reasoning"],
         module_path="openbench.evals.anli",
         function_name="anli_r3",
         subtask=True,
@@ -3433,7 +3433,7 @@ class EvalGroup:
         name="COPA",
         description="Choice of Plausible Alternatives for causal reasoning",
         category="glue",
-        tags=["multiple-choice", "superglue", "nli", "reasoning"],
+        tags=["mcq", "superglue", "nli", "reasoning"],
         module_path="openbench.evals.glue",
         function_name="copa",
         subtask=True,
@@ -3442,7 +3442,7 @@ class EvalGroup:
         name="RTE (SuperGLUE)",
         description="Recognizing Textual Entailment from SuperGLUE",
         category="glue",
-        tags=["multiple-choice", "superglue", "nli", "reasoning"],
+        tags=["mcq", "superglue", "nli", "reasoning"],
         module_path="openbench.evals.glue",
         function_name="rte",
         subtask=True,
@@ -3451,7 +3451,7 @@ class EvalGroup:
         name="WiC",
         description="Word in Context - word sense disambiguation",
         category="glue",
-        tags=["multiple-choice", "superglue", "nli", "wsd", "reasoning"],
+        tags=["mcq", "superglue", "nli", "wsd", "reasoning"],
         module_path="openbench.evals.glue",
         function_name="wic",
         subtask=True,
@@ -3461,7 +3461,7 @@ class EvalGroup:
         description="Winograd Schema Challenge - coreference resolution",
         category="glue",
         tags=[
-            "multiple-choice",
+            "mcq",
             "superglue",
             "nli",
             "reasoning",
@@ -3475,7 +3475,7 @@ class EvalGroup:
         name="CommitmentBank",
         description="Natural language inference with commitment",
         category="glue",
-        tags=["multiple-choice", "superglue", "nli", "reasoning"],
+        tags=["mcq", "superglue", "nli", "reasoning"],
         module_path="openbench.evals.glue",
         function_name="cb",
         subtask=True,
@@ -3484,7 +3484,7 @@ class EvalGroup:
         name="MultiRC",
         description="Multi-Sentence Reading Comprehension",
         category="glue",
-        tags=["multiple-choice", "superglue", "nli", "reasoning"],
+        tags=["mcq", "superglue", "nli", "reasoning"],
         module_path="openbench.evals.glue",
         function_name="multirc",
         subtask=True,
@@ -3493,7 +3493,7 @@ class EvalGroup:
         name="SuperGLUE (All Tasks)",
         description="SuperGLUE benchmark suite - run any subset by name (boolq, cb, copa, multirc, rte, wic, wsc)",
         category="glue",
-        tags=["multiple-choice", "superglue", "nli", "reasoning"],
+        tags=["mcq", "superglue", "nli", "reasoning"],
         module_path="openbench.evals.glue",
         function_name="superglue",
     ),
@@ -3501,7 +3501,7 @@ class EvalGroup:
         name="GLUE (All Tasks)",
         description="General Language Understanding Evaluation benchmark suite",
         category="glue",
-        tags=["multiple-choice", "glue", "nli", "sentiment", "similarity"],
+        tags=["mcq", "glue", "nli", "sentiment", "similarity"],
         module_path="openbench.evals.glue_standard",
         function_name="glue",
     ),
@@ -3509,7 +3509,7 @@ class EvalGroup:
         name="GLUE: CoLA",
         description="Corpus of Linguistic Acceptability",
         category="glue",
-        tags=["multiple-choice", "glue", "nli"],
+        tags=["mcq", "glue", "nli"],
         module_path="openbench.evals.glue_standard",
         function_name="glue_cola",
         subtask=True,
@@ -3518,7 +3518,7 @@ class EvalGroup:
         name="GLUE: SST-2",
         description="Stanford Sentiment Treebank",
         category="glue",
-        tags=["multiple-choice", "glue", "nli"],
+        tags=["mcq", "glue", "nli"],
         module_path="openbench.evals.glue_standard",
         function_name="glue_sst2",
         subtask=True,
@@ -3527,7 +3527,7 @@ class EvalGroup:
         name="GLUE: MRPC",
         description="Microsoft Research Paraphrase Corpus",
         category="glue",
-        tags=["multiple-choice", "glue", "nli"],
+        tags=["mcq", "glue", "nli"],
         module_path="openbench.evals.glue_standard",
         function_name="glue_mrpc",
         subtask=True,
@@ -3536,7 +3536,7 @@ class EvalGroup:
         name="GLUE: QQP",
         description="Quora Question Pairs",
         category="glue",
-        tags=["multiple-choice", "glue", "nli"],
+        tags=["mcq", "glue", "nli"],
         module_path="openbench.evals.glue_standard",
         function_name="glue_qqp",
         subtask=True,
@@ -3545,7 +3545,7 @@ class EvalGroup:
         name="GLUE: STS-B",
         description="Semantic Textual Similarity Benchmark",
         category="glue",
-        tags=["multiple-choice", "glue", "nli"],
+        tags=["mcq", "glue", "nli"],
         module_path="openbench.evals.glue_standard",
         function_name="glue_stsb",
         subtask=True,
@@ -3554,7 +3554,7 @@ class EvalGroup:
         name="GLUE: MNLI",
         description="Multi-Genre Natural Language Inference",
         category="glue",
-        tags=["multiple-choice", "glue", "nli"],
+        tags=["mcq", "glue", "nli"],
         module_path="openbench.evals.glue_standard",
         function_name="glue_mnli",
         subtask=True,
@@ -3563,7 +3563,7 @@ class EvalGroup:
         name="GLUE: MNLI-MM",
description="MNLI Mismatched", category="glue", - tags=["multiple-choice", "glue", "nli"], + tags=["mcq", "glue", "nli"], module_path="openbench.evals.glue_standard", function_name="glue_mnli_mismatched", subtask=True, @@ -3572,7 +3572,7 @@ class EvalGroup: name="GLUE: QNLI", description="Question Natural Language Inference", category="glue", - tags=["multiple-choice", "glue", "nli"], + tags=["mcq", "glue", "nli"], module_path="openbench.evals.glue_standard", function_name="glue_qnli", subtask=True, @@ -3581,7 +3581,7 @@ class EvalGroup: name="GLUE: RTE", description="Recognizing Textual Entailment", category="glue", - tags=["multiple-choice", "glue", "nli"], + tags=["mcq", "glue", "nli"], module_path="openbench.evals.glue_standard", function_name="glue_rte", subtask=True, @@ -3590,7 +3590,7 @@ class EvalGroup: name="GLUE: WNLI", description="Winograd Natural Language Inference", category="glue", - tags=["multiple-choice", "glue", "nli"], + tags=["mcq", "glue", "nli"], module_path="openbench.evals.glue_standard", function_name="glue_wnli", subtask=True, @@ -3600,7 +3600,7 @@ class EvalGroup: name="XCOPA (11 Languages)", description="Cross-lingual Choice of Plausible Alternatives for causal commonsense reasoning", category="cross-lingual", - tags=["multiple-choice", "causal-reasoning", "commonsense", "multilingual"], + tags=["mcq", "causal-reasoning", "commonsense", "multilingual"], module_path="openbench.evals.xcopa", function_name="xcopa", ), @@ -3609,7 +3609,7 @@ class EvalGroup: description="XCOPA causal reasoning for Estonian (et)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3624,7 +3624,7 @@ class EvalGroup: description="XCOPA causal reasoning for Haitian Creole (ht)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3639,7 +3639,7 @@ class EvalGroup: description="XCOPA causal reasoning for Indonesian (id)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3654,7 +3654,7 @@ class EvalGroup: description="XCOPA causal reasoning for Italian (it)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3669,7 +3669,7 @@ class EvalGroup: description="XCOPA causal reasoning for Quechua (qu)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3684,7 +3684,7 @@ class EvalGroup: description="XCOPA causal reasoning for Swahili (sw)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3699,7 +3699,7 @@ class EvalGroup: description="XCOPA causal reasoning for Tamil (ta)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3714,7 +3714,7 @@ class EvalGroup: description="XCOPA causal reasoning for Thai (th)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3729,7 +3729,7 @@ class EvalGroup: description="XCOPA causal reasoning for Turkish (tr)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3744,7 +3744,7 @@ class EvalGroup: description="XCOPA causal reasoning for Vietnamese (vi)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", 
"multilingual", @@ -3759,7 +3759,7 @@ class EvalGroup: description="XCOPA causal reasoning for Chinese (zh)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "causal-reasoning", "commonsense", "multilingual", @@ -3773,7 +3773,7 @@ class EvalGroup: name="XStoryCloze (11 Languages)", description="Cross-lingual story completion for commonsense reasoning", category="cross-lingual", - tags=["multiple-choice", "story-completion", "commonsense", "multilingual"], + tags=["mcq", "story-completion", "commonsense", "multilingual"], module_path="openbench.evals.xstorycloze", function_name="xstorycloze", ), @@ -3782,7 +3782,7 @@ class EvalGroup: description="XStoryCloze story completion for English (en)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3797,7 +3797,7 @@ class EvalGroup: description="XStoryCloze story completion for Russian (ru)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3812,7 +3812,7 @@ class EvalGroup: description="XStoryCloze story completion for Chinese (zh)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3827,7 +3827,7 @@ class EvalGroup: description="XStoryCloze story completion for Spanish (es)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3842,7 +3842,7 @@ class EvalGroup: description="XStoryCloze story completion for Arabic (ar)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3857,7 +3857,7 @@ class EvalGroup: description="XStoryCloze story completion for Hindi (hi)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3872,7 +3872,7 @@ class EvalGroup: description="XStoryCloze story completion for Indonesian (id)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3887,7 +3887,7 @@ class EvalGroup: description="XStoryCloze story completion for Telugu (te)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3902,7 +3902,7 @@ class EvalGroup: description="XStoryCloze story completion for Swahili (sw)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3917,7 +3917,7 @@ class EvalGroup: description="XStoryCloze story completion for Basque (eu)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3932,7 +3932,7 @@ class EvalGroup: description="XStoryCloze story completion for Burmese (my)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "story-completion", "commonsense", "multilingual", @@ -3946,7 +3946,7 @@ class EvalGroup: name="XWinograd (6 Languages)", description="Cross-lingual Winograd Schema Challenge for pronoun resolution", category="cross-lingual", - tags=["multiple-choice", "pronoun-resolution", "commonsense", "multilingual"], + tags=["mcq", "pronoun-resolution", "commonsense", "multilingual"], module_path="openbench.evals.xwinograd", function_name="xwinograd", ), @@ -3955,7 +3955,7 @@ class EvalGroup: description="XWinograd pronoun resolution for English (en)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "pronoun-resolution", 
"commonsense", "multilingual", @@ -3970,7 +3970,7 @@ class EvalGroup: description="XWinograd pronoun resolution for French (fr)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -3985,7 +3985,7 @@ class EvalGroup: description="XWinograd pronoun resolution for Japanese (jp)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -4000,7 +4000,7 @@ class EvalGroup: description="XWinograd pronoun resolution for Portuguese (pt)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -4015,7 +4015,7 @@ class EvalGroup: description="XWinograd pronoun resolution for Russian (ru)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -4030,7 +4030,7 @@ class EvalGroup: description="XWinograd pronoun resolution for Chinese (zh)", category="cross-lingual", tags=[ - "multiple-choice", + "mcq", "pronoun-resolution", "commonsense", "multilingual", @@ -4044,7 +4044,7 @@ class EvalGroup: name="LogiQA", description="Logical reasoning dataset from Chinese civil service exam questions - tests deductive reasoning skills", category="knowledge-qa", - tags=["multiple-choice", "logical-reasoning", "deduction", "critical-thinking"], + tags=["mcq", "logical-reasoning", "deduction", "critical-thinking"], module_path="openbench.evals.logiqa", function_name="logiqa", ), @@ -4052,7 +4052,7 @@ class EvalGroup: name="MathQA", description="Mathematical word problems with multiple-choice answers and solution rationales", category="knowledge-qa", - tags=["multiple-choice", "mathematics", "word-problems", "reasoning"], + tags=["mcq", "mathematics", "word-problems", "reasoning"], module_path="openbench.evals.mathqa", function_name="math_qa", ), @@ -4060,7 +4060,7 @@ class EvalGroup: name="SciQ", description="Science exam questions covering Physics, Chemistry, Biology, and other scientific domains", category="knowledge-qa", - tags=["multiple-choice", "science", "physics", "chemistry", "biology"], + tags=["mcq", "science", "physics", "chemistry", "biology"], module_path="openbench.evals.sciq", function_name="sciq", ), @@ -4068,7 +4068,7 @@ class EvalGroup: name="TruthfulQA", description="Tests if models generate truthful answers to questions that humans often answer falsely due to misconceptions", category="knowledge-qa", - tags=["multiple-choice", "truthfulness", "misconceptions", "factuality"], + tags=["mcq", "truthfulness", "misconceptions", "factuality"], module_path="openbench.evals.truthfulqa", function_name="truthfulqa", ), @@ -4077,7 +4077,7 @@ class EvalGroup: name="BLiMP (67 Linguistic Phenomena)", description="Benchmark of Linguistic Minimal Pairs testing grammatical knowledge through minimal pair comparisons", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "morphology"], + tags=["mcq", "linguistics", "grammar", "syntax", "morphology"], module_path="openbench.evals.blimp", function_name="blimp", ), @@ -4085,7 +4085,7 @@ class EvalGroup: name="BLiMP: Adjunct island effects", description="BLiMP Adjunct island effects", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_adjunct_island", subtask=True, @@ -4094,7 +4094,7 @@ class EvalGroup: name="BLiMP: 
         name="BLiMP: Anaphor gender agreement",
         description="BLiMP Anaphor gender agreement",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_anaphor_gender_agreement",
         subtask=True,
@@ -4103,7 +4103,7 @@ class EvalGroup:
         name="BLiMP: Anaphor number agreement",
         description="BLiMP Anaphor number agreement",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_anaphor_number_agreement",
         subtask=True,
@@ -4112,7 +4112,7 @@ class EvalGroup:
         name="BLiMP: Animate subject in passive constructions",
         description="BLiMP Animate subject in passive constructions",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_animate_subject_passive",
         subtask=True,
@@ -4121,7 +4121,7 @@ class EvalGroup:
         name="BLiMP: Animate subject in transitive constructions",
         description="BLiMP Animate subject in transitive constructions",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_animate_subject_trans",
         subtask=True,
@@ -4130,7 +4130,7 @@ class EvalGroup:
         name="BLiMP: Causative constructions",
         description="BLiMP Causative constructions",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_causative",
         subtask=True,
@@ -4139,7 +4139,7 @@ class EvalGroup:
         name="BLiMP: Complex NP island effects",
         description="BLiMP Complex NP island effects",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_complex_NP_island",
         subtask=True,
@@ -4148,7 +4148,7 @@ class EvalGroup:
         name="BLiMP: Coordinate structure constraint - complex left branch",
         description="BLiMP Coordinate structure constraint - complex left branch",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_coordinate_structure_constraint_complex_left_branch",
         subtask=True,
@@ -4157,7 +4157,7 @@ class EvalGroup:
         name="BLiMP: Coordinate structure constraint - object extraction",
         description="BLiMP Coordinate structure constraint - object extraction",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_coordinate_structure_constraint_object_extraction",
         subtask=True,
@@ -4166,7 +4166,7 @@ class EvalGroup:
         name="BLiMP: Determiner-noun agreement (1)",
         description="BLiMP Determiner-noun agreement (1)",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
function_name="blimp_determiner_noun_agreement_1", subtask=True, @@ -4175,7 +4175,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement (2)", description="BLiMP Determiner-noun agreement (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_2", subtask=True, @@ -4184,7 +4184,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement with irregular nouns (1)", description="BLiMP Determiner-noun agreement with irregular nouns (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_irregular_1", subtask=True, @@ -4193,7 +4193,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement with irregular nouns (2)", description="BLiMP Determiner-noun agreement with irregular nouns (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_irregular_2", subtask=True, @@ -4202,7 +4202,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement with adjective (2)", description="BLiMP Determiner-noun agreement with adjective (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_with_adj_2", subtask=True, @@ -4211,7 +4211,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement with adjective and irregular nouns (1)", description="BLiMP Determiner-noun agreement with adjective and irregular nouns (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_with_adj_irregular_1", subtask=True, @@ -4220,7 +4220,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement with adjective and irregular nouns (2)", description="BLiMP Determiner-noun agreement with adjective and irregular nouns (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_with_adj_irregular_2", subtask=True, @@ -4229,7 +4229,7 @@ class EvalGroup: name="BLiMP: Determiner-noun agreement with adjective (1)", description="BLiMP Determiner-noun agreement with adjective (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_determiner_noun_agreement_with_adjective_1", subtask=True, @@ -4238,7 +4238,7 @@ class EvalGroup: name="BLiMP: Distractor agreement with relational nouns", description="BLiMP Distractor agreement with relational nouns", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", 
function_name="blimp_distractor_agreement_relational_noun", subtask=True, @@ -4247,7 +4247,7 @@ class EvalGroup: name="BLiMP: Distractor agreement in relative clauses", description="BLiMP Distractor agreement in relative clauses", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_distractor_agreement_relative_clause", subtask=True, @@ -4256,7 +4256,7 @@ class EvalGroup: name="BLiMP: Dropped argument", description="BLiMP Dropped argument", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_drop_argument", subtask=True, @@ -4265,7 +4265,7 @@ class EvalGroup: name="BLiMP: N-bar ellipsis (1)", description="BLiMP N-bar ellipsis (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_ellipsis_n_bar_1", subtask=True, @@ -4274,7 +4274,7 @@ class EvalGroup: name="BLiMP: N-bar ellipsis (2)", description="BLiMP N-bar ellipsis (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_ellipsis_n_bar_2", subtask=True, @@ -4283,7 +4283,7 @@ class EvalGroup: name="BLiMP: Existential 'there' with object raising", description="BLiMP Existential 'there' with object raising", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_existential_there_object_raising", subtask=True, @@ -4292,7 +4292,7 @@ class EvalGroup: name="BLiMP: Existential 'there' with quantifiers (1)", description="BLiMP Existential 'there' with quantifiers (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_existential_there_quantifiers_1", subtask=True, @@ -4301,7 +4301,7 @@ class EvalGroup: name="BLiMP: Existential 'there' with quantifiers (2)", description="BLiMP Existential 'there' with quantifiers (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_existential_there_quantifiers_2", subtask=True, @@ -4310,7 +4310,7 @@ class EvalGroup: name="BLiMP: Existential 'there' with subject raising", description="BLiMP Existential 'there' with subject raising", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_existential_there_subject_raising", subtask=True, @@ -4319,7 +4319,7 @@ class EvalGroup: name="BLiMP: Expletive 'it' with object raising", description="BLiMP Expletive 'it' with object raising", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", 
"blimp"], module_path="openbench.evals.blimp", function_name="blimp_expletive_it_object_raising", subtask=True, @@ -4328,7 +4328,7 @@ class EvalGroup: name="BLiMP: Inchoative constructions", description="BLiMP Inchoative constructions", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_inchoative", subtask=True, @@ -4337,7 +4337,7 @@ class EvalGroup: name="BLiMP: Intransitive verbs", description="BLiMP Intransitive verbs", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_intransitive", subtask=True, @@ -4346,7 +4346,7 @@ class EvalGroup: name="BLiMP: Irregular past participles as adjectives", description="BLiMP Irregular past participles as adjectives", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_irregular_past_participle_adjectives", subtask=True, @@ -4355,7 +4355,7 @@ class EvalGroup: name="BLiMP: Irregular past participles in verbs", description="BLiMP Irregular past participles in verbs", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_irregular_past_participle_verbs", subtask=True, @@ -4364,7 +4364,7 @@ class EvalGroup: name="BLiMP: Subject-verb agreement with irregular plurals (1)", description="BLiMP Subject-verb agreement with irregular plurals (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_irregular_plural_subject_verb_agreement_1", subtask=True, @@ -4373,7 +4373,7 @@ class EvalGroup: name="BLiMP: Subject-verb agreement with irregular plurals (2)", description="BLiMP Subject-verb agreement with irregular plurals (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_irregular_plural_subject_verb_agreement_2", subtask=True, @@ -4382,7 +4382,7 @@ class EvalGroup: name="BLiMP: Left branch island effects in echo questions", description="BLiMP Left branch island effects in echo questions", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_left_branch_island_echo_question", subtask=True, @@ -4391,7 +4391,7 @@ class EvalGroup: name="BLiMP: Left branch island effects in simple questions", description="BLiMP Left branch island effects in simple questions", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_left_branch_island_simple_question", subtask=True, @@ -4400,7 +4400,7 @@ class EvalGroup: name="BLiMP: Matrix question NPI licensor present", description="BLiMP Matrix 
         description="BLiMP Matrix question NPI licensor present",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_matrix_question_npi_licensor_present",
         subtask=True,
@@ -4409,7 +4409,7 @@ class EvalGroup:
         name="BLiMP: Negative polarity items present (1)",
         description="BLiMP Negative polarity items present (1)",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_npi_present_1",
         subtask=True,
@@ -4418,7 +4418,7 @@ class EvalGroup:
         name="BLiMP: Negative polarity items present (2)",
         description="BLiMP Negative polarity items present (2)",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_npi_present_2",
         subtask=True,
@@ -4427,7 +4427,7 @@ class EvalGroup:
         name="BLiMP: 'Only' as NPI licensor",
         description="BLiMP 'Only' as NPI licensor",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_only_npi_licensor_present",
         subtask=True,
@@ -4436,7 +4436,7 @@ class EvalGroup:
         name="BLiMP: 'Only' NPI scope",
         description="BLiMP 'Only' NPI scope",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_only_npi_scope",
         subtask=True,
@@ -4445,7 +4445,7 @@ class EvalGroup:
         name="BLiMP: Passive constructions (1)",
         description="BLiMP Passive constructions (1)",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_passive_1",
         subtask=True,
@@ -4454,7 +4454,7 @@ class EvalGroup:
         name="BLiMP: Passive constructions (2)",
         description="BLiMP Passive constructions (2)",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_passive_2",
         subtask=True,
@@ -4463,7 +4463,7 @@ class EvalGroup:
         name="BLiMP: Binding Principle A - c-command",
         description="BLiMP Binding Principle A - c-command",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_principle_A_c_command",
         subtask=True,
@@ -4472,7 +4472,7 @@ class EvalGroup:
         name="BLiMP: Binding Principle A - case (1)",
         description="BLiMP Binding Principle A - case (1)",
         category="linguistic",
-        tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"],
+        tags=["mcq", "linguistics", "grammar", "syntax", "blimp"],
         module_path="openbench.evals.blimp",
         function_name="blimp_principle_A_case_1",
         subtask=True,
@@ -4481,7 +4481,7 @@ class EvalGroup:
         name="BLiMP: Binding Principle A - case (2)",
         description="BLiMP Binding Principle A - case (2)",
         category="linguistic",
"blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_principle_A_case_2", subtask=True, @@ -4490,7 +4490,7 @@ class EvalGroup: name="BLiMP: Binding Principle A - domain (1)", description="BLiMP Binding Principle A - domain (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_principle_A_domain_1", subtask=True, @@ -4499,7 +4499,7 @@ class EvalGroup: name="BLiMP: Binding Principle A - domain (2)", description="BLiMP Binding Principle A - domain (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_principle_A_domain_2", subtask=True, @@ -4508,7 +4508,7 @@ class EvalGroup: name="BLiMP: Binding Principle A - domain (3)", description="BLiMP Binding Principle A - domain (3)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_principle_A_domain_3", subtask=True, @@ -4517,7 +4517,7 @@ class EvalGroup: name="BLiMP: Binding Principle A - reconstruction", description="BLiMP Binding Principle A - reconstruction", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_principle_A_reconstruction", subtask=True, @@ -4526,7 +4526,7 @@ class EvalGroup: name="BLiMP: Subject-verb agreement with regular plurals (1)", description="BLiMP Subject-verb agreement with regular plurals (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_regular_plural_subject_verb_agreement_1", subtask=True, @@ -4535,7 +4535,7 @@ class EvalGroup: name="BLiMP: Subject-verb agreement with regular plurals (2)", description="BLiMP Subject-verb agreement with regular plurals (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_regular_plural_subject_verb_agreement_2", subtask=True, @@ -4544,7 +4544,7 @@ class EvalGroup: name="BLiMP: Sentential negation as NPI licensor", description="BLiMP Sentential negation as NPI licensor", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_sentential_negation_npi_licensor_present", subtask=True, @@ -4553,7 +4553,7 @@ class EvalGroup: name="BLiMP: Sentential negation NPI scope", description="BLiMP Sentential negation NPI scope", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_sentential_negation_npi_scope", subtask=True, @@ -4562,7 +4562,7 @@ class EvalGroup: name="BLiMP: Sentential subject island effects", 
description="BLiMP Sentential subject island effects", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_sentential_subject_island", subtask=True, @@ -4571,7 +4571,7 @@ class EvalGroup: name="BLiMP: Superlative quantifiers (1)", description="BLiMP Superlative quantifiers (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_superlative_quantifiers_1", subtask=True, @@ -4580,7 +4580,7 @@ class EvalGroup: name="BLiMP: Superlative quantifiers (2)", description="BLiMP Superlative quantifiers (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_superlative_quantifiers_2", subtask=True, @@ -4589,7 +4589,7 @@ class EvalGroup: name="BLiMP: Tough vs raising constructions (1)", description="BLiMP Tough vs raising constructions (1)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_tough_vs_raising_1", subtask=True, @@ -4598,7 +4598,7 @@ class EvalGroup: name="BLiMP: Tough vs raising constructions (2)", description="BLiMP Tough vs raising constructions (2)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_tough_vs_raising_2", subtask=True, @@ -4607,7 +4607,7 @@ class EvalGroup: name="BLiMP: Transitive verbs", description="BLiMP Transitive verbs", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_transitive", subtask=True, @@ -4616,7 +4616,7 @@ class EvalGroup: name="BLiMP: Wh-island effects", description="BLiMP Wh-island effects", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_island", subtask=True, @@ -4625,7 +4625,7 @@ class EvalGroup: name="BLiMP: Wh-questions with object gap", description="BLiMP Wh-questions with object gap", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_questions_object_gap", subtask=True, @@ -4634,7 +4634,7 @@ class EvalGroup: name="BLiMP: Wh-questions with subject gap", description="BLiMP Wh-questions with subject gap", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_questions_subject_gap", subtask=True, @@ -4643,7 +4643,7 @@ class EvalGroup: name="BLiMP: Wh-questions with long-distance subject gap", description="BLiMP Wh-questions with long-distance subject gap", category="linguistic", - 
tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_questions_subject_gap_long_distance", subtask=True, @@ -4652,7 +4652,7 @@ class EvalGroup: name="BLiMP: Wh vs that complementizers without gap", description="BLiMP Wh vs that complementizers without gap", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_vs_that_no_gap", subtask=True, @@ -4661,7 +4661,7 @@ class EvalGroup: name="BLiMP: Wh vs that complementizers without gap (long-distance)", description="BLiMP Wh vs that complementizers without gap (long-distance)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_vs_that_no_gap_long_distance", subtask=True, @@ -4670,7 +4670,7 @@ class EvalGroup: name="BLiMP: Wh vs that complementizers with gap", description="BLiMP Wh vs that complementizers with gap", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_vs_that_with_gap", subtask=True, @@ -4679,7 +4679,7 @@ class EvalGroup: name="BLiMP: Wh vs that complementizers with gap (long-distance)", description="BLiMP Wh vs that complementizers with gap (long-distance)", category="linguistic", - tags=["multiple-choice", "linguistics", "grammar", "syntax", "blimp"], + tags=["mcq", "linguistics", "grammar", "syntax", "blimp"], module_path="openbench.evals.blimp", function_name="blimp_wh_vs_that_with_gap_long_distance", subtask=True, @@ -4690,7 +4690,7 @@ class EvalGroup: name="RACE", description="Reading comprehension from middle and high school English exams (combined)", category="reading-comprehension", - tags=["multiple-choice", "reading-comprehension", "english-exam"], + tags=["mcq", "reading-comprehension", "english-exam"], module_path="openbench.evals.race", function_name="race", ), @@ -4699,7 +4699,7 @@ class EvalGroup: description="High school level reading comprehension from English exams for Chinese students - passages with multiple questions", category="reading-comprehension", tags=[ - "multiple-choice", + "mcq", "reading-comprehension", "english-exam", "high-school", @@ -4713,7 +4713,7 @@ class EvalGroup: description="Middle school level reading comprehension from English exams for Chinese students", category="reading-comprehension", tags=[ - "multiple-choice", + "mcq", "reading-comprehension", "english-exam", "middle-school", @@ -4726,7 +4726,7 @@ class EvalGroup: name="QA4MRE (All Years)", description="Question Answering for Machine Reading Evaluation - CLEF shared tasks 2011-2013", category="reading-comprehension", - tags=["multiple-choice", "reading-comprehension", "clef", "machine-reading"], + tags=["mcq", "reading-comprehension", "clef", "machine-reading"], module_path="openbench.evals.qa4mre", function_name="qa4mre", ), @@ -4735,7 +4735,7 @@ class EvalGroup: description="Question Answering for Machine Reading Evaluation (English, 2011)", category="reading-comprehension", tags=[ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading", @@ -4750,7 +4750,7 @@ class EvalGroup: 
description="Question Answering for Machine Reading Evaluation (English, 2012)", category="reading-comprehension", tags=[ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading", @@ -4765,7 +4765,7 @@ class EvalGroup: description="Question Answering for Machine Reading Evaluation (English, 2013)", category="reading-comprehension", tags=[ - "multiple-choice", + "mcq", "reading-comprehension", "clef", "machine-reading", @@ -4780,7 +4780,7 @@ class EvalGroup: description="Question Answering on Scientific Papers - binary yes/no questions on research paper abstracts", category="reading-comprehension", tags=[ - "multiple-choice", + "mcq", "reading-comprehension", "scientific-papers", "binary-classification", @@ -4793,7 +4793,7 @@ class EvalGroup: name="ETHICS (All Dimensions)", description="Aligning AI With Shared Human Values - tests moral reasoning across 5 fundamental dimensions", category="ethics-social", - tags=["multiple-choice", "ethics", "moral-reasoning", "philosophy"], + tags=["mcq", "ethics", "moral-reasoning", "philosophy"], module_path="openbench.evals.ethics", function_name="ethics", ), @@ -4801,7 +4801,7 @@ class EvalGroup: name="ETHICS: Justice", description="Tests fairness and impartiality in ethical decision-making", category="ethics-social", - tags=["multiple-choice", "ethics", "moral-reasoning", "justice", "ethics"], + tags=["mcq", "ethics", "moral-reasoning", "justice", "ethics"], module_path="openbench.evals.ethics", function_name="ethics_justice", subtask=True, @@ -4810,7 +4810,7 @@ class EvalGroup: name="ETHICS: Deontology", description="Tests duty-based ethics and understanding of moral rules", category="ethics-social", - tags=["multiple-choice", "ethics", "moral-reasoning", "deontology", "ethics"], + tags=["mcq", "ethics", "moral-reasoning", "deontology", "ethics"], module_path="openbench.evals.ethics", function_name="ethics_deontology", subtask=True, @@ -4819,7 +4819,7 @@ class EvalGroup: name="ETHICS: Virtue", description="Tests character-based ethics and recognition of virtuous behavior", category="ethics-social", - tags=["multiple-choice", "ethics", "moral-reasoning", "virtue", "ethics"], + tags=["mcq", "ethics", "moral-reasoning", "virtue", "ethics"], module_path="openbench.evals.ethics", function_name="ethics_virtue", subtask=True, @@ -4829,7 +4829,7 @@ class EvalGroup: description="Tests consequence-based ethics and utility maximization", category="ethics-social", tags=[ - "multiple-choice", + "mcq", "ethics", "moral-reasoning", "utilitarianism", @@ -4843,7 +4843,7 @@ class EvalGroup: name="ETHICS: Commonsense", description="Tests everyday moral reasoning and common ethical intuitions", category="ethics-social", - tags=["multiple-choice", "ethics", "moral-reasoning", "commonsense", "ethics"], + tags=["mcq", "ethics", "moral-reasoning", "commonsense", "ethics"], module_path="openbench.evals.ethics", function_name="ethics_commonsense", subtask=True, @@ -4852,7 +4852,7 @@ class EvalGroup: name="BBQ (Main Function)", description="BBQ bias evaluation for a specific category - use individual category tasks instead", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa"], + tags=["mcq", "bias", "fairness", "social-bias", "qa"], module_path="openbench.evals.bbq", function_name="bbq", ), @@ -4860,7 +4860,7 @@ class EvalGroup: name="BBQ: Age", description="Evaluate age-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "age"], + 
tags=["mcq", "bias", "fairness", "social-bias", "qa", "age"], module_path="openbench.evals.bbq", function_name="bbq_age", subtask=True, @@ -4869,7 +4869,7 @@ class EvalGroup: name="BBQ: Disability Status", description="Evaluate disability-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "disability"], + tags=["mcq", "bias", "fairness", "social-bias", "qa", "disability"], module_path="openbench.evals.bbq", function_name="bbq_disability_status", subtask=True, @@ -4878,7 +4878,7 @@ class EvalGroup: name="BBQ: Gender Identity", description="Evaluate gender identity-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "gender"], + tags=["mcq", "bias", "fairness", "social-bias", "qa", "gender"], module_path="openbench.evals.bbq", function_name="bbq_gender_identity", subtask=True, @@ -4888,7 +4888,7 @@ class EvalGroup: description="Evaluate nationality-related biases in question-answering", category="ethics-social", tags=[ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -4903,7 +4903,7 @@ class EvalGroup: name="BBQ: Physical Appearance", description="Evaluate physical appearance-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "appearance"], + tags=["mcq", "bias", "fairness", "social-bias", "qa", "appearance"], module_path="openbench.evals.bbq", function_name="bbq_physical_appearance", subtask=True, @@ -4912,7 +4912,7 @@ class EvalGroup: name="BBQ: Race/Ethnicity", description="Evaluate race and ethnicity-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "race"], + tags=["mcq", "bias", "fairness", "social-bias", "qa", "race"], module_path="openbench.evals.bbq", function_name="bbq_race_ethnicity", subtask=True, @@ -4922,7 +4922,7 @@ class EvalGroup: description="Evaluate intersectional race and socioeconomic status biases", category="ethics-social", tags=[ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -4939,7 +4939,7 @@ class EvalGroup: description="Evaluate intersectional race and gender biases", category="ethics-social", tags=[ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -4955,7 +4955,7 @@ class EvalGroup: name="BBQ: Religion", description="Evaluate religion-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "religion"], + tags=["mcq", "bias", "fairness", "social-bias", "qa", "religion"], module_path="openbench.evals.bbq", function_name="bbq_religion", subtask=True, @@ -4964,7 +4964,7 @@ class EvalGroup: name="BBQ: Socioeconomic Status", description="Evaluate socioeconomic status-related biases in question-answering", category="ethics-social", - tags=["multiple-choice", "bias", "fairness", "social-bias", "qa", "ses"], + tags=["mcq", "bias", "fairness", "social-bias", "qa", "ses"], module_path="openbench.evals.bbq", function_name="bbq_ses", subtask=True, @@ -4974,7 +4974,7 @@ class EvalGroup: description="Evaluate sexual orientation-related biases in question-answering", category="ethics-social", tags=[ - "multiple-choice", + "mcq", "bias", "fairness", "social-bias", @@ -4990,7 +4990,7 @@ class EvalGroup: description="Social Intelligence Question Answering - tests reasoning about social situations, emotions, and mental 
states", category="ethics-social", tags=[ - "multiple-choice", + "mcq", "social-intelligence", "emotional-reasoning", "theory-of-mind", @@ -5002,7 +5002,7 @@ class EvalGroup: name="ToxiGen", description="Toxicity detection benchmark - tests ability to identify toxic and hateful language", category="ethics-social", - tags=["multiple-choice", "toxicity-detection", "hate-speech", "safety"], + tags=["mcq", "toxicity-detection", "hate-speech", "safety"], module_path="openbench.evals.toxigen", function_name="toxigen", ), @@ -5019,7 +5019,7 @@ class EvalGroup: name="AGIEval (All Subsets)", description="Human-centric benchmark with 17 official qualifying exam questions testing general cognitive abilities", category="agieval", - tags=["multiple-choice", "academic-exams", "reasoning", "cognitive-abilities"], + tags=["mcq", "academic-exams", "reasoning", "cognitive-abilities"], module_path="openbench.evals.agieval", function_name="agieval", ), @@ -5027,7 +5027,7 @@ class EvalGroup: name="AGIEval: AQUA-RAT", description="Algebraic question answering and reasoning", category="agieval", - tags=["multiple-choice", "algebra", "reasoning", "math", "agieval"], + tags=["mcq", "algebra", "reasoning", "math", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_aqua_rat", subtask=True, @@ -5036,7 +5036,7 @@ class EvalGroup: name="AGIEval: LogiQA (English)", description="Logical reasoning questions in English", category="agieval", - tags=["multiple-choice", "logic", "reasoning", "english", "agieval"], + tags=["mcq", "logic", "reasoning", "english", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_logiqa_en", subtask=True, @@ -5045,7 +5045,7 @@ class EvalGroup: name="AGIEval: LogiQA (Chinese)", description="Logical reasoning questions in Chinese", category="agieval", - tags=["multiple-choice", "logic", "reasoning", "chinese", "agieval"], + tags=["mcq", "logic", "reasoning", "chinese", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_logiqa_zh", subtask=True, @@ -5054,7 +5054,7 @@ class EvalGroup: name="AGIEval: LSAT Analytical Reasoning", description="Law School Admission Test - Analytical Reasoning section", category="agieval", - tags=["multiple-choice", "law", "analytical-reasoning", "lsat", "agieval"], + tags=["mcq", "law", "analytical-reasoning", "lsat", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_lsat_ar", subtask=True, @@ -5063,7 +5063,7 @@ class EvalGroup: name="AGIEval: LSAT Logical Reasoning", description="Law School Admission Test - Logical Reasoning section", category="agieval", - tags=["multiple-choice", "law", "logical-reasoning", "lsat", "agieval"], + tags=["mcq", "law", "logical-reasoning", "lsat", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_lsat_lr", subtask=True, @@ -5072,7 +5072,7 @@ class EvalGroup: name="AGIEval: LSAT Reading Comprehension", description="Law School Admission Test - Reading Comprehension section", category="agieval", - tags=["multiple-choice", "law", "reading-comprehension", "lsat", "agieval"], + tags=["mcq", "law", "reading-comprehension", "lsat", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_lsat_rc", subtask=True, @@ -5081,7 +5081,7 @@ class EvalGroup: name="AGIEval: SAT English", description="Scholastic Assessment Test - English section", category="agieval", - tags=["multiple-choice", "sat", "english", "reading", "agieval"], + tags=["mcq", "sat", "english", "reading", "agieval"], 
module_path="openbench.evals.agieval", function_name="agieval_sat_en", subtask=True, @@ -5090,7 +5090,7 @@ class EvalGroup: name="AGIEval: SAT English (No Passage)", description="SAT English questions without reading passages", category="agieval", - tags=["multiple-choice", "sat", "english", "grammar", "agieval"], + tags=["mcq", "sat", "english", "grammar", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_sat_en_without_passage", subtask=True, @@ -5099,7 +5099,7 @@ class EvalGroup: name="AGIEval: SAT Math", description="Scholastic Assessment Test - Math section", category="agieval", - tags=["multiple-choice", "sat", "mathematics", "problem-solving", "agieval"], + tags=["mcq", "sat", "mathematics", "problem-solving", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_sat_math", subtask=True, @@ -5108,7 +5108,7 @@ class EvalGroup: name="AGIEval: Gaokao Biology", description="Chinese national college entrance exam - Biology", category="agieval", - tags=["multiple-choice", "gaokao", "biology", "science", "agieval"], + tags=["mcq", "gaokao", "biology", "science", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_biology", subtask=True, @@ -5117,7 +5117,7 @@ class EvalGroup: name="AGIEval: Gaokao Chemistry", description="Chinese national college entrance exam - Chemistry", category="agieval", - tags=["multiple-choice", "gaokao", "chemistry", "science", "agieval"], + tags=["mcq", "gaokao", "chemistry", "science", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_chemistry", subtask=True, @@ -5126,7 +5126,7 @@ class EvalGroup: name="AGIEval: Gaokao Chinese", description="Chinese national college entrance exam - Chinese language", category="agieval", - tags=["multiple-choice", "gaokao", "chinese", "language", "agieval"], + tags=["mcq", "gaokao", "chinese", "language", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_chinese", subtask=True, @@ -5135,7 +5135,7 @@ class EvalGroup: name="AGIEval: Gaokao English", description="Chinese national college entrance exam - English", category="agieval", - tags=["multiple-choice", "gaokao", "english", "language", "agieval"], + tags=["mcq", "gaokao", "english", "language", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_english", subtask=True, @@ -5144,7 +5144,7 @@ class EvalGroup: name="AGIEval: Gaokao Geography", description="Chinese national college entrance exam - Geography", category="agieval", - tags=["multiple-choice", "gaokao", "geography", "social-studies", "agieval"], + tags=["mcq", "gaokao", "geography", "social-studies", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_geography", subtask=True, @@ -5153,7 +5153,7 @@ class EvalGroup: name="AGIEval: Gaokao History", description="Chinese national college entrance exam - History", category="agieval", - tags=["multiple-choice", "gaokao", "history", "social-studies", "agieval"], + tags=["mcq", "gaokao", "history", "social-studies", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_history", subtask=True, @@ -5162,7 +5162,7 @@ class EvalGroup: name="AGIEval: Gaokao Math", description="Chinese national college entrance exam - Mathematics", category="agieval", - tags=["multiple-choice", "gaokao", "mathematics", "problem-solving", "agieval"], + tags=["mcq", "gaokao", "mathematics", "problem-solving", "agieval"], module_path="openbench.evals.agieval", 
function_name="agieval_gaokao_mathqa", subtask=True, @@ -5171,7 +5171,7 @@ class EvalGroup: name="AGIEval: Gaokao Physics", description="Chinese national college entrance exam - Physics", category="agieval", - tags=["multiple-choice", "gaokao", "physics", "science", "agieval"], + tags=["mcq", "gaokao", "physics", "science", "agieval"], module_path="openbench.evals.agieval", function_name="agieval_gaokao_physics", subtask=True, @@ -5180,7 +5180,7 @@ class EvalGroup: name="LegalSupport", description="Legal citation support identification - identify which citation provides stronger support for a legal argument", category="domain-specific", - tags=["multiple-choice", "legal", "reasoning", "citation-analysis"], + tags=["mcq", "legal", "reasoning", "citation-analysis"], module_path="openbench.evals.legalsupport", function_name="legalsupport", ), @@ -5188,7 +5188,7 @@ class EvalGroup: name="Arabic Exams (40 Subjects)", description="Multi-task Arabic language understanding benchmark from school exams across North Africa, the Levant, and the Gulf", category="domain-specific", - tags=["multiple-choice", "arabic", "multilingual", "education", "msa"], + tags=["mcq", "arabic", "multilingual", "education", "msa"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams", ), @@ -5196,7 +5196,7 @@ class EvalGroup: name="Arabic Exams: Accounting (University)", description="Arabic MMLU - Accounting questions from university-level exams", category="domain-specific", - tags=["multiple-choice", "arabic", "accounting", "university"], + tags=["mcq", "arabic", "accounting", "university"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_accounting_university", subtask=True, @@ -5205,7 +5205,7 @@ class EvalGroup: name="Arabic Exams: Arabic Language (General)", description="Arabic MMLU - Arabic language questions from general exams", category="domain-specific", - tags=["multiple-choice", "arabic", "language", "general"], + tags=["mcq", "arabic", "language", "general"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_arabic_language_general", subtask=True, @@ -5214,7 +5214,7 @@ class EvalGroup: name="Arabic Exams: Computer Science (High School)", description="Arabic MMLU - Computer science questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "computer-science", "high-school"], + tags=["mcq", "arabic", "computer-science", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_computer_science_high_school", subtask=True, @@ -5223,7 +5223,7 @@ class EvalGroup: name="Arabic Exams: Computer Science (University)", description="Arabic MMLU - Computer science questions from university-level exams", category="domain-specific", - tags=["multiple-choice", "arabic", "computer-science", "university"], + tags=["mcq", "arabic", "computer-science", "university"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_computer_science_university", subtask=True, @@ -5232,7 +5232,7 @@ class EvalGroup: name="Arabic Exams: Islamic Studies (General)", description="Arabic MMLU - Islamic studies questions from general exams", category="domain-specific", - tags=["multiple-choice", "arabic", "islamic-studies", "general"], + tags=["mcq", "arabic", "islamic-studies", "general"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_islamic_studies_general", subtask=True, @@ -5241,25 +5241,16 @@ class EvalGroup: name="Arabic Exams: Math (Primary School)", 
description="Arabic MMLU - Math questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "math", "primary-school"], + tags=["mcq", "arabic", "math", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_math_primary_school", subtask=True, ), - "arabic_exams_math_high_school": BenchmarkMetadata( - name="Arabic Exams: Math (High School)", - description="Arabic MMLU - Math questions from high school exams", - category="domain-specific", - tags=["multiple-choice", "arabic", "math", "high-school"], - module_path="openbench.evals.arabic_exams", - function_name="arabic_exams_math_high_school", - subtask=True, - ), "arabic_exams_physics_high_school": BenchmarkMetadata( name="Arabic Exams: Physics (High School)", description="Arabic MMLU - Physics questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "physics", "high-school"], + tags=["mcq", "arabic", "physics", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_physics_high_school", subtask=True, @@ -5269,7 +5260,7 @@ class EvalGroup: name="Arabic Exams: Arabic Language (Grammar)", description="Arabic MMLU - Arabic language grammar questions", category="domain-specific", - tags=["multiple-choice", "arabic", "language", "grammar"], + tags=["mcq", "arabic", "language", "grammar"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_arabic_language_grammar", subtask=True, @@ -5278,7 +5269,7 @@ class EvalGroup: name="Arabic Exams: Arabic Language (High School)", description="Arabic MMLU - Arabic language questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "language", "high-school"], + tags=["mcq", "arabic", "language", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_arabic_language_high_school", subtask=True, @@ -5287,7 +5278,7 @@ class EvalGroup: name="Arabic Exams: Arabic Language (Middle School)", description="Arabic MMLU - Arabic language questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "language", "middle-school"], + tags=["mcq", "arabic", "language", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_arabic_language_middle_school", subtask=True, @@ -5296,7 +5287,7 @@ class EvalGroup: name="Arabic Exams: Arabic Language (Primary School)", description="Arabic MMLU - Arabic language questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "language", "primary-school"], + tags=["mcq", "arabic", "language", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_arabic_language_primary_school", subtask=True, @@ -5306,7 +5297,7 @@ class EvalGroup: name="Arabic Exams: Biology (High School)", description="Arabic MMLU - Biology questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "biology", "high-school"], + tags=["mcq", "arabic", "biology", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_biology_high_school", subtask=True, @@ -5316,7 +5307,7 @@ class EvalGroup: name="Arabic Exams: Civics (High School)", description="Arabic MMLU - Civics questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "civics", "high-school"], + tags=["mcq", "arabic", "civics", "high-school"], 
module_path="openbench.evals.arabic_exams", function_name="arabic_exams_civics_high_school", subtask=True, @@ -5325,7 +5316,7 @@ class EvalGroup: name="Arabic Exams: Civics (Middle School)", description="Arabic MMLU - Civics questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "civics", "middle-school"], + tags=["mcq", "arabic", "civics", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_civics_middle_school", subtask=True, @@ -5335,7 +5326,7 @@ class EvalGroup: name="Arabic Exams: Computer Science (Middle School)", description="Arabic MMLU - Computer science questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "computer-science", "middle-school"], + tags=["mcq", "arabic", "computer-science", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_computer_science_middle_school", subtask=True, @@ -5344,7 +5335,7 @@ class EvalGroup: name="Arabic Exams: Computer Science (Primary School)", description="Arabic MMLU - Computer science questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "computer-science", "primary-school"], + tags=["mcq", "arabic", "computer-science", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_computer_science_primary_school", subtask=True, @@ -5354,7 +5345,7 @@ class EvalGroup: name="Arabic Exams: Driving Test", description="Arabic MMLU - Driving test questions", category="domain-specific", - tags=["multiple-choice", "arabic", "driving"], + tags=["mcq", "arabic", "driving"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_driving_test", subtask=True, @@ -5364,7 +5355,7 @@ class EvalGroup: name="Arabic Exams: Economics (High School)", description="Arabic MMLU - Economics questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "economics", "high-school"], + tags=["mcq", "arabic", "economics", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_economics_high_school", subtask=True, @@ -5373,7 +5364,7 @@ class EvalGroup: name="Arabic Exams: Economics (Middle School)", description="Arabic MMLU - Economics questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "economics", "middle-school"], + tags=["mcq", "arabic", "economics", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_economics_middle_school", subtask=True, @@ -5382,7 +5373,7 @@ class EvalGroup: name="Arabic Exams: Economics (University)", description="Arabic MMLU - Economics questions from university-level exams", category="domain-specific", - tags=["multiple-choice", "arabic", "economics", "university"], + tags=["mcq", "arabic", "economics", "university"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_economics_university", subtask=True, @@ -5392,7 +5383,7 @@ class EvalGroup: name="Arabic Exams: General Knowledge", description="Arabic MMLU - General knowledge questions", category="domain-specific", - tags=["multiple-choice", "arabic", "general-knowledge"], + tags=["mcq", "arabic", "general-knowledge"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_general_knowledge", subtask=True, @@ -5401,7 +5392,7 @@ class EvalGroup: name="Arabic Exams: General Knowledge (Middle School)", description="Arabic MMLU - General knowledge 
questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "general-knowledge", "middle-school"], + tags=["mcq", "arabic", "general-knowledge", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_general_knowledge_middle_school", subtask=True, @@ -5410,7 +5401,7 @@ class EvalGroup: name="Arabic Exams: General Knowledge (Primary School)", description="Arabic MMLU - General knowledge questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "general-knowledge", "primary-school"], + tags=["mcq", "arabic", "general-knowledge", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_general_knowledge_primary_school", subtask=True, @@ -5420,7 +5411,7 @@ class EvalGroup: name="Arabic Exams: Geography (High School)", description="Arabic MMLU - Geography questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "geography", "high-school"], + tags=["mcq", "arabic", "geography", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_geography_high_school", subtask=True, @@ -5429,7 +5420,7 @@ class EvalGroup: name="Arabic Exams: Geography (Middle School)", description="Arabic MMLU - Geography questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "geography", "middle-school"], + tags=["mcq", "arabic", "geography", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_geography_middle_school", subtask=True, @@ -5438,7 +5429,7 @@ class EvalGroup: name="Arabic Exams: Geography (Primary School)", description="Arabic MMLU - Geography questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "geography", "primary-school"], + tags=["mcq", "arabic", "geography", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_geography_primary_school", subtask=True, @@ -5448,7 +5439,7 @@ class EvalGroup: name="Arabic Exams: History (High School)", description="Arabic MMLU - History questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "history", "high-school"], + tags=["mcq", "arabic", "history", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_history_high_school", subtask=True, @@ -5457,7 +5448,7 @@ class EvalGroup: name="Arabic Exams: History (Middle School)", description="Arabic MMLU - History questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "history", "middle-school"], + tags=["mcq", "arabic", "history", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_history_middle_school", subtask=True, @@ -5466,7 +5457,7 @@ class EvalGroup: name="Arabic Exams: History (Primary School)", description="Arabic MMLU - History questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "history", "primary-school"], + tags=["mcq", "arabic", "history", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_history_primary_school", subtask=True, @@ -5476,7 +5467,7 @@ class EvalGroup: name="Arabic Exams: Islamic Studies (High School)", description="Arabic MMLU - Islamic studies questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", 
"islamic-studies", "high-school"], + tags=["mcq", "arabic", "islamic-studies", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_islamic_studies_high_school", subtask=True, @@ -5485,7 +5476,7 @@ class EvalGroup: name="Arabic Exams: Islamic Studies (Middle School)", description="Arabic MMLU - Islamic studies questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "islamic-studies", "middle-school"], + tags=["mcq", "arabic", "islamic-studies", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_islamic_studies_middle_school", subtask=True, @@ -5494,7 +5485,7 @@ class EvalGroup: name="Arabic Exams: Islamic Studies (Primary School)", description="Arabic MMLU - Islamic studies questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "islamic-studies", "primary-school"], + tags=["mcq", "arabic", "islamic-studies", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_islamic_studies_primary_school", subtask=True, @@ -5504,7 +5495,7 @@ class EvalGroup: name="Arabic Exams: Law (Professional)", description="Arabic MMLU - Law questions from professional exams", category="domain-specific", - tags=["multiple-choice", "arabic", "law", "professional"], + tags=["mcq", "arabic", "law", "professional"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_law_professional", subtask=True, @@ -5514,7 +5505,7 @@ class EvalGroup: name="Arabic Exams: Management (University)", description="Arabic MMLU - Management questions from university-level exams", category="domain-specific", - tags=["multiple-choice", "arabic", "management", "university"], + tags=["mcq", "arabic", "management", "university"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_management_university", subtask=True, @@ -5524,7 +5515,7 @@ class EvalGroup: name="Arabic Exams: Natural Science (Middle School)", description="Arabic MMLU - Natural science questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "natural-science", "middle-school"], + tags=["mcq", "arabic", "natural-science", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_natural_science_middle_school", subtask=True, @@ -5533,7 +5524,7 @@ class EvalGroup: name="Arabic Exams: Natural Science (Primary School)", description="Arabic MMLU - Natural science questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "natural-science", "primary-school"], + tags=["mcq", "arabic", "natural-science", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_natural_science_primary_school", subtask=True, @@ -5543,7 +5534,7 @@ class EvalGroup: name="Arabic Exams: Philosophy (High School)", description="Arabic MMLU - Philosophy questions from high school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "philosophy", "high-school"], + tags=["mcq", "arabic", "philosophy", "high-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_philosophy_high_school", subtask=True, @@ -5553,7 +5544,7 @@ class EvalGroup: name="Arabic Exams: Political Science (University)", description="Arabic MMLU - Political science questions from university-level exams", category="domain-specific", - tags=["multiple-choice", "arabic", "political-science", "university"], + 
tags=["mcq", "arabic", "political-science", "university"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_political_science_university", subtask=True, @@ -5563,7 +5554,7 @@ class EvalGroup: name="Arabic Exams: Social Science (Middle School)", description="Arabic MMLU - Social science questions from middle school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "social-science", "middle-school"], + tags=["mcq", "arabic", "social-science", "middle-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_social_science_middle_school", subtask=True, @@ -5572,7 +5563,7 @@ class EvalGroup: name="Arabic Exams: Social Science (Primary School)", description="Arabic MMLU - Social science questions from primary school exams", category="domain-specific", - tags=["multiple-choice", "arabic", "social-science", "primary-school"], + tags=["mcq", "arabic", "social-science", "primary-school"], module_path="openbench.evals.arabic_exams", function_name="arabic_exams_social_science_primary_school", subtask=True, @@ -6666,7 +6657,7 @@ def get_eval_metadata(path_like: str) -> BenchmarkMetadata | None: ), "arabic_exams": EvalGroup( name="Arabic Exams", - description="Aggregate of 40+ Arabic exam tasks", + description="Aggregate of 40 Arabic exam tasks", benchmarks=[ "arabic_exams_accounting_university", "arabic_exams_arabic_language_general", @@ -6700,7 +6691,6 @@ def get_eval_metadata(path_like: str) -> BenchmarkMetadata | None: "arabic_exams_islamic_studies_primary_school", "arabic_exams_law_professional", "arabic_exams_management_university", - "arabic_exams_math_high_school", "arabic_exams_math_primary_school", "arabic_exams_natural_science_middle_school", "arabic_exams_natural_science_primary_school",