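// nameToBackend values: MATH, BBH, DROP, GSM8k, AGIEval, TriviaQA, MMLU,
// HellaSwag, BoolQ, GPQA, PIQA, OpenBookQA, ARC, CommonsenseQA, SIQA,
// WinoGrande, needlehaystack (the MBPP entry is currently commented out).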

/**
 * @typedef {Object} BenchmarkTest
 * @property {number} id - Numeric id, unique within this list.
 * @property {string} name - Lowercase key for the benchmark.
 * @property {string} nameToBackend - Benchmark name in its canonical casing;
 *   presumably the identifier the backend expects (confirm against the API).
 * @property {string} displayName - Human-readable label.
 * @property {string} description - One-sentence summary of what is measured.
 * @property {string[]} categories - Category labels for the benchmark.
 */

/**
 * Static catalogue of the available benchmark tests.
 *
 * Ids are not contiguous: id 7 (MBPP) is commented out below, so consumers
 * must not treat `id` as an array index.
 *
 * @type {BenchmarkTest[]}
 */
export const tests = [
  {
    id: 1,
    name: 'math',
    nameToBackend: 'MATH',
    displayName: 'MATH',
    description: 'Measures performance on mathematical problem-solving tasks.',
    categories: ['Mathematics'],
  },
  {
    id: 2,
    name: 'bbh',
    nameToBackend: 'BBH',
    displayName: 'BIG-Bench',
    description:
      'Assesses performance on difficult and challenging benchmarks from the BIG-Bench dataset.',
    categories: ['Problem-Solving'],
  },
  {
    id: 3,
    name: 'drop',
    nameToBackend: 'DROP',
    displayName: 'DROP',
    description:
      "Evaluates the model's ability to perform discrete reasoning in reading comprehension tasks.",
    categories: ['Reasoning'],
  },
  {
    id: 4,
    name: 'gsm8k',
    nameToBackend: 'GSM8k',
    displayName: 'GSM8k',
    description:
      'Performance on the GSM8k dataset, which consists of grade-school math problems.',
    categories: ['Mathematics'],
  },
  {
    id: 5,
    name: 'agieval',
    nameToBackend: 'AGIEval',
    displayName: 'AGI-Eval',
    description:
      'Measures general AI problem-solving abilities, often across a diverse set of tasks.',
    categories: ['General AI'],
  },
  {
    id: 6,
    name: 'triviaqa',
    nameToBackend: 'TriviaQA',
    displayName: 'TriviaQA',
    description:
      "Tests the model's ability to answer trivia questions, reflecting its fact-recall capabilities.",
    categories: ['Knowledge'],
  },
  // NOTE(review): MBPP is disabled; the reason is not visible in this file.
  // MBPP is a code-generation benchmark, not multiple-choice, so revisit the
  // description below before re-enabling the entry.
  // {
  //   id: 7,
  //   name: 'mbpp',
  //   nameToBackend: 'MBPP',
  //   displayName: 'MBPP',
  //   description:
  //     "Evaluates the model's ability to solve multiple-choice programming problems.",
  //   categories: ['Programming'],
  // },
  {
    id: 8,
    name: 'mmlu',
    nameToBackend: 'MMLU',
    displayName: 'MMLU',
    description:
      'Assesses performance across a wide range of subjects and tasks.',
    categories: ['Problem-Solving'],
  },
  {
    id: 9,
    name: 'hellaswag',
    nameToBackend: 'HellaSwag',
    displayName: 'HellaSwag',
    description: "Tests the model's commonsense reasoning capabilities.",
    categories: ['Reasoning'],
  },
  {
    id: 10,
    name: 'boolq',
    nameToBackend: 'BoolQ',
    displayName: 'BoolQ',
    description:
      "Measures the model's performance on binary (yes/no) questions.",
    categories: ['Problem-Solving'],
  },
  {
    id: 11,
    name: 'gpqa',
    nameToBackend: 'GPQA',
    displayName: 'GPQA',
    // GPQA stands for "Graduate-Level Google-Proof Q&A"; it is not a
    // general-purpose QA benchmark.
    description:
      'Evaluates performance on graduate-level, Google-proof multiple-choice questions in biology, physics, and chemistry.',
    categories: ['Problem-Solving'],
  },
  {
    id: 12,
    name: 'piqa',
    nameToBackend: 'PIQA',
    displayName: 'PIQA',
    description: "Assesses the model's understanding of physical interactions.",
    categories: ['Problem-Solving'],
  },
  {
    id: 13,
    name: 'openbookqa',
    nameToBackend: 'OpenBookQA',
    displayName: 'OpenBookQA',
    description:
      'Evaluates performance on open-book question-answering tasks, testing both knowledge and reasoning.',
    categories: ['Knowledge'],
  },
  {
    id: 14,
    name: 'arc',
    nameToBackend: 'ARC',
    displayName: 'ARC',
    description:
      'Measures the ability to answer questions that require reasoning.',
    categories: ['Problem-Solving'],
  },
  {
    id: 15,
    name: 'commonsenseqa',
    nameToBackend: 'CommonsenseQA',
    displayName: 'CommonsenseQA',
    description: 'Tests commonsense reasoning abilities.',
    categories: ['Reasoning'],
  },
  {
    id: 16,
    name: 'siqa',
    nameToBackend: 'SIQA',
    displayName: 'SIQA',
    description:
      'Evaluates social intelligence by assessing how well the model understands social situations.',
    categories: ['Problem-Solving'],
  },
  {
    id: 17,
    name: 'winogrande',
    nameToBackend: 'WinoGrande',
    displayName: 'WinoGrande',
    description:
      'Measures the model’s ability to resolve coreference in a sentence.',
    categories: ['Problem-Solving'],
  },
  {
    id: 18,
    name: 'needlehaystack',
    nameToBackend: 'needlehaystack',
    displayName: 'Needle in a Haystack',
    description: "Measures the model's ability to find a needle in a haystack.",
    categories: ['Problem-Solving'],
  },
];

/**
 * Backend names of the benchmarks grouped under "MixEval"
 * (presumably the MixEval benchmark mixture; confirm with the consumer).
 *
 * The list is in alphabetical order, and each name equals the
 * `nameToBackend` of an entry in `tests`. 'needlehaystack' is the only
 * active `tests` entry that is not listed here.
 *
 * @type {string[]}
 */
// prettier-ignore
export const mixEvalTests = [
  'AGIEval', 'ARC', 'BBH', 'BoolQ',
  'CommonsenseQA', 'DROP', 'GPQA', 'GSM8k',
  'HellaSwag', 'MATH', 'MMLU', 'OpenBookQA',
  'PIQA', 'SIQA', 'TriviaQA', 'WinoGrande',
];
