{"ok":true,"data":{"count":2,"results":[{"id":"baseline-2026-q2-chat-quality","capability":"chat","prompt_id":"subject-grader-math","dataset":"gsm8k-50","generated_at":"2026-05-25T08:00:00Z","notes":"GSM8K math reasoning baseline. Quality measured by gold-answer exact match + step-correctness LLM judge.","vendor_count":5},{"id":"baseline-2026-q2-translate-zh-id","capability":"translate","prompt_id":"mandarea-indo-tutor","dataset":"mandarea-eval-100","generated_at":"2026-05-26T03:00:00Z","notes":"Indonesian → Chinese translation. Quality scored by native bilingual reviewers (1-5 scale, normalized to 0-1).","vendor_count":3}],"note":"Seeded baselines. T1 will replace these with live eval runs. POST /system/eval/run to queue a run."}}