{
  "schema_version": "2026-05-14.3",
  "generated_at": "2026-06-25T08:28:53.664Z",
  "product": "BuildPilled agent-audit",
  "docs_url": "https://buildpilled.io/eval",
  "rubric_docs_url": "https://buildpilled.io/eval",
  "anchor_standards": [
    {
      "id": "NIST AI RMF v1.0 (NIST AI 100-1)",
      "role": "primary structural"
    },
    {
      "id": "NIST AI 600-1 GenAI Profile (Jul 2024)",
      "role": "primary technical"
    },
    {
      "id": "OWASP LLM Top 10 (2025)",
      "role": "engineer cross-reference"
    },
    {
      "id": "MITRE ATLAS",
      "role": "adversarial cross-reference"
    }
  ],
  "tiers": {
    "surface": {
      "price_usd": "25.00",
      "tagline": "Static categorization + system-prompt analysis. No live calls.",
      "what_runs": "Categorize the system, run rubric-driven static analysis on the system prompt + tool list, return a NIST-tagged findings report. No outbound traffic to your endpoints.",
      "status": "live"
    },
    "active": {
      "price_usd": "250.00",
      "tagline": "Attacker-agent runs against your declared endpoint, N=50.",
      "what_runs": "Surface tier + budget-capped attacker agent against your declared endpoint with reproductions. Available in limited rollout while the runner hardens.",
      "status": "limited_rollout"
    },
    "compliance": {
      "price_usd": "800.00",
      "tagline": "Active + code-scan (SAST) + compliance evidence pack.",
      "what_runs": "Active tier + MEASURE-2.6/2.11 probes + code-scan findings + an evidence pack suitable for SOC 2 / customer questionnaires.",
      "status": "planned"
    }
  },
  "rmf_coverage": [
    {
      "id": "MAP-2.1",
      "description": "Tasks, methods, and side-effect characterization of the AI system.",
      "coverage": {
        "surface": "covered",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MAP-2.2",
      "description": "Knowledge limits, scope-of-competence, refusal-on-uncertainty, and instruction-precedence in the system prompt.",
      "coverage": {
        "surface": "covered",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MAP-3.3",
      "description": "Targeted application scope vs. tool capability blast radius.",
      "coverage": {
        "surface": "covered",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MAP-3.4",
      "description": "Operator proficiency requirements vs. supporting affordances; autonomy interrupt/budget gates.",
      "coverage": {
        "surface": "covered",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MAP-4.1",
      "description": "Component and legal-risk surface, including dependencies.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "partial",
        "compliance": "covered"
      },
      "notes": "Compliance tier adds code-side dependency and source findings."
    },
    {
      "id": "MEASURE-1.1",
      "description": "Evidence that measurement approaches are appropriate.",
      "coverage": {
        "surface": "covered",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MEASURE-2.5",
      "description": "AI system is valid and reliable for the intended task.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "partial",
        "compliance": "covered"
      }
    },
    {
      "id": "MEASURE-2.6",
      "description": "AI system is safe — harm avoidance and dangerous-output gating.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "not_in_tier",
        "compliance": "covered"
      }
    },
    {
      "id": "MEASURE-2.7",
      "description": "AI system is secure and resilient — adversarial probes.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "covered",
        "compliance": "covered"
      },
      "notes": "Active tier adds budget-capped adversarial runs against declared endpoints. Surface uses static analysis only."
    },
    {
      "id": "MEASURE-2.8",
      "description": "Transparency and accountability characterization.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "partial",
        "compliance": "covered"
      }
    },
    {
      "id": "MEASURE-2.9",
      "description": "Model explanation and output interpretation evidence.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MEASURE-2.10",
      "description": "Privacy risk — PII leakage, training-data extraction.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "covered",
        "compliance": "covered"
      }
    },
    {
      "id": "MEASURE-2.11",
      "description": "Fairness and harmful-bias evaluation.",
      "coverage": {
        "surface": "not_in_tier",
        "active": "not_in_tier",
        "compliance": "covered"
      }
    }
  ],
  "rmf_excluded": [
    {
      "id": "GOVERN",
      "reason": "Organizational policy and accountability — not testable per-call."
    },
    {
      "id": "MANAGE",
      "reason": "Risk response and remediation processes — not testable per-call."
    },
    {
      "id": "MAP-1",
      "reason": "Context establishment beyond the technical surface."
    },
    {
      "id": "MAP-5",
      "reason": "Impact assessment beyond the technical surface."
    },
    {
      "id": "MEASURE-2.12",
      "reason": "Environmental impact and sustainability — not testable per-call."
    },
    {
      "id": "MEASURE-2.13",
      "reason": "TEVV effectiveness — process-level over time, not per-call."
    }
  ],
  "ai_600_1_risks": [
    {
      "id": "Confabulation",
      "tiers": [
        "surface",
        "active",
        "compliance"
      ]
    },
    {
      "id": "Dangerous, Violent, or Hateful Content",
      "tiers": [
        "active",
        "compliance"
      ]
    },
    {
      "id": "Data Privacy",
      "tiers": [
        "active",
        "compliance"
      ]
    },
    {
      "id": "Environmental Impact",
      "tiers": []
    },
    {
      "id": "Harmful Bias and Homogenization",
      "tiers": [
        "compliance"
      ]
    },
    {
      "id": "Human-AI Configuration",
      "tiers": [
        "surface",
        "active",
        "compliance"
      ]
    },
    {
      "id": "Information Integrity",
      "tiers": [
        "surface",
        "active",
        "compliance"
      ]
    },
    {
      "id": "Information Security",
      "tiers": [
        "surface",
        "active",
        "compliance"
      ]
    },
    {
      "id": "Intellectual Property",
      "tiers": [
        "compliance"
      ]
    },
    {
      "id": "Obscene, Degrading, and/or Abusive Content",
      "tiers": [
        "active",
        "compliance"
      ]
    },
    {
      "id": "Toxicity, Bias, and Homogenization",
      "tiers": [
        "compliance"
      ]
    },
    {
      "id": "Value Chain / Component Integration",
      "tiers": [
        "surface",
        "active",
        "compliance"
      ]
    }
  ],
  "latency": {
    "surface": {
      "p50_ms": null,
      "p95_ms": null,
      "measurement_status": "pending",
      "note": "Surface-tier latency-measurement instrumentation in flight; numbers will land here once we have a representative window. Manual single-call timing is currently sub-second."
    },
    "active": {
      "p50_ms": null,
      "p95_ms": null,
      "measurement_status": "tier_in_dev"
    },
    "compliance": {
      "p50_ms": null,
      "p95_ms": null,
      "measurement_status": "tier_in_dev"
    }
  },
  "eval_baselines": [
    {
      "name": "Anthropic computer-use",
      "archetype": "autonomous-loop, high-risk",
      "repo": "anthropics/claude-quickstarts",
      "commit": "4b2549e8",
      "license": "MIT",
      "findings_count": 5,
      "severity_max": "MEDIUM"
    },
    {
      "name": "Aider editblock coder",
      "archetype": "developer-facing, medium-risk",
      "repo": "Aider-AI/aider",
      "commit": "3ec8ec5a",
      "license": "Apache-2.0",
      "findings_count": 9,
      "severity_max": "HIGH"
    },
    {
      "name": "MCP filesystem (modelcontextprotocol/servers)",
      "archetype": "file-management, medium-risk",
      "repo": "modelcontextprotocol/servers",
      "commit": "4503e2d1",
      "license": "MIT",
      "findings_count": 12,
      "severity_max": "MEDIUM"
    }
  ],
  "eval_scorecard": {
    "schema_version": "agent-audit-eval-scorecard.v1",
    "evidence_gates": {
      "three_real_app_baselines": true,
      "hand_graded_historical_findings": false,
      "pipeline_comparison_against_garak_pyrit_llm": false
    },
    "aggregate": {
      "baseline_count": 3,
      "total_findings": 26,
      "severity_counts": {
        "critical": 0,
        "high": 1,
        "medium": 9,
        "low": 16,
        "informational": 0
      },
      "severity_max": "high",
      "covered_subcategories": [
        "MAP-2.1",
        "MAP-2.2",
        "MAP-3.3"
      ],
      "ai_600_1_risks": [
        "Confabulation",
        "Human-AI Configuration",
        "Information Integrity",
        "Information Security",
        "Value Chain / Component Integration"
      ],
      "owasp_llm_2025": [
        "LLM01",
        "LLM06",
        "LLM07",
        "LLM09"
      ],
      "mitre_atlas": [],
      "system_types": [
        "rag",
        "tool_using_agent"
      ]
    },
    "baselines": [
      {
        "id": "computer-use",
        "name": "Anthropic computer-use",
        "archetype": "autonomous-loop, high-risk",
        "repo": "anthropics/claude-quickstarts",
        "commit": "4b2549e8",
        "license": "MIT",
        "hand_grade_status": "pending",
        "tier": "surface",
        "rubric_version": "0.1.0",
        "report_generated_at": "2026-05-01T03:29:09.498Z",
        "system_type": "tool_using_agent",
        "tool_count": 3,
        "tool_side_effect_counts": {
          "executes_code": 1,
          "unknown": 2
        },
        "findings_count": 5,
        "severity_counts": {
          "critical": 0,
          "high": 0,
          "medium": 2,
          "low": 3,
          "informational": 0
        },
        "severity_max": "medium",
        "covered_subcategories": [
          "MAP-2.1",
          "MAP-2.2"
        ],
        "ai_600_1_risks": [
          "Confabulation",
          "Human-AI Configuration",
          "Information Integrity",
          "Information Security",
          "Value Chain / Component Integration"
        ],
        "owasp_llm_2025": [
          "LLM01",
          "LLM07",
          "LLM09"
        ],
        "mitre_atlas": []
      },
      {
        "id": "aider-editblock",
        "name": "Aider editblock coder",
        "archetype": "developer-facing, medium-risk",
        "repo": "Aider-AI/aider",
        "commit": "3ec8ec5a",
        "license": "Apache-2.0",
        "hand_grade_status": "pending",
        "tier": "surface",
        "rubric_version": "0.1.0",
        "report_generated_at": "2026-05-01T03:29:10.137Z",
        "system_type": "rag",
        "tool_count": 3,
        "tool_side_effect_counts": {
          "executes_code": 1,
          "writes_external_state": 2
        },
        "findings_count": 9,
        "severity_counts": {
          "critical": 0,
          "high": 1,
          "medium": 3,
          "low": 5,
          "informational": 0
        },
        "severity_max": "high",
        "covered_subcategories": [
          "MAP-2.1",
          "MAP-2.2",
          "MAP-3.3"
        ],
        "ai_600_1_risks": [
          "Confabulation",
          "Human-AI Configuration",
          "Information Integrity",
          "Information Security",
          "Value Chain / Component Integration"
        ],
        "owasp_llm_2025": [
          "LLM06",
          "LLM07",
          "LLM09"
        ],
        "mitre_atlas": []
      },
      {
        "id": "mcp-filesystem",
        "name": "MCP filesystem (modelcontextprotocol/servers)",
        "archetype": "file-management, medium-risk",
        "repo": "modelcontextprotocol/servers",
        "commit": "4503e2d1",
        "license": "MIT",
        "hand_grade_status": "pending",
        "tier": "surface",
        "rubric_version": "0.1.0",
        "report_generated_at": "2026-05-01T03:29:10.771Z",
        "system_type": "rag",
        "tool_count": 13,
        "tool_side_effect_counts": {
          "read_only": 9,
          "writes_external_state": 4
        },
        "findings_count": 12,
        "severity_counts": {
          "critical": 0,
          "high": 0,
          "medium": 4,
          "low": 8,
          "informational": 0
        },
        "severity_max": "medium",
        "covered_subcategories": [
          "MAP-2.1",
          "MAP-2.2",
          "MAP-3.3"
        ],
        "ai_600_1_risks": [
          "Confabulation",
          "Human-AI Configuration",
          "Information Integrity",
          "Information Security",
          "Value Chain / Component Integration"
        ],
        "owasp_llm_2025": [
          "LLM01",
          "LLM06",
          "LLM07",
          "LLM09"
        ],
        "mitre_atlas": []
      }
    ],
    "next_calibration_gates": [
      "Hand-grade at least 10 historical findings for true/false positive and severity alignment.",
      "Compare BP Surface output against Garak, PyRIT, and pure-LLM review on the same targets.",
      "Promote recurring calibration gaps into focused rubric tests before Active-tier launch."
    ]
  },
  "endpoints": {
    "surface": "https://buildpilled.io/agent-audit",
    "active": null,
    "compliance": null
  },
  "example_402_response": {
    "status": 402,
    "headers": {
      "content-type": "application/problem+json",
      "www-authenticate": "Payment realm=\"buildpilled.io\", method=\"stripe\", intent=\"charge\", id=\"<challenge-id>\", request=\"<base64-encoded-PaymentRequest>\""
    },
    "body": {
      "type": "https://paymentauth.org/problems/payment-required",
      "title": "Payment Required",
      "status": 402,
      "detail": "Payment is required (BuildPilled agent-audit · Surface tier).",
      "challengeId": "<challenge-id>",
      "challenge": {
        "id": "<challenge-id>",
        "method": "stripe",
        "intent": "charge",
        "realm": "buildpilled.io",
        "request": {
          "amount": "2500",
          "currency": "usd",
          "methodDetails": {
            "networkId": "buildpilled",
            "paymentMethodTypes": [
              "card"
            ]
          }
        },
        "description": "BuildPilled agent-audit · Surface tier",
        "expires": "<iso8601-expiry>"
      }
    }
  },
  "sandbox_verification": {
    "callable": true,
    "note": "Sandbox Shared Payment Tokens are accepted on the live Surface endpoint for no-cost verification. Use Stripe's sandbox card flow; receipts identify whether a call settled on the sandbox or live payment rail."
  }
}