PinchBench

· v1.0.0

High Risk

Run PinchBench benchmarks to evaluate OpenClaw agent performance across real-world tasks. Use when testing model capabilities, comparing models, submitting benchmark results to the leaderboard, or checking how well your OpenClaw setup handles calendar, email, research, coding, and multi-step workflows.

H:4 D:4 A:0 C:1

⚠️ Hazard Flags

EXEC CODE_EXEC FS_READ_USER FS_DELETE NET_EGRESS_ANY CREDS_ENV CREDS_FILES PI_WEB

📋 Capabilities

Execution

✅ Shell execution
✅ Code execution
❌ Install dependencies
❌ Persistence
Privilege: user

Filesystem

❌ Read workspace
❌ Write workspace
✅ Read home
❌ Write home
❌ Read system
✅ Delete

Network

Egress: any
❌ Ingress

Credentials

✅ Environment vars
✅ Credential files
❌ Browser data
❌ Keychain

Actions

❌ Send messages
❌ Post public
❌ Purchase
❌ Transfer money
❌ Deploy
❌ Delete external

🔒 Containment

Level: maximum

Required:

SANDBOX_CONTAINER: Code execution capability

Recommended:

LOG_ACTIONS: Audit trail for all actions

⚡ Risks

Unauthorized tool use: INSTRUCTED_GIT_CLONE_AND_BUILD, MCP_SYS_CRITICAL_ACCESS (severity: high)

Mitigation: Use pre-built packages or vendored dependencies instead of cloning repos

Command injection risk: MCP_SQL_BLIND, COMMAND_INJECTION_EVAL (severity: critical)

Mitigation: Remove SQL exploitation patterns.

Social engineering indicators: SOCIAL_ENG_VAGUE_DESCRIPTION, SOCIAL_ENG_ANTHROPIC_IMPERSONATION (severity: low)

Mitigation: Provide clear, detailed description of skill functionality

Data exfiltration patterns: DATA_EXFIL_ENV_VARS (severity: medium)

Mitigation: Minimize access to environment variables

Want a deeper analysis?

This report was generated by static analysis. Get an LLM-powered deep review with behavioral reasoning and attack surface mapping.

🧠 Deep Analysis — $5.00

🚨 Incident Response

Kill switch: Stop the agent process

Containment: Review logs for unexpected actions

Recovery: Depends on skill capabilities

📄 Raw SSDS JSON (click to expand)

{
  "meta": {
    "document_id": "ssds:auto:pinchbench:1.0.0",
    "ssds_version": "0.2.0",
    "scanner_version": "0.4.0+fe6fd9123d50",
    "created_at": "2026-03-05T14:22:02.778Z",
    "created_by": {
      "agent": "safeagentskills-cli/generate-ssds"
    },
    "language": "en",
    "notes": "Auto-generated SSDS. Manual review recommended."
  },
  "skill": {
    "name": "PinchBench",
    "version": "1.0.0",
    "format": "agent_skill",
    "description": "Run PinchBench benchmarks to evaluate OpenClaw agent performance across real-world tasks. Use when testing model capabilities, comparing models, submitting benchmark results to the leaderboard, or checking how well your OpenClaw setup handles calendar, email, research, coding, and multi-step workflows.",
    "publisher": "unknown",
    "source": {
      "channel": "local"
    },
    "artifact": {
      "sha256": "53497432c7c70dcb6e252bb445d03eb4d5ada4d6a9a31eb0cf69390d09b49954",
      "hash_method": "files_sorted"
    }
  },
  "capabilities": {
    "execution": {
      "can_exec_shell": true,
      "can_exec_code": true,
      "privilege_level": "user",
      "can_install_deps": false,
      "can_persist": false
    },
    "filesystem": {
      "reads_workspace": false,
      "reads_user_home": true,
      "reads_system": false,
      "writes_workspace": false,
      "writes_user_home": false,
      "writes_system": false,
      "can_delete": true
    },
    "network": {
      "egress": "any",
      "ingress": false
    },
    "credentials": {
      "reads_env_vars": true,
      "reads_credential_files": true,
      "reads_browser_data": false,
      "reads_keychain": false
    },
    "services": [],
    "actions": {
      "can_send_messages": false,
      "can_post_public": false,
      "can_purchase": false,
      "can_transfer_money": false,
      "can_deploy": false,
      "can_delete_external": false
    },
    "prompt_injection_surfaces": [
      "web"
    ],
    "content_types": [
      "general"
    ]
  },
  "hazards": {
    "hdac": {
      "H": 4,
      "D": 4,
      "A": 0,
      "C": 1
    },
    "flags": [
      "EXEC",
      "CODE_EXEC",
      "FS_READ_USER",
      "FS_DELETE",
      "NET_EGRESS_ANY",
      "CREDS_ENV",
      "CREDS_FILES",
      "PI_WEB"
    ],
    "custom_flags": [
      {
        "code": "FILE_DELETE",
        "name": "File Deletion",
        "description": "Can delete files in: scripts/lib_agent.py"
      },
      {
        "code": "TOOL_ABUSE",
        "name": "Unauthorized Tool Use",
        "description": "INSTRUCTED_GIT_CLONE_AND_BUILD, MCP_SYS_CRITICAL_ACCESS: Instructs agent to clone and potentially build from source"
      },
      {
        "code": "SOCIAL_ENGINEERING",
        "name": "Social Engineering Risk",
        "description": "SOCIAL_ENG_VAGUE_DESCRIPTION, SOCIAL_ENG_ANTHROPIC_IMPERSONATION: Skill description is too vague or missing"
      },
      {
        "code": "COMMAND_INJECTION",
        "name": "Command Injection Risk",
        "description": "MCP_SQL_BLIND, COMMAND_INJECTION_EVAL: Blind SQL injection, system table access, or stored procedure abuse"
      },
      {
        "code": "DATA_EXFILTRATION",
        "name": "Data Exfiltration Risk",
        "description": "DATA_EXFIL_ENV_VARS: Reading environment variables that may contain secrets"
      }
    ],
    "confidence": {
      "level": "medium",
      "basis": [
        "static_analysis"
      ],
      "notes": "Detected 5 security patterns (10 vendored rule hits). Review recommended."
    },
    "rationale": {
      "H": "H4: Critical: Privilege escalation or malware detected",
      "D": "D4: Critical: Credential theft or data exfiltration",
      "A": "A0: No side effects detected",
      "C": "C1: General content"
    }
  },
  "containment": {
    "level": "maximum",
    "required": [
      {
        "control": "SANDBOX_CONTAINER",
        "reason": "Code execution capability"
      }
    ],
    "recommended": [
      {
        "control": "LOG_ACTIONS",
        "reason": "Audit trail for all actions"
      }
    ],
    "uncontained_risk": "Risk level depends on manual review of actual capabilities."
  },
  "risks": {
    "risks": [
      {
        "risk": "Unauthorized tool use: INSTRUCTED_GIT_CLONE_AND_BUILD, MCP_SYS_CRITICAL_ACCESS",
        "severity": "high",
        "mitigation": "Use pre-built packages or vendored dependencies instead of cloning repos"
      },
      {
        "risk": "Command injection risk: MCP_SQL_BLIND, COMMAND_INJECTION_EVAL",
        "severity": "critical",
        "mitigation": "Remove SQL exploitation patterns."
      },
      {
        "risk": "Social engineering indicators: SOCIAL_ENG_VAGUE_DESCRIPTION, SOCIAL_ENG_ANTHROPIC_IMPERSONATION",
        "severity": "low",
        "mitigation": "Provide clear, detailed description of skill functionality"
      },
      {
        "risk": "Data exfiltration patterns: DATA_EXFIL_ENV_VARS",
        "severity": "medium",
        "mitigation": "Minimize access to environment variables"
      }
    ],
    "limitations": [
      "Static analysis only - runtime behavior not verified"
    ]
  },
  "incident_response": {
    "kill_switch": [
      "Stop the agent process"
    ],
    "containment": [
      "Review logs for unexpected actions"
    ],
    "recovery": [
      "Depends on skill capabilities"
    ]
  },
  "evidence": [
    {
      "evidence_id": "EV:file-1",
      "type": "file_excerpt",
      "title": "pyproject.toml",
      "file_path": "pyproject.toml"
    },
    {
      "evidence_id": "EV:file-2",
      "type": "file_excerpt",
      "title": "README.md",
      "file_path": "README.md"
    },
    {
      "evidence_id": "EV:file-3",
      "type": "file_excerpt",
      "title": "scripts/benchmark.py",
      "file_path": "scripts/benchmark.py"
    },
    {
      "evidence_id": "EV:file-4",
      "type": "file_excerpt",
      "title": "scripts/lib_agent.py",
      "file_path": "scripts/lib_agent.py"
    },
    {
      "evidence_id": "EV:file-5",
      "type": "file_excerpt",
      "title": "scripts/lib_grading.py",
      "file_path": "scripts/lib_grading.py"
    },
    {
      "evidence_id": "EV:file-6",
      "type": "file_excerpt",
      "title": "scripts/lib_tasks.py",
      "file_path": "scripts/lib_tasks.py"
    },
    {
      "evidence_id": "EV:file-7",
      "type": "file_excerpt",
      "title": "scripts/lib_upload.py",
      "file_path": "scripts/lib_upload.py"
    },
    {
      "evidence_id": "EV:file-8",
      "type": "file_excerpt",
      "title": "scripts/run.sh",
      "file_path": "scripts/run.sh"
    },
    {
      "evidence_id": "EV:file-9",
      "type": "file_excerpt",
      "title": "SKILL.md",
      "file_path": "SKILL.md"
    },
    {
      "evidence_id": "EV:file-10",
      "type": "file_excerpt",
      "title": "_meta.json",
      "file_path": "_meta.json"
    },
    {
      "evidence_id": "EV:cisco-1",
      "type": "file_excerpt",
      "title": "INSTRUCTED_GIT_CLONE_AND_BUILD [MEDIUM] README.md:27: git clone https://github.com/pinchbench/skill.git",
      "file_path": "README.md"
    },
    {
      "evidence_id": "EV:cisco-2",
      "type": "file_excerpt",
      "title": "MCP_SQL_BLIND [HIGH] scripts/benchmark.py:298: time.sleep(5)",
      "file_path": "scripts/benchmark.py"
    },
    {
      "evidence_id": "EV:cisco-3",
      "type": "file_excerpt",
      "title": "TIRITH_ANSI_ESCAPE_IN_STRING [HIGH] scripts/benchmark.py:279: colored_lines.append(f\"\\x1b[38;2;255;{green_blue};{green_blue}m{line}\\x1b[0m\")",
      "file_path": "scripts/benchmark.py"
    },
    {
      "evidence_id": "EV:cisco-4",
      "type": "file_excerpt",
      "title": "MCP_SQL_BLIND [HIGH] scripts/lib_agent.py:294: time.sleep(1.0)",
      "file_path": "scripts/lib_agent.py"
    },
    {
      "evidence_id": "EV:cisco-5",
      "type": "file_excerpt",
      "title": "COMMAND_INJECTION_EVAL [CRITICAL] scripts/lib_grading.py:94: exec(grading_code, namespace)",
      "file_path": "scripts/lib_grading.py"
    },
    {
      "evidence_id": "EV:cisco-6",
      "type": "file_excerpt",
      "title": "DATA_EXFIL_ENV_VARS [MEDIUM] scripts/lib_upload.py:286: env_token = os.environ.get(\"PINCHBENCH_TOKEN\")",
      "file_path": "scripts/lib_upload.py"
    },
    {
      "evidence_id": "EV:cisco-7",
      "type": "file_excerpt",
      "title": "MCP_SYS_CRITICAL_ACCESS [HIGH] scripts/run.sh:1: #!/usr/bin/env bash",
      "file_path": "scripts/run.sh"
    },
    {
      "evidence_id": "EV:cisco-8",
      "type": "file_excerpt",
      "title": "SOCIAL_ENG_VAGUE_DESCRIPTION [LOW] SKILL.md:1: ---",
      "file_path": "SKILL.md"
    },
    {
      "evidence_id": "EV:cisco-9",
      "type": "file_excerpt",
      "title": "SOCIAL_ENG_ANTHROPIC_IMPERSONATION [MEDIUM] SKILL.md:27: uv run benchmark.py --model anthropic/claude-sonnet-4",
      "file_path": "SKILL.md"
    },
    {
      "evidence_id": "EV:cisco-10",
      "type": "file_excerpt",
      "title": "MCP_SQL_BLIND [HIGH] SKILL.md:88: # Run benchmark (auto-uploads with token)",
      "file_path": "SKILL.md"
    }
  ]
}