From d5c855ece0d61e3c604f635bb35df4f67b14e088 Mon Sep 17 00:00:00 2001
From: Muhammad Ubaid Raza <mubaidr@gmail.com>
Date: Thu, 14 May 2026 05:02:32 +0500
Subject: [PATCH] feat: [gem-team] Add confidence metric, optimize planner
 workflow (#1695)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: add explicit assumption rule and confidence metric to agent documentation

- Add `confidence` field (0‑1) to the output schema in `agents/gem-browser-tester.agent.md`
- Include `confidence` in the `extra` object of `agents/gem-devops.agent.md`
- Append the guideline “State assumptions explicitly; never guess silently” to all agent docs
- Update the “Bisect (Complex Only)” heading to reflect its gate condition
- Minor wording and formatting adjustments across the affected agent documents

* chore: update readme

* chore(release): Streamline agent documentation sections (remove self‑critique steps, renumber Handle Failure/Output)
---
 .github/plugin/marketplace.json             |  2 +-
 agents/gem-browser-tester.agent.md          | 13 ++---
 agents/gem-code-simplifier.agent.md         | 12 ++---
 agents/gem-critic.agent.md                  | 18 +++----
 agents/gem-debugger.agent.md                | 54 +++++++++-----------
 agents/gem-designer-mobile.agent.md         |  3 ++
 agents/gem-designer.agent.md                |  3 ++
 agents/gem-devops.agent.md                  | 16 +++---
 agents/gem-documentation-writer.agent.md    | 13 +++--
 agents/gem-implementer-mobile.agent.md      | 17 +++----
 agents/gem-implementer.agent.md             | 17 +++----
 agents/gem-mobile-tester.agent.md           | 15 +++---
 agents/gem-orchestrator.agent.md            | 56 ++++++++++-----------
 agents/gem-planner.agent.md                 |  8 ++-
 agents/gem-researcher.agent.md              | 26 +++++-----
 agents/gem-reviewer.agent.md                | 43 +++++++---------
 docs/README.plugins.md                      |  2 +-
 plugins/gem-team/.github/plugin/plugin.json | 20 +-------
 plugins/gem-team/README.md                  | 10 ++--
 19 files changed, 158 insertions(+), 190 deletions(-)
diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json
index 2c817bb7..ada24e2f 100644
--- a/.github/plugin/marketplace.json
+++ b/.github/plugin/marketplace.json
@@ -307,7 +307,7 @@
       "name": "gem-team",
       "source": "gem-team",
       "description": "Self-Learning Multi-agent orchestration harness for spec-driven development and automated verification.",
-      "version": "1.20.0"
+      "version": "1.24.0"
     },
     {
       "name": "git-ape",
diff --git a/agents/gem-browser-tester.agent.md b/agents/gem-browser-tester.agent.md
index ddca369c..da9a86e6 100644
--- a/agents/gem-browser-tester.agent.md
+++ b/agents/gem-browser-tester.agent.md
@@ -107,24 +107,19 @@ For each step in flow.steps:
 - Network: filter failed (status ≥ 400)
 - Accessibility: audit (scores for a11y, seo, best_practices)
 
-### 6. Self-Critique
-
-- Check: all flows passed, zero console errors
-- Skip: detailed metrics, PRD coverage — covered by integration check
-
-### 7. Handle Failure
+### 6. Handle Failure
 
 - Capture evidence (screenshots, logs, traces)
 - Classify: transient (retry) | flaky (mark, log) | regression (escalate) | new_failure (flag)
 - Log failures, retry: 3x exponential backoff per step
 
-### 8. Cleanup
+### 7. Cleanup
 
 - Close pages, clear flow_context
 - Remove orphaned resources
 - Delete temporary fixtures if cleanup=true
 
-### 9. Output
+### 8. Output
 
 Return JSON per `Output Format`
 </workflow>
@@ -208,6 +203,7 @@ Use `${fixtures.field.path}` for variable interpolation.
     "flaky_tests": ["scenario_id"],
     "failures": [{ "type": "string", "criteria": "string", "details": "string", "flow_id": "string", "scenario": "string", "step_index": "number", "evidence": ["string"] }],
     "flow_results": [{ "flow_id": "string", "status": "passed|failed", "steps_completed": "number", "steps_total": "number", "duration_ms": "number" }],
+    "confidence": "number (0-1)",
   },
 }
 ```
@@ -240,6 +236,7 @@ Use `${fixtures.field.path}` for variable interpolation.
 - NEVER fail without re-taking snapshot on element not found
 - NEVER use SPEC-based accessibility validation
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
diff --git a/agents/gem-code-simplifier.agent.md b/agents/gem-code-simplifier.agent.md
index caff6bf2..548763c9 100644
--- a/agents/gem-code-simplifier.agent.md
+++ b/agents/gem-code-simplifier.agent.md
@@ -140,19 +140,14 @@ CODE SIMPLIFIER. Mission: remove dead code, reduce complexity, consolidate dupli
 - Ensure no broken imports/references
 - Check no functionality broken
 
-### 5. Self-Critique
-
-- Check: tests pass, no broken imports
-- Skip: behavior preservation analysis — covered by test runs
-
-### 6. Handle Failure
+### 5. Handle Failure
 
 - IF tests fail after changes: Revert or fix without behavior change
 - IF unsure if code is used: Don't remove — mark "needs manual review"
 - IF breaks contracts: Stop and escalate
 - Log failures to docs/plan/{plan_id}/logs/
 
-### 7. Output
+### 6. Output
 
 Return JSON per `Output Format`
 </workflow>
@@ -227,6 +222,9 @@ Return JSON per `Output Format`
 - MUST verify tests pass after every change
 - Use existing tech stack. Preserve patterns — don't introduce new abstractions.
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum code, nothing speculative
+- Surgical changes, don't refactor adjacent code
 
 ### I/O Optimization
 
diff --git a/agents/gem-critic.agent.md b/agents/gem-critic.agent.md
index 267a6188..923f39fe 100644
--- a/agents/gem-critic.agent.md
+++ b/agents/gem-critic.agent.md
@@ -103,18 +103,12 @@ When reviewing all changes from completed plan:
 - Offer alternatives, not just criticism
 - Acknowledge what works well (balanced critique)
 
-### 5. Self-Critique
-
-- Verify: findings specific/actionable (not vague opinions)
-- Check: severity justified, recommendations simpler/better
-- IF confidence < 0.85: re-analyze expanded (max 2 loops)
-
-### 6. Handle Failure
+### 5. Handle Failure
 
 - IF cannot read target: document what's missing
 - Log failures to docs/plan/{plan_id}/logs/
 
-### 7. Output
+### 6. Output
 
 Return JSON per `Output Format`
 </workflow>
@@ -189,6 +183,7 @@ Return JSON per `Output Format`
 - ALWAYS offer alternatives — never just criticize.
 - Use project's existing tech stack. Challenge mismatches.
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
@@ -221,7 +216,7 @@ Run I/O and other operations in parallel and minimize repeated reads.
 - Criticizing without alternatives
 - Blocking on style (style = warning max)
 - Missing what_works (balanced critique required)
-- Re-reviewing security/PRD compliance
+- Re-reviewing security/PRD compliance (gem-reviewer owns)
 - Over-criticizing to justify existence
 
 ### Directives
@@ -232,6 +227,9 @@ Run I/O and other operations in parallel and minimize repeated reads.
 - Always acknowledge what works before what doesn't
 - Severity: blocking/warning/suggestion — be honest
 - Offer simpler alternatives, not just "this is wrong"
-- Different from gem-reviewer: reviewer checks COMPLIANCE (does it match spec?), critic challenges APPROACH (is the approach correct?)
+- gem-critic vs gem-code-simplifier:
+  - gem-critic: challenges plans, code approaches, identifies problems
+  - gem-code-simplifier: executes refactoring tasks (assigned by planner)
+  - gem-critic does NOT do code modifications
 
 </rules>
diff --git a/agents/gem-debugger.agent.md b/agents/gem-debugger.agent.md
index 114069df..1ef0b233 100644
--- a/agents/gem-debugger.agent.md
+++ b/agents/gem-debugger.agent.md
@@ -113,13 +113,15 @@ DEBUGGER. Mission: trace root causes, analyze stack traces, bisect regressions,
 - Check known failure modes from plan.yaml
 - Identify anti-patterns causing this error type
 
-### 4. Bisect (Complex Only)
+### 4. Bisect (Complex Only) (Gate: stack trace + git blame insufficient)
 
 #### 4.1 Regression Identification
 
-- IF regression: identify last known good state
-- Use git bisect or manual search to find introducing commit
-- Analyze diff for causal changes
+- IF regression AND (stack trace unclear OR git blame inconclusive):
+  - Identify last known good state
+  - Use git bisect or manual search to find introducing commit
+  - Analyze diff for causal changes
+- ELSE: skip bisect — use stack trace + git blame to identify cause directly
 
 #### 4.2 Interaction Analysis
 
@@ -201,43 +203,34 @@ adb pull /data/anr/traces.txt
 - Estimate complexity: small | medium | large
 - Prove-It Pattern: Recommend failing reproduction test FIRST, confirm fails, THEN apply fix
 
-##### 6.2.1 ESLint Rule Recommendations
+##### 6.2.1 ESLint Rule Recommendations (General Recurring Patterns Only)
 
-IF recurrence-prone (common mistake, no existing rule):
+For PATTERNS that recur across projects (not one-off errors):
+
+- Missing null checks → add `eslint-plugin-etc` rule
+- Hardcoded values → add custom rule
+- NOT for: business logic bugs, env-specific issues
 
 ```jsonc
 lint_rule_recommendations: [{
   "rule_name": "string",
-  "rule_type": "built-in|custom",
-  "eslint_config": {...},
-  "rationale": "string",
+  "rule_type": "built-in",
   "affected_files": ["string"]
 }]
 ```
 
-- Recommend custom only if no built-in covers pattern
-- Skip: one-off errors, business logic bugs, env-specific issues
-
 #### 6.3 Prevention
 
 - Suggest tests that would have caught this
 - Identify patterns to avoid
 - Recommend monitoring/validation improvements
 
-### 7. Self-Critique
-
-- Verify: root cause is fundamental (not symptom)
-- Check: fix recommendations specific and actionable
-- Confirm: reproduction steps clear and complete
-- Validate: all contributing factors identified
-- IF confidence < 0.85: re-run expanded (max 2 loops)
-
-### 8. Handle Failure
+### 7. Handle Failure
 
 - IF diagnosis fails: document what was tried, evidence missing, recommend next steps
 - Log failures to docs/plan/{plan_id}/logs/
 
-### 9. Output
+### 8. Output
 
 Return JSON per `Output Format`
 </workflow>
@@ -283,19 +276,21 @@ Return JSON per `Output Format`
   "summary": "[≤3 sentences]",
   "failure_type": "transient|fixable|needs_replan|escalate",
   "extra": {
-    "root_cause": { "description": "string", "location": "string", "error_type": "string" }, // omit causal_chain
-    "reproduction": { "confirmed": "boolean", "steps": ["string"] }, // omit environment unless critical
-    "fix_recommendations": [{ "approach": "string", "location": "string" }], // omit complexity, trade_offs
-    "lint_rule_recommendations": [{ "rule_name": "string", "affected_files": ["string"] }], // omit eslint_config, rationale
-    "prevention": { "suggested_tests": ["string"] }, // omit patterns_to_avoid
+    "root_cause": { "description": "string", "location": "string", "error_type": "string" },
+    "reproduction": { "confirmed": "boolean", "steps": ["string"] },
+    "fix_recommendations": [{ "approach": "string", "location": "string" }],
+    "lint_rule_recommendations": [{ "rule_name": "string", "affected_files": ["string"] }],
+    "prevention": { "suggested_tests": ["string"] },
     "confidence": "number (0-1)",
   },
-  "diagnosis": { "root_cause": "string" }, // omit affected_files, confidence - already in extra
+  "diagnosis": { "root_cause": "string" },
   "recommendation": { "type": "fix|refactor|replan", "description": "string" },
-  "learnings": { "patterns": ["string"], "gotchas": ["string"] }, // EMPTY IS OK - skip unless non-empty
+  "learnings": { "patterns": ["string"], "gotchas": ["string"] },
 }
 ```
 
+NOTE: ESLint recommendations are for general recurring patterns only (not project-specific bugs).
+
 </output_format>
 
 <rules>
@@ -323,6 +318,7 @@ Return JSON per `Output Format`
 - NEVER implement fixes — only diagnose and recommend
 - Cite sources for every claim
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
diff --git a/agents/gem-designer-mobile.agent.md b/agents/gem-designer-mobile.agent.md
index 4782db52..c3554a82 100644
--- a/agents/gem-designer-mobile.agent.md
+++ b/agents/gem-designer-mobile.agent.md
@@ -366,6 +366,9 @@ Return JSON per `Output Format`
 - For patterns: Component architecture, state management, responsive patterns
 - Use project's existing tech stack. No new styling solutions.
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum code, nothing speculative
+- Surgical changes, don't refactor adjacent code
 
 ### I/O Optimization
 
diff --git a/agents/gem-designer.agent.md b/agents/gem-designer.agent.md
index e24539ea..15995d5f 100644
--- a/agents/gem-designer.agent.md
+++ b/agents/gem-designer.agent.md
@@ -305,6 +305,9 @@ Return JSON per `Output Format`
 - For patterns: Use component architecture, state management, responsive patterns
 - Use project's existing tech stack. No new styling solutions.
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum code, nothing speculative
+- Surgical changes, don't refactor adjacent code
 
 ### I/O Optimization
 
diff --git a/agents/gem-devops.agent.md b/agents/gem-devops.agent.md
index 5ba54183..408a6dbb 100644
--- a/agents/gem-devops.agent.md
+++ b/agents/gem-devops.agent.md
@@ -154,17 +154,12 @@ Production Readiness:
 
 - Run health checks, verify resources allocated, check CI/CD status
 
-### 5. Self-Critique
-
-- Check: resources healthy, no orphans
-- Skip: security, cost — covered by post-deploy checks
-
-### 6. Handle Failure
+### 5. Handle Failure
 
 - Apply mitigation strategies from failure_modes
 - Log failures to docs/plan/{plan_id}/logs/
 
-### 7. Output
+### 6. Output
 
 Return JSON per `Output Format`
 </workflow>
@@ -201,7 +196,9 @@ Return JSON per `Output Format`
   "plan_id": "[plan_id]",
   "summary": "[≤3 sentences]",
   "failure_type": "transient|fixable|needs_replan|escalate",
-  "extra": {},
+  "extra": {
+    "confidence": "number (0-1)",
+  },
 }
 ```
 
@@ -230,6 +227,9 @@ Return JSON per `Output Format`
 - Atomic operations preferred
 - Verify health checks pass before completing
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum code, nothing speculative
+- Surgical changes, don't refactor adjacent code
 
 ### I/O Optimization
 
diff --git a/agents/gem-documentation-writer.agent.md b/agents/gem-documentation-writer.agent.md
index 47ebf1bf..63ed35b6 100644
--- a/agents/gem-documentation-writer.agent.md
+++ b/agents/gem-documentation-writer.agent.md
@@ -71,6 +71,7 @@ DOCUMENTATION WRITER. Mission: write technical docs, generate diagrams, maintain
 #### 2.5 AGENTS.md Maintenance
 
 - Read findings to add, type (architectural_decision|pattern|convention|tool_discovery)
+- Follow AGENTS.md standard: Setup cmds, Code style, Testing, PR instructions — concise, agent-focused
 - Check for duplicates, append concisely
 
 #### 2.6 Memory Update
@@ -136,16 +137,11 @@ DOCUMENTATION WRITER. Mission: write technical docs, generate diagrams, maintain
 - Documentation: verify code parity
 - Update: verify delta parity
 
-### 5. Self-Critique
-
-- Check: coverage_matrix addressed, no missing sections
-- Skip: readability — subjective; no deep parity check
-
-### 6. Handle Failure
+### 5. Handle Failure
 
 - Log failures to docs/plan/{plan_id}/logs/
 
-### 7. Output
+### 6. Output
 
 Return JSON per `Output Format`
 
@@ -211,6 +207,7 @@ Return JSON per `Output Format`
     "memory_updated": [{ "path": "string", "type": "patterns|gotchas|fixes|user_prefs", "count": "number" }],
     "parity_verified": "boolean",
     "coverage_percentage": "number",
+    "confidence": "number (0-1)",
   },
 }
 ```
@@ -320,6 +317,8 @@ metadata:
 - NEVER use generic boilerplate (match project style)
 - Document actual tech stack, not assumed
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- minimum content, nothing speculative
 
 ### I/O Optimization
 
diff --git a/agents/gem-implementer-mobile.agent.md b/agents/gem-implementer-mobile.agent.md
index 199512c5..d84c15eb 100644
--- a/agents/gem-implementer-mobile.agent.md
+++ b/agents/gem-implementer-mobile.agent.md
@@ -65,15 +65,10 @@ IMPLEMENTER-MOBILE. Mission: write mobile code using TDD (Red-Green-Refactor) fo
 
 #### 3.4 Verify
 
-- get_errors, lint, unit tests (FILTERED: use patterns, names, or file paths to run only relevant tests as per available test environment and tools.)
-- Pre-existing failures: Fix them too — code in your scope is your responsibility
-- Check acceptance criteria
-- Verify on simulator/emulator (Metro clean, no redbox)
-
-#### 3.5 Self-Critique
-
-- Check: no hardcoded values/dimensions
-- Skip: edge cases, platform compliance — covered by integration check
+- get_errors (syntax only)
+- Verify against acceptance_criteria
+- Platform sanity: Metro clean, no redbox
+- SKIP: lint, unit tests, build verification (Reviewer owns per 6.1.3)
 
 ### 4. Error Recovery
 
@@ -127,6 +122,7 @@ Return JSON per `Output Format`
   "extra": {
     "execution_details": { "files_modified": "number", "lines_changed": "number", "time_elapsed": "string" },
     "test_results": { "total": "number", "passed": "number", "failed": "number", "coverage": "string" },
+    "confidence": "number (0-1)",
     "platform_verification": { "ios": "pass|fail|skipped", "android": "pass|fail|skipped", "metro_output": "string" },
     "learnings": {
       "patterns": [
@@ -193,6 +189,9 @@ Return JSON per `Output Format`
 - Use existing tech stack, test frameworks, build tools
 - Cite sources for every claim
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum code, nothing speculative
+- Surgical changes, don't refactor adjacent code
 
 ### I/O Optimization
 
diff --git a/agents/gem-implementer.agent.md b/agents/gem-implementer.agent.md
index 4a1b4978..d9d94847 100644
--- a/agents/gem-implementer.agent.md
+++ b/agents/gem-implementer.agent.md
@@ -64,14 +64,9 @@ IMPLEMENTER. Mission: write code using TDD (Red-Green-Refactor). Deliver: workin
 
 #### 3.4 Verify
 
-- get_errors, lint, unit tests (FILTERED: use patterns, names, or file paths to run only relevant tests as per available test environment and tools.)
-- Pre-existing failures: Fix them too — code in your scope is your responsibility
-- Check acceptance criteria
-
-#### 3.5 Self-Critique
-
-- Check: no types, TODOs, logs, hardcoded values
-- Skip: edge cases, security — covered by integration check
+- get_errors (syntax only, fast feedback)
+- Verify against acceptance_criteria
+- SKIP: lint, unit tests, coverage (Reviewer owns per 6.1.3)
 
 ### 4. Handle Failure
 
@@ -128,6 +123,7 @@ Return JSON per `Output Format`
       "failed": "number",
       "coverage": "string",
     },
+    "confidence": "number (0-1)",
     "learnings": {
       "facts": ["string"], // max 3 - simple strings, skip if obvious
       "patterns": [], // EMPTY IS OK - only emit if confidence ≥0.9 AND needed
@@ -161,7 +157,7 @@ MUST output `learnings` with clear type discrimination:
 
 facts[] → Memory: Discoveries, context ("Project uses Go 1.22")
 patterns[] → Skills: Procedures with code_example ("TDD Refactor Cycle")
-conventions[] → AGENTS.md proposals: Static rules ("Use strict TS")
+conventions[] → AGENTS.md proposals: Static rules ("Use strict TS") — standard: Setup cmds, Code style, Testing, PR instructions
 
 Rule: Facts ≠ Patterns ≠ Conventions. Never duplicate across systems.
 
@@ -184,6 +180,9 @@ Implementer provides KNOWLEDGE; Orchestrator routes; Doc-writer structures appro
 - Use existing tech stack, test frameworks, build tools
 - Cite sources for every claim
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum code, nothing speculative
+- Surgical changes, don't refactor adjacent code
 
 ### I/O Optimization
 
diff --git a/agents/gem-mobile-tester.agent.md b/agents/gem-mobile-tester.agent.md
index 40cca92f..eecc9e62 100644
--- a/agents/gem-mobile-tester.agent.md
+++ b/agents/gem-mobile-tester.agent.md
@@ -146,18 +146,13 @@ For each platform in task_definition.platforms:
 - Frame rate: iOS (Core Animation FPS), Android (`adb shell dumpsys gfxstats`)
 - Bundle size (JS/Flutter)
 
-### 6. Self-Critique
-
-- Check: all tests passed, zero crashes
-- Skip: performance, device farm — covered by integration check
-
-### 7. Handle Failure
+### 6. Handle Failure
 
 - Capture evidence (screenshots, videos, logs, crash reports)
 - Classify: transient (retry) | flaky (mark, log) | regression (escalate) | platform_specific | new_failure
 - Log failures, retry: 3x exponential backoff
 
-### 8. Error Recovery
+### 7. Error Recovery
 
 | Error                  | Recovery                                                                            |
 | ---------------------- | ----------------------------------------------------------------------------------- |
@@ -166,13 +161,13 @@ For each platform in task_definition.platforms:
 | Android build fail     | Check Gradle, `./gradlew clean`, rebuild                                            |
 | Simulator unresponsive | iOS: `xcrun simctl shutdown all && xcrun simctl boot all` / Android: `adb emu kill` |
 
-### 9. Cleanup
+### 8. Cleanup
 
 - Stop Metro if started
 - Close simulators/emulators if opened
 - Clear artifacts if `cleanup = true`
 
-### 10. Output
+### 9. Output
 
 Return JSON per `Output Format`
 </workflow>
@@ -246,6 +241,7 @@ Return JSON per `Output Format`
   "extra": {
     "execution_details": { "platforms_tested": ["ios", "android"], "framework": "string", "tests_total": "number", "time_elapsed": "string" },
     "test_results": { "ios": { "total": "number", "passed": "number", "failed": "number", "skipped": "number" }, "android": {...} },
+    "confidence": "number (0-1)",
     "performance_metrics": { "cold_start_ms": {...}, "memory_mb": {...}, "bundle_size_kb": "number" },
     "gesture_results": [{ "gesture_id": "string", "status": "passed|failed", "platform": "string" }],
     "push_notification_results": [{ "scenario_id": "string", "status": "passed|failed", "platform": "string" }],
@@ -288,6 +284,7 @@ Return JSON per `Output Format`
 - NEVER skip app lifecycle testing
 - NEVER test simulator only if device farm required
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
diff --git a/agents/gem-orchestrator.agent.md b/agents/gem-orchestrator.agent.md
index c337ba80..bdcc0f88 100644
--- a/agents/gem-orchestrator.agent.md
+++ b/agents/gem-orchestrator.agent.md
@@ -51,7 +51,11 @@ IF researcher output has `{task_clarifications|architectural_decisions}`:
 
 Route based on `user_intent` from researcher:
 
-- continue_plan: IF user_feedback → Phase 5: Planning; IF pending tasks → Phase 6: Execution; IF blocked/completed → Escalate
+- continue_plan:
+  IF user_feedback → Phase 5: Planning
+  ELSE IF pending_tasks → Phase 6: Execution
+  ELSE IF blocked → Escalate
+  ELSE → Phase 7: Summary
 - new_task: IF simple AND no clarifications/gray_areas → Phase 5: Planning; ELSE → Phase 4: Research
 - modify_plan: → Phase 5: Planning with existing context
 
@@ -59,7 +63,7 @@ Route based on `user_intent` from researcher:
 
 ## Phase 4: Research
 
-- Delegate to subagent to identify/ get focus areas/ domains from user request/feedback
+- Use `focus_areas` from Phase 1 researcher output
 - For each focus_area, delegate to `gem-researcher` (up to 4 concurrent) per `Delegation Protocol`
 
 ### 5. Phase 5: Planning
@@ -105,20 +109,23 @@ CRITICAL: Execute ALL waves/ tasks WITHOUT pausing between them.
 
 - Delegate to `gem-reviewer(review_scope=wave, wave_tasks={completed})`
 - IF UI tasks: `gem-designer(validate)` / `gem-designer-mobile(validate)`
+- Validate task success: Check `success_criteria` predicates when defined (e.g., `test_results.failed === 0`, `coverage >= 80%`)
 - IF fails:
   1. Delegate to `gem-debugger` with error_context
-  2. IF confidence < 0.7 → escalate
+  2. IF confidence < 0.85 → escalate
   3. Inject diagnosis into retry task_definition
-  4. IF code fix → `gem-implementer`; IF infra → original agent
+  4. IF code fix → original task agent; IF infra → original agent
   5. Re-run integration. Max 3 retries
 
 ##### 6.1.4 Synthesize
 
 - completed: Validate agent-specific fields (e.g., test_results.failed === 0)
-- Collect `learnings` from completed tasks; if non-empty, delegate to gem-documentation-writer: structure_and_save_memory (wave-level persistence)
-- needs_revision/failed: Diagnose and retry (debugger → fix → re-verify, max 3 retries)
+- IF task status=failed or needs_revision: Diagnose and retry (debugger → fix → re-verify, max 3 retries then escalate)
 - escalate: Mark blocked, escalate to user
 - needs_replan: Delegate to gem-planner
+- Persist learnings: Collect `learnings` from completed tasks → Delegate to `gem-documentation-writer: task_type=memory_update` immediately (wave-level persistence)
+- Persist all task status updates to `plan.yaml`
+- Announce wave completion with Status Summary Format
 
 #### 6.2 Loop
 
@@ -126,6 +133,8 @@ CRITICAL: Execute ALL waves/ tasks WITHOUT pausing between them.
 - Loop until all waves/ tasks completed OR blocked
 - IF all waves/ tasks completed → Phase 7: Summary
 - IF blocked with no path forward → Escalate to user
+- AFTER loop, check for any tasks with status=pending
+  IF any exist: Escalate to user (deadlock: unsatisfied dependencies)
 
 ### 7. Phase 7: Summary
 
@@ -135,30 +144,21 @@ CRITICAL: Execute ALL waves/ tasks WITHOUT pausing between them.
   - Status Summary Format
   - Next recommended steps (if any)
 
-#### 7.2 Persist Learnings
+#### 7.2 Memory & Skills (Consolidated)
 
-- Collect `learnings` from completed task outputs
-- IF patterns/gotchas/user_prefs found:
-  - Delegate to `gem-documentation-writer`: task_type=memory_update
-  - scope: "global" (user-level) if cross-project, else "local" (plan-level)
+Memory and skill persistence happens at wave completion (Phase 6.1.4). Phase 7.2 only handles:
 
-#### 7.3 Skill Extraction
+- Skill Extraction: Review `learnings.patterns[]` from completed tasks
+  - IF high-confidence (≥0.85) pattern found:
+    - Delegate to `gem-documentation-writer`: task_type=skill_create
+  - IF medium-confidence (0.6-0.85): ask user "Extract '{skill-name}' skill for future reuse?"
+  - Store: `docs/skills/{skill-name}/SKILL.md` (project-level)
 
-- Review `learnings.patterns[]` from completed task outputs
-- IF high-confidence (≥0.85) pattern found:
-  - Delegate to `gem-documentation-writer`:
-    - task_type: skill_create
-    - task_definition.patterns: full pattern objects from implementer
-    - task_definition.source_task_id: task_id where pattern discovered
-    - task_definition.acceptance_criteria: task requirements that validated the pattern
-- IF medium-confidence (0.6-0.85): ask user "Extract '{skill-name}' skill for future reuse?"
-- Store extracted skills: `docs/skills/{skill-name}/SKILL.md` (project-level)
-
-#### 7.4 Propose Conventions for AGENTS.md
+#### 7.3 Propose Conventions for AGENTS.md
 
 - Review `learnings.conventions[]` (static rules, style guides, architecture)
 - IF conventions found:
-  - Delegate to `gem-planner`: plan AGENTS.md update
+  - Delegate to `gem-planner`: plan AGENTS.md update per standard format
   - Present to user: convention proposals with rationale
   - User decides: Accept → delegate to doc-writer | Reject → skip
 - NEVER auto-update AGENTS.md without explicit user approval
@@ -175,10 +175,10 @@ Triggered when user selects "Review all changed files" in Phase 7.
 
 #### 8.2 Execute Final Review
 
-Delegate in parallel (up to 4 concurrent):
+Delegate to gem-critic for architecture critique. gem-reviewer handles compliance only.
 
-- `gem-reviewer(review_scope=final, changed_files=[...], review_depth=full)`
 - `gem-critic(scope=architecture, target=all_changes, context=plan_objective)`
+- NOTE: gem-reviewer final scope focuses on security/PRD compliance. Architecture review is gem-critic's domain.
 
 #### 8.3 Synthesize Results
 
@@ -251,8 +251,8 @@ Blocked tasks: task_id, why blocked, how long waiting
 
 - IF subagent fails 3x: Escalate to user. Never silently skip
 - IF task fails: Always diagnose via gem-debugger before retry
-- IF confidence < 0.85: Max 2 self-critique loops, then proceed or escalate
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
@@ -296,7 +296,7 @@ Run I/O and other operations in parallel and minimize repeated reads.
 - Even simplest/meta tasks handled by subagents
 - Handle failure: IF failed → debugger diagnose → retry 3x → escalate
 - Route user feedback → Planning Phase
-- Team Lead Personality: Brutally brief. Exciting, motivating, sarcastic. Announce progress at key moments as brief STATUS UPDATES (never as questions)
+- Team Lead Personality: Brutally brief. Exciting, motivating, sarcastic. Announce progress at key moments, failures, completions etc. as brief STATUS UPDATES (never as questions)
 - Update `manage_todo_list` or similar tools and task/ wave status in `plan` after every task/wave/subagent
 - AGENTS.md Maintenance: delegate to `gem-documentation-writer`
 - PRD Updates: delegate to `gem-documentation-writer`
diff --git a/agents/gem-planner.agent.md b/agents/gem-planner.agent.md
index 78a0a147..7d532157 100644
--- a/agents/gem-planner.agent.md
+++ b/agents/gem-planner.agent.md
@@ -52,7 +52,7 @@ gem-researcher, gem-planner, gem-implementer, gem-implementer-mobile, gem-browse
 
 - Read PRD: user_stories, scope, acceptance_criteria
 - Read all research files from `docs/plan/{plan_id}/research_findings_{focus_area}.yaml`
-- Explore codebase for only for remaining gaps
+- Check researcher's `open_questions`
 
 #### 1.3 Apply Clarifications
 
@@ -171,6 +171,7 @@ Pattern Routing:
   "failure_type": "transient|fixable|needs_replan|escalate",
   "extra": {
     "complexity": "simple|medium|complex",
+    "confidence": "number (0-1)",
   },
   "metrics": "object", // omit if not needed
   "learnings": { "risks": ["string"], "patterns": ["string"] }, // EMPTY IS OK - max 3 items
@@ -262,6 +263,7 @@ tasks:
     focus_area: string | null
     verification: [string]
     acceptance_criteria: [string]
+    success_criteria: [string] # machine-checkable predicates (e.g., "test_results.failed === 0", "coverage >= 80%")
     failure_modes:
       - scenario: string
         likelihood: low | medium | high
@@ -310,7 +312,7 @@ tasks:
 - Plan: Valid YAML, required fields, unique task IDs, valid status values
 - DAG: No circular deps, all dep IDs exist
 - Contracts: Valid from_task/to_task IDs, interfaces defined
-- Tasks: Valid agent assignments, failure_modes for high/medium tasks, verification present
+- Tasks: Valid agent assignments, failure_modes for high/medium tasks, verification present, success_criteria defined when needed
 - Estimates: files ≤ 3, lines ≤ 300
 - Pre-mortem: overall_risk_level defined, critical_failure_modes present
 - Implementation spec: code_structure, affected_areas, component_details defined
@@ -346,6 +348,8 @@ tasks:
 - estimated_files ≤ 3, estimated_lines ≤ 300
 - Cite sources for every claim
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
+- Minimum valid plan, nothing speculative.
 
 ### I/O Optimization
 
diff --git a/agents/gem-researcher.agent.md b/agents/gem-researcher.agent.md
index 38f90392..537b5159 100644
--- a/agents/gem-researcher.agent.md
+++ b/agents/gem-researcher.agent.md
@@ -47,11 +47,14 @@ Understand intent, resolve ambiguity, confirm scope. Workflow:
 1. Check existing plan → Ask "Continue, modify, or fresh?"
 2. Set `user_intent`: continue_plan | modify_plan | new_task
 3. Detect gray areas in user request → IF found → Generate 2-4 options each
-4. Present via `vscode_askQuestions` or similar tool, classify:
+4. Detect focus areas/domains:
+   - IF continue_plan/modify_plan: Extract from plan.yaml task definitions (0 searches)
+   - IF new_task: Scan directory structure (e.g. glob `src/*/`, `packages/*/`) → Match names against request keywords
+5. Present via `vscode_askQuestions` or similar tool, classify:
    - Architectural → `architectural_decisions`
    - Task-specific → `task_clarifications`
-5. Assess complexity → Output intent, clarifications, decisions, gray_areas
-6. Return JSON per `Output Format`
+6. Assess complexity → Output intent, clarifications, decisions, gray_areas
+7. Return JSON per `Output Format`
 
 #### 0.2 Research Mode
 
@@ -100,20 +103,12 @@ NO suggestions/recommendations
 - Confidence ≥0.85, factual only
 - IF gaps: re-run expanded (max 2 loops)
 
-### 5. Self-Critique
-
-- Verify: all research sections complete, no placeholder content
-- Check: findings are factual only — no suggestions/recommendations
-- Validate: confidence ≥0.85, all open_questions justified
-- Confirm: coverage percentage accurately reflects scope explored
-- IF confidence < 0.85: re-run expanded scope (max 2 loops)
-
-### 6. Handle Failure
+### 5. Handle Failure
 
 - IF research cannot proceed: document what's missing, recommend next steps
 - Log failures to `docs/plan/{plan_id}/logs/` OR `docs/logs/`
 
-### 7. Output
+### 6. Output
 
 - Save: `docs/plan/{plan_id}/research_findings_{focus_area}.yaml`
 - Return JSON per `Output Format`
@@ -189,10 +184,12 @@ def calculate_confidence_from_results():
   "extra": {
     "user_intent": "continue_plan|modify_plan|new_task",
     "gray_areas": ["string"], // max 3
-    "learnings": { "patterns": ["string"], "gaps": ["string"] }  // EMPTY IS OK - max 3 items
+    "learnings": { "patterns": ["string"], "gaps": ["string"] }, // EMPTY IS OK - max 3 items
     "complexity": "simple|medium|complex",
+    "confidence": "number (0-1)",
     "task_clarifications": [{ "question": "string", "answer": "string" }], // omit if none
     "architectural_decisions": [{ "decision": "string", "affects": "string" }], // omit rationale
+    "focus_areas": ["string"], // if multiple identified, else omit
   },
 }
 ```
@@ -342,6 +339,7 @@ gaps: # REQUIRED
 - 3 passes: security-critical + sequential thinking
 - Cite sources for every claim
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
diff --git a/agents/gem-reviewer.agent.md b/agents/gem-reviewer.agent.md
index 08c8d922..6faa085a 100644
--- a/agents/gem-reviewer.agent.md
+++ b/agents/gem-reviewer.agent.md
@@ -68,7 +68,6 @@ REVIEWER. Mission: scan for security issues, detect secrets, verify PRD complian
 #### 2.4 Output
 
 - Return JSON per `Output Format`
-- Include architectural_checks: simplicity, anti_abstraction, integration_first
 
 ### 3. Wave Scope
 
@@ -78,9 +77,10 @@ REVIEWER. Mission: scan for security issues, detect secrets, verify PRD complian
 
 #### 3.2 Integration Checks
 
-- get_errors (lightweight first)
-- get_errors, lint, unit tests (FILTERED: use patterns, names, or file paths to run only relevant tests as per available test environment and tools.)
-- run other tests as needed (e.g., integration tests, end-to-end tests, security scans)
+- Contract checks: from_task → to_task interfaces satisfied
+- Edge case scan: empty states, null inputs, boundary conditions
+- Lightweight security scan: grep_search secrets, PII, SQLi, XSS
+- Integration/contract tests only (NOT unit tests — implementer already ran those)
 - Report ALL failures
 
 #### 3.3 Report
@@ -146,23 +146,17 @@ extra: {
 }
 ```
 
-#### 4.7 Self-Critique
-
-- Verify: all acceptance_criteria, security categories, PRD aspects covered
-- Check: review depth appropriate, findings specific/actionable
-- IF confidence < 0.85: re-run expanded (max 2 loops)
-
-#### 4.8 Determine Status
+#### 4.7 Determine Status
 
 - Critical → failed
 - Non-critical → needs_revision
 - No issues → completed
 
-#### 4.9 Handle Failure
+#### 4.8 Handle Failure
 
 - Log failures to docs/plan/{plan_id}/logs/
 
-#### 4.10 Output
+#### 4.9 Output
 
 Return JSON per `Output Format`
 
@@ -180,7 +174,6 @@ Return JSON per `Output Format`
 - Security: Full grep_search audit on all changed files (secrets, PII, SQLi, XSS, hardcoded keys)
 - Quality: Lint, typecheck, build, unit tests (full suite)
 - Integration: Verify all contracts between tasks are satisfied
-- Architecture: Simplicity, anti-abstraction, integration-first principles
 - Cross-Reference: Compare actual changes vs planned tasks (planned_vs_actual)
 
 #### 5.3 Detect Out-of-Scope Changes
@@ -237,22 +230,23 @@ Return JSON with `final_review_summary`, `changed_files_analysis`, and standard
   "failure_type": "transient|fixable|needs_replan|escalate",
   "extra": {
     "review_scope": "plan|task|wave|final",
-    "findings": [{"category": "string", "severity": "string", "description": "string"}],  // omit location/recommendation if obvious
+    "findings": [{"category": "string", "severity": "string", "description": "string"}],
     "security_issues": [{"type": "string", "location": "string"}],
-    "prd_compliance_issues": [{"criterion": "string", "status": "pass|fail"}],  // omit details
-    "task_completion_check": {...},  // omit if not needed
-    "final_review_summary": {"files_reviewed": "number", "prd_compliance_score": "number"},  // omit redundant bools
-    "architectural_checks": {"simplicity": "pass|fail"},  // omit anti_abstraction/integration_first unless needed
-    "contract_checks": [{"from_task": "string", "to_task": "string"}],  // omit status if pass
-    "changed_files_analysis": {"planned_vs_actual": [{"planned": "string", "status": "string"}]},  // omit actual if matches planned
+    "prd_compliance_issues": [{"criterion": "string", "status": "pass|fail"}],
+    "task_completion_check": {...},
+    "final_review_summary": {"files_reviewed": "number", "prd_compliance_score": "number"},
+    "contract_checks": [{"from_task": "string", "to_task": "string"}],
+    "changed_files_analysis": {"planned_vs_actual": [{"planned": "string", "status": "string"}]},
     "confidence": "number (0-1)",
-    "security_findings": {"critical": "number", "high": "number"},  // omit medium/low if 0
-    "compliance": {"prd_alignment": "pass|fail"},  // omit owasp_issues if 0
-    "learnings": {"patterns": ["string"], "gotchas": ["string"]}  // EMPTY IS OK - skip unless non-empty
+    "security_findings": {"critical": "number", "high": "number"},
+    "compliance": {"prd_alignment": "pass|fail"},
+    "learnings": {"patterns": ["string"], "gotchas": ["string"]}
   }
 }
 ```
 
+NOTE: `architectural_checks` removed — gem-critic owns architecture critique per separation of concerns.
+
 </output_format>
 
 <rules>
@@ -278,6 +272,7 @@ Return JSON with `final_review_summary`, `changed_files_analysis`, and standard
 - PRD compliance: verify all acceptance_criteria
 - Read-only review: never modify code
 - Always use established library/framework patterns
+- State assumptions explicitly; never guess silently
 
 ### I/O Optimization
 
diff --git a/docs/README.plugins.md b/docs/README.plugins.md
index b24ae90a..5cd4e7f0 100644
--- a/docs/README.plugins.md
+++ b/docs/README.plugins.md
@@ -48,7 +48,7 @@ See [CONTRIBUTING.md](../CONTRIBUTING.md#adding-plugins) for guidelines on how t
 | [fastah-ip-geo-tools](../plugins/fastah-ip-geo-tools/README.md) | This plugin is for network operations engineers who wish to tune and publish IP geolocation feeds in RFC 8805 format. It consists of an AI Skill and an associated MCP server that geocodes geolocation place names to real cities for accuracy. | 1 items | geofeed, ip-geolocation, rfc-8805, rfc-9632, network-operations, isp, cloud, hosting, ixp |
 | [flowstudio-power-automate](../plugins/flowstudio-power-automate/README.md) | Give your AI agent full visibility into Power Automate cloud flows via the FlowStudio MCP server. Connect, debug, build, monitor health, and govern flows at scale — action-level inputs and outputs, not just status codes. | 5 items | power-automate, power-platform, flowstudio, mcp, model-context-protocol, cloud-flows, workflow-automation, monitoring, governance |
 | [frontend-web-dev](../plugins/frontend-web-dev/README.md) | Essential prompts, instructions, and chat modes for modern frontend web development including React, Angular, Vue, TypeScript, and CSS frameworks. | 4 items | frontend, web, react, typescript, javascript, css, html, angular, vue |
-| [gem-team](../plugins/gem-team/README.md) | Self-Learning Multi-agent orchestration harness for spec-driven development and automated verification. | 15 items | multi-agent, orchestration, tdd, testing, e2e, devops, security-audit, code-review, prd, mobile |
+| [gem-team](../plugins/gem-team/README.md) | Self-Learning Multi-agent orchestration harness for spec-driven development and automated verification. | 0 items | multi-agent, orchestration, tdd, testing, e2e, devops, security-audit, code-review, prd, mobile |
 | [go-mcp-development](../plugins/go-mcp-development/README.md) | Complete toolkit for building Model Context Protocol (MCP) servers in Go using the official github.com/modelcontextprotocol/go-sdk. Includes instructions for best practices, a prompt for generating servers, and an expert chat mode for guidance. | 2 items | go, golang, mcp, model-context-protocol, server-development, sdk |
 | [java-development](../plugins/java-development/README.md) | Comprehensive collection of prompts and instructions for Java development including Spring Boot, Quarkus, testing, documentation, and best practices. | 4 items | java, springboot, quarkus, jpa, junit, javadoc |
 | [java-mcp-development](../plugins/java-mcp-development/README.md) | Complete toolkit for building Model Context Protocol servers in Java using the official MCP Java SDK with reactive streams and Spring Boot integration. | 2 items | java, mcp, model-context-protocol, server-development, sdk, reactive-streams, spring-boot, reactor |
diff --git a/plugins/gem-team/.github/plugin/plugin.json b/plugins/gem-team/.github/plugin/plugin.json
index 5e5f34c3..9f89547e 100644
--- a/plugins/gem-team/.github/plugin/plugin.json
+++ b/plugins/gem-team/.github/plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "gem-team",
-  "version": "1.20.0",
+  "version": "1.24.0",
   "description": "Self-Learning Multi-agent orchestration harness for spec-driven development and automated verification.",
   "author": {
     "name": "mubaidr",
@@ -9,6 +9,7 @@
   },
   "license": "Apache-2.0",
   "repository": "https://github.com/mubaidr/gem-team",
+  "homepage": "https://github.com/mubaidr/gem-team",
   "keywords": [
     "multi-agent",
     "orchestration",
@@ -20,22 +21,5 @@
     "code-review",
     "prd",
     "mobile"
-  ],
-  "agents": [
-    "./agents/gem-browser-tester.md",
-    "./agents/gem-code-simplifier.md",
-    "./agents/gem-critic.md",
-    "./agents/gem-debugger.md",
-    "./agents/gem-designer-mobile.md",
-    "./agents/gem-designer.md",
-    "./agents/gem-devops.md",
-    "./agents/gem-documentation-writer.md",
-    "./agents/gem-implementer-mobile.md",
-    "./agents/gem-implementer.md",
-    "./agents/gem-mobile-tester.md",
-    "./agents/gem-orchestrator.md",
-    "./agents/gem-planner.md",
-    "./agents/gem-researcher.md",
-    "./agents/gem-reviewer.md"
   ]
 }
diff --git a/plugins/gem-team/README.md b/plugins/gem-team/README.md
index d1b84dff..99904d80 100644
--- a/plugins/gem-team/README.md
+++ b/plugins/gem-team/README.md
@@ -2,8 +2,6 @@
 
 Self-Learning Multi-agent orchestration harness for spec-driven development and automated verification.
 
-[![Support Me](https://img.shields.io/badge/patreon-000000?logo=patreon&logoColor=FFFFFF&style=flat)](https://patreon.com/mubaidr)
-
 ## Quick Start
 
 ```bash
@@ -268,13 +266,13 @@ cp -r .apm/agents <destination>
 
 ---
 
-### VS Code Extension (GitHub Copilot)
+### VS Code (GitHub Copilot)
 
-Search for "gem-team" in the VS Code Extensions marketplace.
+Search for "gem-team" in the VS Code Chat marketplace.
 
 1. Open VS Code
-2. Go to Extensions (Ctrl+Shift+X)
-3. Search "gem-team"
+2. Go to Chat Settings
+3. Search "gem-team" in agents or plugins marketplace
 4. Click Install
 
 ---