test(android-test-app): unify presentation framework with evidence collection

Implement P0-P5 directives for operator clarity, consistent outcomes, and easy evidence capture across all test phases. Changes: - alarm-test-lib.sh: Add evidence collection (capture_alarms, capture_logcat, capture_screenshot), verdict functions (verdict_pass/warn/fail), run directory management, and release gating support (RELEASE_GATE_PHASE3) - test-phase1.sh: Refactor to unified framework with CLI modes (--setup, --run, --smoke, --all, --ci), micro-prompts, evidence capture, and verdict blocks for all 5 tests - test-phase2.sh: Add evidence capture, verdict blocks, and STRICTNESS policy (soft/hard) for warn vs fail behavior - test-phase3.sh: Add evidence capture, verdict blocks, release gating (--gate-phase3), and fatigue reduction (time estimates, automation hints) - RUNBOOK-TESTING.md: New comprehensive operator guide (669 lines) covering prerequisites, all phases, evidence locations, verdict interpretation, common failures, and troubleshooting All test scripts now use consistent UI helpers (section, substep, info, ok, warn, error), standardized evidence collection, and clear verdict reporting. Evidence is saved to timestamped run directories (runs/<RUN_ID>/) with alarms, logs, and screenshots organized by test phase and scenario. Tests pass with consistent presentation and reproducible evidence collection.
2025-12-24 12:01:16 +00:00
parent 973af9b688
commit ac39255672
5 changed files with 2373 additions and 870 deletions
--- a/test-apps/android-test-app/alarm-test-lib.sh
+++ b/test-apps/android-test-app/alarm-test-lib.sh
@@ -12,6 +12,12 @@
 #
 # Configuration can be overridden before sourcing:
 #   APP_ID="custom.package" source "${SCRIPT_DIR}/alarm-test-lib.sh"
+#
+# STRICT MODE NOTE:
+# This library does NOT set strict mode itself (set -euo pipefail) because
+# it's a library file. Scripts that source this library SHOULD set strict mode:
+#   set -euo pipefail
+#   IFS=$'\n\t'

 # --- Config Defaults (can be overridden before sourcing) ---

@@ -27,6 +33,13 @@
 : "${SCREENSHOT_ROOT:=screenshots}"
 : "${ENABLE_SCREENSHOTS:=1}"

+# Run folder configuration (P1)
+: "${RUN_ID:=$(date '+%Y%m%d_%H%M%S' 2>/dev/null || echo 'unknown')}"
+: "${RUN_DIR:=runs/${RUN_ID}}"
+
+# Release gating configuration (P4)
+: "${RELEASE_GATE_PHASE3:=0}"
+
 # Derived config (for backward compatibility with Phase 1)
 PACKAGE="${APP_ID}"
 ACTIVITY="${APP_ID}/.MainActivity"
@@ -38,7 +51,11 @@ YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color

-# --- UI/Log Helpers ---
+# ========================================
+# PUBLIC API - UI/Log Helpers
+# ========================================
+# These are the primary functions that all scripts should use.
+# Deprecated functions (print_*, wait_for_*) are kept for backward compatibility.

 section() {
  echo
@@ -84,8 +101,54 @@ ui_prompt() {
  echo
 }

-# Phase 1 compatibility aliases (print_* functions)
+# ========================================
+# PUBLIC API - Command Execution Helpers
+# ========================================
+
+run_cmd() {
+  # Run a command, logging start and success/failure; output is not captured or redirected
+  # Usage: run_cmd "description" command [args...]
+  # Returns: exit code of command
+  local desc="$1"
+  shift
+  local cmd=("$@")
+  
+  info "Running: $desc"
+  if "${cmd[@]}"; then
+    ok "$desc completed"
+    return 0
+  else
+    local exit_code=$?  # $? here is still the status of the command tested by `if`
+    error "$desc failed (exit code: $exit_code)"
+    return $exit_code
+  fi
+}
+
+require_cmd() {
+  # Run a command that must succeed; on failure, exit the script with the command's status
+  # Usage: require_cmd "description" command [args...]
+  # NOTE: status is captured via `||` because `$?` inside `if ! cmd; then` is always 0
+  local desc="$1"
+  shift
+  local cmd=("$@")
+  local exit_code=0
+  info "Required: $desc"
+  "${cmd[@]}" || exit_code=$?
+  if [ "$exit_code" -ne 0 ]; then
+    error "$desc failed (exit code: $exit_code)"
+    exit "$exit_code"
+  fi
+  ok "$desc completed"
+}
+
+# ========================================
+# DEPRECATED - Phase 1 Compatibility Aliases
+# ========================================
+# These functions are kept for backward compatibility but should not be used
+# in new code. Use the public API functions above instead.
+
 print_header() {
+  # DEPRECATED: Use section() instead
  echo ""
  echo -e "${BLUE}========================================${NC}"
  echo -e "${BLUE}$1${NC}"
@@ -94,36 +157,44 @@ print_header() {
 }

 print_step() {
+  # DEPRECATED: Use substep() instead
  echo -e "${GREEN}→ Step $1:${NC} $2"
 }

 print_wait() {
+  # DEPRECATED: Use info() or warn() instead
  echo -e "${YELLOW}⏳ $1${NC}"
 }

 print_success() {
+  # DEPRECATED: Use ok() instead
  echo -e "${GREEN}✅ $1${NC}"
 }

 print_error() {
+  # DEPRECATED: Use error() instead
  echo -e "${RED}❌ $1${NC}"
 }

 print_info() {
+  # DEPRECATED: Use info() instead
  echo -e "${BLUE}ℹ️  $1${NC}"
 }

 print_warn() {
+  # DEPRECATED: Use warn() instead
  echo -e "${YELLOW}⚠️  $1${NC}"
 }

 wait_for_user() {
+  # DEPRECATED: Use pause() instead
  echo ""
  read -p "Press Enter when ready to continue..."
  echo ""
 }

 wait_for_ui_action() {
+  # DEPRECATED: Use ui_prompt() instead
  ui_prompt "$1"
 }

@@ -611,3 +682,257 @@ should_run_test() {
  return 1
 }

+# ========================================
+# PUBLIC API - Run Folder & Evidence Helpers (P1)
+# ========================================
+
+ensure_run_dir() {
+  # Create RUN_DIR/{logs,alarms,screens,notes} under SCRIPT_DIR, else this library's dir, else cwd
+  # Exports: RUN_DIR_ABS (the directory used; echoed back by get_run_dir)
+  # Returns: 0 on success, 1 on failure
+  local base_dir
+  if [ -n "${SCRIPT_DIR:-}" ] && [ -d "${SCRIPT_DIR:-}" ]; then  # ':-' so 'set -u' callers don't abort when unset
+    base_dir="${SCRIPT_DIR}/${RUN_DIR}"
+  elif [ -n "${BASH_SOURCE[0]:-}" ]; then
+    local lib_dir
+    lib_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd)" || lib_dir=""
+    if [ -n "$lib_dir" ]; then
+      base_dir="${lib_dir}/${RUN_DIR}"
+    else
+      base_dir="${RUN_DIR}"
+    fi
+  else
+    base_dir="${RUN_DIR}"
+  fi
+  
+  mkdir -p "${base_dir}/logs" "${base_dir}/alarms" "${base_dir}/screens" "${base_dir}/notes" 2>/dev/null || {
+    error "Failed to create run directory: ${base_dir}"
+    return 1
+  }
+  
+  # Export for use by capture functions
+  export RUN_DIR_ABS="${base_dir}"
+  return 0
+}
+
+get_run_dir() {
+  # Print the run directory recorded in RUN_DIR_ABS (exported by ensure_run_dir)
+  # Outputs: that path on stdout, or an empty line if RUN_DIR_ABS is unset or empty
+  echo "${RUN_DIR_ABS:-}"
+}
+
+capture_alarms() {
+  # Save `dumpsys alarm` output to <run dir>/alarms/<label>_alarms.txt
+  # Usage: capture_alarms "<label>"   (chars outside [a-zA-Z0-9_-] in label become "_")
+  # Returns: 0 on success or when skipped (no run dir); 1 if the adb command fails
+  local label="$1"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  if [ -z "$run_dir" ]; then
+    warn "Run directory not initialized, skipping alarm capture: $label"
+    return 0
+  fi
+  
+  local safe_label="${label//[^a-zA-Z0-9_-]/_}"
+  local file="${run_dir}/alarms/${safe_label}_alarms.txt"
+  
+  info "Capturing alarms: $label → $file"
+  if $ADB_BIN shell dumpsys alarm > "$file" 2>/dev/null; then
+    ok "Alarms captured: $file"
+    return 0
+  else
+    warn "Failed to capture alarms: $label"
+    return 1
+  fi
+}
+
+capture_logcat() {
+  # Save the last <lines> logcat lines (default 250) to <run dir>/logs/<label>_logcat.txt
+  # Usage: capture_logcat "<label>" "<grep_pattern>" "<lines>"
+  # grep_pattern is a `grep -E` regex applied after the line limit; empty = no filtering
+  # Returns: 1 only when an unfiltered capture fails; 0 otherwise (including skips)
+  local label="$1"
+  local pattern="${2:-}"
+  local lines="${3:-250}"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  if [ -z "$run_dir" ]; then
+    warn "Run directory not initialized, skipping logcat capture: $label"
+    return 0
+  fi
+  
+  local safe_label="${label//[^a-zA-Z0-9_-]/_}"
+  local file="${run_dir}/logs/${safe_label}_logcat.txt"
+  
+  info "Capturing logcat: $label → $file"
+  
+  if [ -n "$pattern" ]; then
+    if $ADB_BIN logcat -d -t "$lines" | grep -E "$pattern" > "$file" 2>/dev/null; then
+      ok "Logcat captured (filtered): $file"
+      return 0
+    else
+      # Reached on no match; NOTE: an adb failure typically lands here too and is not distinguished
+      touch "$file"
+      warn "No logcat matches for pattern: $pattern"
+      return 0
+    fi
+  else
+    if $ADB_BIN logcat -d -t "$lines" > "$file" 2>/dev/null; then
+      ok "Logcat captured: $file"
+      return 0
+    else
+      warn "Failed to capture logcat: $label"
+      return 1
+    fi
+  fi
+}
+
+capture_screenshot() {
+  # Capture device screenshot to run folder
+  # Usage: capture_screenshot "<label>"
+  # Saves to: RUN_DIR/screens/<label>_screenshot.png
+  # Skipped (returns 0) unless ENABLE_SCREENSHOTS=1; returns 1 if capture fails or yields an empty file
+  local label="$1"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  if [ -z "$run_dir" ]; then
+    warn "Run directory not initialized, skipping screenshot: $label"
+    return 0
+  fi
+  
+  if [ "$ENABLE_SCREENSHOTS" != "1" ]; then
+    warn "Screenshots disabled, skipping: $label"
+    return 0
+  fi
+  
+  local safe_label="${label//[^a-zA-Z0-9_-]/_}"
+  local file="${run_dir}/screens/${safe_label}_screenshot.png"
+  
+  info "Capturing screenshot: $label → $file"
+  if "$ADB_BIN" exec-out screencap -p > "$file" 2>/dev/null; then
+    if [ -s "$file" ]; then
+      ok "Screenshot captured: $file"
+      return 0
+    else
+      warn "Screenshot file is empty: $file"
+      rm -f "$file" 2>/dev/null || true
+      return 1
+    fi
+  else
+    warn "Failed to capture screenshot: $label"
+    return 1
+  fi
+}
+
+evidence_block() {
+  # Print a banner listing where this run's artifacts (alarms, logs, screens, notes) are saved
+  # Usage: evidence_block "<test_id>"
+  # Returns: 1 (after a warning) if the run directory is not initialized
+  local test_id="$1"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  if [ -z "$run_dir" ]; then
+    warn "Run directory not initialized, cannot show evidence block"
+    return 1
+  fi
+  
+  echo
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "📦 EVIDENCE: $test_id"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "Run ID: $RUN_ID"
+  echo "Evidence directory: $run_dir"
+  echo
+  echo "Artifacts:"
+  echo "  • Alarms:    $run_dir/alarms/"
+  echo "  • Logs:      $run_dir/logs/"
+  echo "  • Screens:   $run_dir/screens/"
+  echo "  • Notes:     $run_dir/notes/"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo
+}
+
+# ========================================
+# PUBLIC API - Verdict Functions (P1)
+# ========================================
+
+verdict_pass() {
+  # Print a PASS verdict banner (test ID, message, evidence dir if initialized) to stdout
+  # Usage: verdict_pass "<test_id>" "<message>"
+  local test_id="$1"
+  local message="$2"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  echo
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "✅ VERDICT: PASS"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "Test ID: $test_id"
+  echo "Status:  PASS"
+  echo "Message: $message"
+  if [ -n "$run_dir" ]; then
+    echo "Evidence: $run_dir"
+  fi
+  echo "Next:    Continue to next test"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo
+}
+
+verdict_warn() {
+  # Print a WARN verdict banner (test ID, message, evidence dir if initialized) to stdout
+  # Usage: verdict_warn "<test_id>" "<message>"
+  local test_id="$1"
+  local message="$2"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  echo
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "⚠️  VERDICT: WARN"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "Test ID: $test_id"
+  echo "Status:  WARN"
+  echo "Message: $message"
+  if [ -n "$run_dir" ]; then
+    echo "Evidence: $run_dir"
+  fi
+  echo "Next:    Review evidence and continue"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo
+}
+
+verdict_fail() {
+  # Print a FAIL verdict banner (test ID, message, evidence dir if initialized) to stdout
+  # Usage: verdict_fail "<test_id>" "<message>"
+  # If RELEASE_GATE_PHASE3=1, exits the whole script with status 1; otherwise returns normally
+  local test_id="$1"
+  local message="$2"
+  local run_dir
+  run_dir="$(get_run_dir)"
+  
+  echo
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "❌ VERDICT: FAIL"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo "Test ID: $test_id"
+  echo "Status:  FAIL"
+  echo "Message: $message"
+  if [ -n "$run_dir" ]; then
+    echo "Evidence: $run_dir"
+  fi
+  echo "Next:    Review evidence and investigate"
+  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+  echo
+  
+  # If release gating is enabled, exit with failure
+  if [ "${RELEASE_GATE_PHASE3:-0}" = "1" ]; then
+    error "Release gating enabled: exiting due to test failure"
+    exit 1
+  fi
+}
+
--- a/test-apps/android-test-app/docs/RUNBOOK-TESTING.md
+++ b/test-apps/android-test-app/docs/RUNBOOK-TESTING.md
@@ -0,0 +1,669 @@
+# Daily Notification Plugin — Test Operator Runbook
+
+**Last Updated:** 2025-12-24  
+**Purpose:** Complete guide for operators running Phase 1, 2, and 3 test suites  
+**Audience:** Test operators, QA engineers, developers running manual tests
+
+---
+
+## Table of Contents
+
+1. [Prerequisites](#prerequisites)
+2. [Quick Start](#quick-start)
+3. [Phase 1: Daily Rollover & Recovery](#phase-1-daily-rollover--recovery)
+4. [Phase 2: Force Stop Recovery](#phase-2-force-stop-recovery)
+5. [Phase 3: Boot Recovery](#phase-3-boot-recovery)
+6. [Evidence & Artifacts](#evidence--artifacts)
+7. [Interpreting Verdicts](#interpreting-verdicts)
+8. [Common Failures & Fixes](#common-failures--fixes)
+9. [Troubleshooting](#troubleshooting)
+
+---
+
+## Prerequisites
+
+### Required Tools
+
+- **ADB (Android Debug Bridge)** — Must be in PATH
+  ```bash
+  which adb
+  adb version
+  ```
+- **Android Emulator or Physical Device** — Connected and accessible
+  ```bash
+  adb devices
+  ```
+- **Bash 4.0+** — For script execution
+  ```bash
+  bash --version
+  ```
+- **Gradle** — For building test app (included via `gradlew`)
+- **Java JDK 11+** — For Android builds
+
+### Environment Setup
+
+1. **Navigate to test directory:**
+   ```bash
+   cd test-apps/android-test-app
+   ```
+
+2. **Verify scripts are executable:**
+   ```bash
+   ls -la test-phase*.sh
+   chmod +x test-phase*.sh alarm-test-lib.sh
+   ```
+
+3. **Check ADB connection:**
+   ```bash
+   adb devices
+   # Should show your device/emulator
+   ```
+
+### Pre-Flight Checks
+
+Before running any test phase, verify:
+
+- [ ] ADB device is connected (`adb devices` shows device)
+- [ ] Emulator/device is unlocked and responsive
+- [ ] Test app can be built (`./gradlew assembleDebug` succeeds)
+- [ ] Previous test runs cleaned up (optional but recommended)
+
+---
+
+## Quick Start
+
+### Run All Phases (Advisory Mode)
+
+```bash
+# Phase 1: Daily rollover and recovery
+./test-phase1.sh --all
+
+# Phase 2: Force stop recovery
+./test-phase2.sh
+
+# Phase 3: Boot recovery
+./test-phase3.sh
+```
+
+### Run Specific Tests
+
+```bash
+# Phase 1: Run only TEST 0 (Daily Rollover)
+./test-phase1.sh 0
+
+# Phase 2: Run TEST 1 and TEST 3
+./test-phase2.sh 1 3
+
+# Phase 3: Run only TEST 4 (Silent Boot Recovery)
+./test-phase3.sh 4
+```
+
+### Run in Release-Gating Mode
+
+```bash
+# Phase 3: Fail fast on any test failure
+./test-phase3.sh --gate-phase3
+
+# Or via environment variable
+RELEASE_GATE_PHASE3=1 ./test-phase3.sh
+```
+
+---
+
+## Phase 1: Daily Rollover & Recovery
+
+**Purpose:** Verify daily rollover behavior, force-stop recovery, schedule updates, and error handling.
+
+**Expected Duration:** 30-45 minutes (all tests)
+
+### Test Modes
+
+```bash
+# Setup only (pre-flight checks, app install, permissions)
+./test-phase1.sh --setup
+
+# Run all tests
+./test-phase1.sh --all
+
+# Smoke test (minimal verification)
+./test-phase1.sh --smoke
+
+# CI mode (non-interactive, fail fast)
+./test-phase1.sh --ci
+
+# Run specific test
+./test-phase1.sh 0  # TEST 0: Daily Rollover
+./test-phase1.sh 1  # TEST 1: Force-Stop Recovery
+./test-phase1.sh 2  # TEST 2: Schedule Update
+./test-phase1.sh 3  # TEST 3: Recovery Timeout
+./test-phase1.sh 4  # TEST 4: Invalid Data Handling
+```
+
+### Test Descriptions
+
+**TEST 0: Daily Rollover Verification**
+- **Time:** 5-8 minutes
+- **Automatable:** Partial (requires manual verification)
+- **What it tests:** Daily rollover at midnight, schedule advancement
+- **Key steps:**
+  1. Schedule notification for future
+  2. Advance system time past midnight
+  3. Verify rollover occurred and next day scheduled
+
+**TEST 1: Force-Stop Recovery - Database Restoration**
+- **Time:** 8-12 minutes
+- **Automatable:** Partial (requires manual force-stop)
+- **What it tests:** Recovery after force-stop, database restoration
+- **Key steps:**
+  1. Schedule notification
+  2. Force-stop app
+  3. Relaunch and verify recovery
+
+**TEST 2: Schedule Update Verification**
+- **Time:** 5-8 minutes
+- **Automatable:** Partial
+- **What it tests:** Schedule updates, one-per-day semantics
+- **Key steps:**
+  1. Schedule notification
+  2. Update schedule
+  3. Verify only one alarm exists
+
+**TEST 3: Recovery Timeout**
+- **Time:** 3-5 minutes
+- **Automatable:** Yes (code verification)
+- **What it tests:** Recovery timeout handling
+- **Key steps:**
+  1. Verify timeout logic in code
+  2. Check timeout behavior
+
+**TEST 4: Invalid Data Handling**
+- **Time:** 5-8 minutes
+- **Automatable:** Partial
+- **What it tests:** Graceful handling of invalid data
+- **Key steps:**
+  1. Inject invalid data
+  2. Verify graceful recovery
+
+### Phase 1 Evidence Location
+
+All evidence is saved to: `runs/<RUN_ID>/`
+
+- **Alarms:** `runs/<RUN_ID>/alarms/phase1_*.txt`
+- **Logs:** `runs/<RUN_ID>/logs/phase1_*.txt`
+- **Screenshots:** `runs/<RUN_ID>/screens/phase1_*.png`
+
+---
+
+## Phase 2: Force Stop Recovery
+
+**Purpose:** Verify force-stop detection, alarm rescheduling, and recovery scenarios.
+
+**Expected Duration:** 15-20 minutes (all tests)
+
+### Test Modes
+
+```bash
+# Run all tests (default)
+./test-phase2.sh
+
+# Run specific tests
+./test-phase2.sh 1      # TEST 1: Force Stop with Cleared Alarms
+./test-phase2.sh 2      # TEST 2: Force Stop with Intact Alarms
+./test-phase2.sh 3      # TEST 3: First Launch / No Schedules
+
+# With strictness policy
+STRICTNESS=soft ./test-phase2.sh   # Default: minor quirks = warn
+STRICTNESS=hard ./test-phase2.sh    # Any issue = fail
+```
+
+### Test Descriptions
+
+**TEST 1: Force Stop – Alarms Cleared**
+- **Time:** 5-8 minutes
+- **Automatable:** Partial (requires manual force-stop)
+- **What it tests:** Force-stop detection when alarms are cleared
+- **Key steps:**
+  1. Schedule notification
+  2. Force-stop app (clears alarms on many devices)
+  3. Relaunch and verify FORCE_STOP scenario detected
+  4. Verify alarms rescheduled
+
+**TEST 2: Force Stop / Process Stop – Alarms Intact**
+- **Time:** 4-6 minutes
+- **Automatable:** Partial
+- **What it tests:** Recovery when alarms remain intact
+- **Key steps:**
+  1. Schedule notification
+  2. Soft-kill app (alarms remain)
+  3. Relaunch and verify FORCE_STOP scenario did NOT run
+
+**TEST 3: First Launch / No Schedules Safeguard**
+- **Time:** 3-5 minutes
+- **Automatable:** Yes
+- **What it tests:** No recovery on empty database
+- **Key steps:**
+  1. Uninstall app
+  2. Reinstall app
+  3. Reboot (without scheduling)
+  4. Verify no recovery logs or NONE scenario
+
+### Strictness Policy
+
+**`STRICTNESS=soft` (default):**
+- Minor device quirks = `verdict_warn`
+- Alarms still present after force-stop = warn
+- FORCE_STOP scenario not detected = warn
+
+**`STRICTNESS=hard`:**
+- Any unexpected behavior = `verdict_fail`
+- Alarms still present after force-stop = fail
+- FORCE_STOP scenario not detected = fail
+- Recovery errors = fail
+
+### Phase 2 Evidence Location
+
+- **Alarms:** `runs/<RUN_ID>/alarms/phase2_*.txt`
+- **Logs:** `runs/<RUN_ID>/logs/phase2_*.txt`
+- **Screenshots:** `runs/<RUN_ID>/screens/phase2_*.png`
+
+---
+
+## Phase 3: Boot Recovery
+
+**Purpose:** Verify boot recovery, missed alarm detection, and silent recovery.
+
+**Expected Duration:** 12-18 minutes (all tests)
+
+### Test Modes
+
+```bash
+# Run all tests (advisory mode, default)
+./test-phase3.sh
+
+# Run specific tests
+./test-phase3.sh 1      # TEST 1: Boot with Future Alarms
+./test-phase3.sh 2      # TEST 2: Boot with Past Alarms
+./test-phase3.sh 3      # TEST 3: Boot with No Schedules
+./test-phase3.sh 4      # TEST 4: Silent Boot Recovery
+
+# Release-gating mode (failures exit with non-zero)
+./test-phase3.sh --gate-phase3
+RELEASE_GATE_PHASE3=1 ./test-phase3.sh
+```
+
+### Test Descriptions
+
+**TEST 1: Boot with Future Alarms**
+- **Time:** 2-3 minutes (includes 30-60s reboot)
+- **Automatable:** Partial (requires manual reboot confirmation)
+- **What it tests:** Boot recovery with future alarms
+- **Key steps:**
+  1. Schedule notification for future
+  2. Reboot emulator
+  3. Verify BOOT scenario detected
+  4. Verify alarms rescheduled
+
+**TEST 2: Boot with Past Alarms**
+- **Time:** 5-6 minutes (includes 3min wait + 30-60s reboot)
+- **Automatable:** Partial (requires manual time advancement or wait)
+- **What it tests:** Missed alarm detection and rescheduling
+- **Key steps:**
+  1. Schedule notification for 2 minutes future
+  2. Wait 3 minutes (alarm time passes)
+  3. Reboot emulator
+  4. Verify missed alarms detected
+  5. Verify next occurrence scheduled
+
+**TEST 3: Boot with No Schedules**
+- **Time:** 2-3 minutes (includes 30-60s reboot)
+- **Automatable:** Yes
+- **What it tests:** Graceful handling of empty database
+- **Key steps:**
+  1. Uninstall app
+  2. Reinstall app
+  3. Reboot (without scheduling)
+  4. Verify no recovery logs or NONE scenario
+
+**TEST 4: Silent Boot Recovery (App Never Opened)**
+- **Time:** 2-3 minutes (includes 30-60s reboot)
+- **Automatable:** Partial (requires manual verification app not opened)
+- **What it tests:** Boot recovery without app launch
+- **Key steps:**
+  1. Schedule notification
+  2. Reboot emulator
+  3. **DO NOT open app** after reboot
+  4. Verify boot recovery occurred silently
+  5. Verify alarms recreated
+
+### Release Gating
+
+**Advisory Mode (default):**
+- Failures become warnings
+- Script continues to next test
+- Use for development/testing
+
+**Release-Blocking Mode (`--gate-phase3` or `RELEASE_GATE_PHASE3=1`):**
+- Failures cause script to exit with non-zero
+- Use for CI/CD or release validation
+- First failure stops execution
+
+### Phase 3 Evidence Location
+
+- **Alarms:** `runs/<RUN_ID>/alarms/phase3_*.txt`
+- **Logs:** `runs/<RUN_ID>/logs/phase3_*.txt`
+- **Screenshots:** `runs/<RUN_ID>/screens/phase3_*.png`
+
+---
+
+## Evidence & Artifacts
+
+### Run Directory Structure
+
+Each test run creates a timestamped directory:
+
+```
+runs/
+  └── 20250124_143022/
+      ├── alarms/
+      │   ├── phase1_test0_initial_alarms.txt
+      │   ├── phase1_test0_before_schedule_alarms.txt
+      │   └── ...
+      ├── logs/
+      │   ├── phase1_test0_initial_logcat.txt
+      │   ├── phase1_test0_after_schedule_logcat.txt
+      │   └── ...
+      ├── screens/
+      │   ├── phase1_test0_after_schedule_screenshot.png
+      │   └── ...
+      └── notes/
+          └── (manual notes can be added here)
+```
+
+### Evidence Types
+
+**Alarm Dumps:**
+- Captured via `adb shell dumpsys alarm`
+- Shows all scheduled alarms
+- Used to verify plugin alarms exist
+
+**Logcat Logs:**
+- Captured via `adb logcat`
+- Filtered by tag (e.g., `DNP`, `DNP-REACTIVATION`)
+- Used to verify recovery scenarios and errors
+
+**Screenshots:**
+- Captured via `adb exec-out screencap`
+- Shows app state at key points
+- Used for visual verification
+
+### Accessing Evidence
+
+```bash
+# Get current run directory
+cd test-apps/android-test-app
+ls -la runs/
+
+# View latest run
+ls -la runs/$(ls -t runs/ | head -1)
+
+# View specific evidence
+cat runs/<RUN_ID>/alarms/phase1_test0_initial_alarms.txt
+cat runs/<RUN_ID>/logs/phase1_test0_after_schedule_logcat.txt
+```
+
+---
+
+## Interpreting Verdicts
+
+### Verdict Types
+
+**✅ PASS (`verdict_pass`):**
+- Test passed all checks
+- Continue to next test
+- Evidence saved for reference
+
+**⚠️ WARN (`verdict_warn`):**
+- Test had minor issues or device quirks
+- May be acceptable depending on context
+- Review evidence and decide if action needed
+- Script continues
+
+**❌ FAIL (`verdict_fail`):**
+- Test failed critical checks
+- Review evidence immediately
+- In advisory mode: script continues
+- In release-gating mode: script exits with non-zero
+
+### Verdict Format
+
+```
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+✅ VERDICT: PASS
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Test ID: phase1_test0_daily_rollover
+Status:  PASS
+Message: Daily rollover verified successfully
+Evidence: runs/20250124_143022
+Next:    Continue to next test
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+```
+
+### Evidence Block Format
+
+```
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+📦 EVIDENCE: phase1_test0_daily_rollover
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Run ID: 20250124_143022
+Evidence directory: runs/20250124_143022
+
+Artifacts:
+  • Alarms:    runs/20250124_143022/alarms/
+  • Logs:      runs/20250124_143022/logs/
+  • Screens:   runs/20250124_143022/screens/
+  • Notes:     runs/20250124_143022/notes/
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+```
+
+---
+
+## Common Failures & Fixes
+
+### Phase 1 Failures
+
+**"No plugin alarms found before rollover"**
+- **Cause:** Notification not scheduled correctly
+- **Fix:** Verify plugin is configured, check app UI shows scheduled notification
+- **Evidence:** Check `alarms/phase1_test0_before_schedule_alarms.txt`
+
+**"Rollover did not occur"**
+- **Cause:** System time not advanced correctly, or rollover logic issue
+- **Fix:** Verify system time was advanced past midnight, check logs for rollover activity
+- **Evidence:** Check `logs/phase1_test0_after_rollover_logcat.txt`
+
+**"Recovery did not restore database"**
+- **Cause:** Force-stop recovery not triggered, or database restoration failed
+- **Fix:** Verify force-stop actually cleared alarms, check recovery logs
+- **Evidence:** Check `logs/phase1_test1_after_recovery_logcat.txt`
+
+### Phase 2 Failures
+
+**"FORCE_STOP scenario not detected"**
+- **Cause:** Device/emulator didn't clear alarms on force-stop, or scenario detection logic issue
+- **Fix:** Check if device clears alarms on force-stop (device-specific), verify boot flag cleared
+- **Evidence:** Check `logs/phase2_test1_after_recovery_logcat.txt`, verify `scenario=FORCE_STOP`
+
+**"Alarms still present after force-stop"**
+- **Cause:** Device/emulator doesn't clear alarms on force-stop (common on some devices)
+- **Fix:** This is device-specific behavior. In `STRICTNESS=soft` mode, this is a warning. In `STRICTNESS=hard` mode, this is a failure.
+- **Evidence:** Check `alarms/phase2_test1_after_force_stop_alarms.txt`
+
+**"rescheduled>0 on first launch"**
+- **Cause:** Boot recovery misfiring on empty database
+- **Fix:** Check recovery logic, verify database is actually empty
+- **Evidence:** Check `logs/phase2_test3_after_reboot_logcat.txt`
+
+### Phase 3 Failures
+
+**"Boot recovery not detected"**
+- **Cause:** Boot receiver not registered, or BOOT_COMPLETED permission missing
+- **Fix:** Verify `AndroidManifest.xml` has boot receiver and permission
+- **Evidence:** Check `logs/phase3_test1_after_reboot_logcat.txt`, verify boot receiver logs
+
+**"No missed alarms detected"**
+- **Cause:** Alarm time didn't actually pass before reboot
+- **Fix:** Verify system time was advanced, or wait longer before reboot
+- **Evidence:** Check `alarms/phase3_test2_after_wait_alarms.txt`, verify alarm time is in past
+
+**"Boot recovery ran but alarms not recreated"**
+- **Cause:** Recovery succeeded but alarm scheduling failed
+- **Fix:** Check alarm scheduling logic, verify permissions
+- **Evidence:** Check `logs/phase3_test4_after_reboot_logcat.txt`, verify `rescheduled>0` but `after_count=0`
+
+### General Failures
+
+**"ADB device not found"**
+- **Cause:** Device disconnected, ADB not in PATH, or device not authorized
+- **Fix:** Run `adb devices`, verify device shows as "device" (not "unauthorized"), reconnect if needed
+
+**"App build failed"**
+- **Cause:** Gradle issues, missing dependencies, or Java version mismatch
+- **Fix:** Run `./gradlew clean`, verify Java version, check `build.gradle` dependencies
+
+**"Permission denied"**
+- **Cause:** App doesn't have required permissions
+- **Fix:** Grant permissions via app UI or `adb shell pm grant`
+
+---
+
+## Troubleshooting
+
+### Script Won't Run
+
+**"Permission denied: ./test-phase1.sh"**
+```bash
+chmod +x test-phase1.sh test-phase2.sh test-phase3.sh alarm-test-lib.sh
+```
+
+**"Command not found: adb"**
+```bash
+# Add Android SDK platform-tools to PATH
+export PATH="$PATH:$ANDROID_HOME/platform-tools"
+```
+
+**"Syntax error near unexpected token"**
+```bash
+# Verify bash version
+bash --version  # Should be 4.0+
+
+# Check script syntax
+bash -n test-phase1.sh
+```
+
+### Evidence Not Captured
+
+**"Run directory not initialized"**
+- **Cause:** `ensure_run_dir()` was never called by the script, or it failed
+- **Fix:** Check write permissions in test directory, verify `runs/` directory can be created
+
+**"Screenshot capture failed"**
+- **Cause:** `ENABLE_SCREENSHOTS=0` or device doesn't support screencap
+- **Fix:** Set `ENABLE_SCREENSHOTS=1` (default), verify device supports `adb exec-out screencap`
+
+**"Logcat capture empty"**
+- **Cause:** Logs cleared before capture, or filter pattern doesn't match
+- **Fix:** Verify log tag matches filter pattern, check logs weren't cleared
+
+### Device/Emulator Issues
+
+**"Emulator not responding"**
+```bash
+# Restart emulator
+adb kill-server
+adb start-server
+adb devices
+```
+
+**"Device unauthorized"**
+- **Cause:** USB debugging authorization not granted
+- **Fix:** Check device screen for authorization prompt, click "Allow"
+
+**"Alarms not clearing on force-stop"**
+- **Cause:** Device-specific behavior (some devices don't clear alarms)
+- **Fix:** This is expected on some devices. Use `STRICTNESS=soft` mode to treat as warning.
+
+### Test-Specific Issues
+
+**"Phase 3 reboot takes too long"**
+- **Cause:** Emulator is slow or hung
+- **Fix:** Wait longer (60-90 seconds), or restart emulator if completely hung
+
+**"Time advancement not working"**
+- **Cause:** System time can't be advanced (requires root or emulator)
+- **Fix:** Use emulator with root, or manually advance time via emulator settings
+
+**"Plugin not configured"**
+- **Cause:** Plugin setup not completed
+- **Fix:** Run `./test-phase1.sh --setup` to configure plugin, or manually configure in app UI
+
+---
+
+## Quick Reference
+
+### Command Cheat Sheet
+
+```bash
+# Phase 1
+./test-phase1.sh --setup          # Setup only
+./test-phase1.sh --all            # All tests
+./test-phase1.sh --smoke          # Smoke test
+./test-phase1.sh 0                # TEST 0 only
+
+# Phase 2
+./test-phase2.sh                  # All tests (soft mode)
+STRICTNESS=hard ./test-phase2.sh  # All tests (hard mode)
+./test-phase2.sh 1 3              # TEST 1 and 3
+
+# Phase 3
+./test-phase3.sh                  # All tests (advisory)
+./test-phase3.sh --gate-phase3    # All tests (release-blocking)
+./test-phase3.sh 2                # TEST 2 only
+```
+
+### Evidence Locations
+
+```bash
+# Latest run
+ls -la runs/$(ls -t runs/ | head -1)
+
+# Specific evidence
+cat runs/<RUN_ID>/alarms/phase1_test0_initial_alarms.txt
+cat runs/<RUN_ID>/logs/phase1_test0_after_schedule_logcat.txt
+```
+
+### Verdict Meanings
+
+- **✅ PASS:** Test passed, continue
+- **⚠️ WARN:** Minor issue, review evidence, continue
+- **❌ FAIL:** Critical failure, review evidence, may exit (if gating enabled)
+
+---
+
+## Support & Feedback
+
+**Issues or Questions?**
+- Check evidence files in `runs/<RUN_ID>/`
+- Review test logs for detailed error messages
+- Consult platform-specific documentation in `docs/platform/android/`
+
+**Improving This Runbook:**
+- Document new failure patterns as they're discovered
+- Update time estimates based on actual test runs
+- Add automation hints for new test scenarios
+
+---
+
+**Last Updated:** 2025-12-24  
+**Version:** 1.0  
+**Maintainer:** Test Engineering Team
+
--- a/test-apps/android-test-app/test-phase1.sh
+++ b/test-apps/android-test-app/test-phase1.sh
--- a/test-apps/android-test-app/test-phase2.sh
+++ b/test-apps/android-test-app/test-phase2.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash

 set -euo pipefail
+IFS=$'\n\t'

 # ========================================
 # Phase 2 Testing Script – Force Stop Recovery
@@ -10,6 +11,12 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/alarm-test-lib.sh"

+# Initialize run directory (P1)
+ensure_run_dir || {
+  error "Failed to initialize run directory"
+  exit 1
+}
+
 # Phase 2 specific configuration
 # Log tags / patterns (matched to actual ReactivationManager logs)
 FORCE_STOP_SCENARIO_VALUE="FORCE_STOP"
@@ -17,6 +24,11 @@ COLD_START_SCENARIO_VALUE="COLD_START"
 NONE_SCENARIO_VALUE="NONE"
 BOOT_SCENARIO_VALUE="BOOT"

+# Strictness policy (P3.2)
+# soft: minor device quirks = warn (default, for development)
+# hard: any unexpected alarm loss / missed schedule = fail (for release gating)
+: "${STRICTNESS:=soft}"
+
 # Allow selecting specific tests on the command line (e.g. ./test-phase2.sh 2 3)
 SELECTED_TESTS=()

@@ -27,16 +39,31 @@ SELECTED_TESTS=()
 test1_force_stop_cleared_alarms() {
  section "TEST 1: Force Stop – Alarms Cleared"

-  echo "Purpose: Verify force stop detection and alarm rescheduling when alarms are cleared."
+  info "Purpose: Verify force stop detection and alarm rescheduling when alarms are cleared."
+  info "Expected time: 5-8 minutes"
+  info "Automatable: Partial (requires manual force-stop verification)"
+  echo ""

  pause

+  # Capture initial state
+  capture_alarms "phase2_test1_initial"
+  capture_logcat "phase2_test1_initial" "DNP" 50
+
  substep "Step 1: Launch app & check plugin status"
  launch_app

-  ui_prompt "In the app UI, verify plugin status:\n\n  ⚙️ Plugin Settings: ✅ Configured\n  🔌 Native Fetcher: ✅ Configured\n\nIf either shows ❌ or 'Not configured', click 'Configure Plugin', wait until both are ✅, then press Enter."
+  ui_prompt "1) In the app UI, verify plugin status:

-  ui_prompt "Now schedule at least one future notification (e.g., click 'Test Notification' to schedule for a few minutes in the future)."
+  ⚙️ Plugin Settings: ✅ Configured
+  🔌 Native Fetcher: ✅ Configured
+
+  If either shows ❌ or 'Not configured', click 'Configure Plugin', wait until both are ✅, then press Enter."
+
+  ui_prompt "2) Now schedule at least one future notification (e.g., click 'Test Notification' to schedule for a few minutes in the future)."
+
+  # Capture before force-stop state
+  capture_alarms "phase2_test1_before_force_stop"

  substep "Step 2: Verify alarms are scheduled"
  show_alarms
@@ -59,6 +86,9 @@ test1_force_stop_cleared_alarms() {
  substep "Step 3: Force stop app (should clear alarms on many devices)"
  force_stop_app

+  # Capture after force-stop state
+  capture_alarms "phase2_test1_after_force_stop"
+
  substep "Step 4: Check alarms after force stop"
  local after_count system_after
  after_count="$(get_plugin_alarm_count)"
@@ -68,8 +98,12 @@ test1_force_stop_cleared_alarms() {
  show_alarms

  if [[ "$after_count" -gt 0 ]]; then
-    warn "Plugin alarms still present after force stop. This device/OS may not clear alarms on force stop."
-    warn "TEST 1 will continue but may not fully validate FORCE_STOP scenario."
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      error "Plugin alarms still present after force stop (strict mode: hard)"
+    else
+      warn "Plugin alarms still present after force stop. This device/OS may not clear alarms on force stop."
+      warn "TEST 1 will continue but may not fully validate FORCE_STOP scenario."
+    fi
  fi

  pause
@@ -77,7 +111,7 @@ test1_force_stop_cleared_alarms() {
  substep "Step 4.5: Clear boot flag (prevent false BOOT detection)"
  # Clear boot flag to ensure force stop detection works correctly
  # Boot flag might be set from previous runs or emulator quirks
-  adb shell "run-as ${APP_ID} rm -f shared_prefs/dailynotification_recovery.xml 2>/dev/null || true"
+  $ADB_BIN shell "run-as ${APP_ID} rm -f shared_prefs/dailynotification_recovery.xml 2>/dev/null || true"
  info "Boot flag cleared (if it existed)"

  substep "Step 5: Launch app (triggers recovery) and capture logs"
@@ -85,6 +119,11 @@ test1_force_stop_cleared_alarms() {
  launch_app
  sleep 5  # give recovery a moment to run

+  # Capture after recovery state
+  capture_alarms "phase2_test1_after_recovery"
+  capture_logcat "phase2_test1_after_recovery" "DNP-REACTIVATION" 250
+  capture_screenshot "phase2_test1_after_recovery"
+
  info "Collecting recovery logs..."
  local logs
  logs="$(get_recovery_logs)"
@@ -104,24 +143,64 @@ test1_force_stop_cleared_alarms() {
  echo "  errors     = ${errors}"
  echo

+  # Determine verdict based on STRICTNESS policy
+  local test1_passed=false
+  local test1_message=""
+
  if [[ "$errors" -gt 0 ]]; then
    error "Recovery reported errors>0 (errors=$errors)"
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      test1_message="Recovery reported errors (errors=$errors)"
+    else
+      test1_message="Recovery reported errors but continuing (errors=$errors, strictness=soft)"
+    fi
  fi

  if [[ "$scenario" == "$FORCE_STOP_SCENARIO_VALUE" && "$rescheduled" -gt 0 ]]; then
    ok "TEST 1 PASSED: Force stop detected and alarms rescheduled (scenario=$scenario, rescheduled=$rescheduled)."
+    test1_passed=true
+    test1_message="Force stop detected and alarms rescheduled (scenario=$scenario, rescheduled=$rescheduled)"
  elif [[ "$scenario" == "$FORCE_STOP_SCENARIO_VALUE" && "$rescheduled" -eq 0 ]]; then
-    warn "TEST 1: scenario=FORCE_STOP but rescheduled=0. Check implementation or logs."
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      test1_message="scenario=FORCE_STOP but rescheduled=0 (strict mode: hard)"
+    else
+      warn "TEST 1: scenario=FORCE_STOP but rescheduled=0. Check implementation or logs."
+      test1_message="scenario=FORCE_STOP but rescheduled=0 (strictness=soft)"
+    fi
  elif [[ "$after_count" -gt 0 ]]; then
    info "TEST 1: Device/emulator kept alarms after force stop; FORCE_STOP scenario may not trigger here."
    if [[ "$rescheduled" -gt 0 ]]; then
      info "Recovery still worked (rescheduled=$rescheduled), but scenario was ${scenario:-COLD_START} instead of FORCE_STOP"
+      test1_passed=true
+      test1_message="Recovery worked but FORCE_STOP scenario not detected (device kept alarms, rescheduled=$rescheduled, scenario=${scenario:-COLD_START})"
+    else
+      if [[ "$STRICTNESS" == "hard" ]]; then
+        test1_message="Device kept alarms but recovery didn't reschedule (strict mode: hard)"
+      else
+        test1_message="Device kept alarms, FORCE_STOP scenario may not trigger (strictness=soft)"
+      fi
    fi
  else
-    warn "TEST 1: Expected FORCE_STOP scenario not clearly detected. Review logs and scenario detection logic."
-    info "Scenario detected: ${scenario:-<none>}, rescheduled=$rescheduled"
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      test1_message="Expected FORCE_STOP scenario not detected (strict mode: hard, scenario=${scenario:-<none>}, rescheduled=$rescheduled)"
+    else
+      warn "TEST 1: Expected FORCE_STOP scenario not clearly detected. Review logs and scenario detection logic."
+      info "Scenario detected: ${scenario:-<none>}, rescheduled=$rescheduled"
+      test1_message="FORCE_STOP scenario not clearly detected (scenario=${scenario:-<none>}, rescheduled=$rescheduled, strictness=soft)"
+    fi
  fi

+  # Emit verdict
+  if [[ "$test1_passed" == "true" ]]; then
+    verdict_pass "phase2_test1_force_stop_cleared" "$test1_message"
+  elif [[ "$STRICTNESS" == "hard" ]]; then
+    verdict_fail "phase2_test1_force_stop_cleared" "$test1_message"
+  else
+    verdict_warn "phase2_test1_force_stop_cleared" "$test1_message"
+  fi
+
+  evidence_block "phase2_test1_force_stop_cleared"
+
  substep "Step 6: Verify alarms are rescheduled in AlarmManager"
  show_alarms
 }
@@ -133,13 +212,25 @@ test1_force_stop_cleared_alarms() {
 test2_force_stop_intact_alarms() {
  section "TEST 2: Force Stop / Process Stop – Alarms Intact"

-  echo "Purpose: Verify that heavy FORCE_STOP recovery does not run when alarms are still present."
+  info "Purpose: Verify that heavy FORCE_STOP recovery does not run when alarms are still present."
+  info "Expected time: 4-6 minutes"
+  info "Automatable: Partial (requires manual verification)"
+  echo ""

  pause

+  # Capture initial state
+  capture_alarms "phase2_test2_initial"
+  capture_logcat "phase2_test2_initial" "DNP" 50
+
  substep "Step 1: Launch app & schedule notifications"
  launch_app
-  ui_prompt "In the app UI, ensure plugin is configured and schedule at least one future notification.\n\nPress Enter when done."
+  ui_prompt "1) In the app UI, ensure plugin is configured and schedule at least one future notification.
+
+  Press Enter when done."
+
+  # Capture before soft stop state
+  capture_alarms "phase2_test2_before_soft_stop"

  substep "Step 2: Verify alarms are scheduled"
  show_alarms
@@ -165,6 +256,9 @@ test2_force_stop_intact_alarms() {
  sleep 2
  ok "Kill signal sent (soft stop)"

+  # Capture after soft stop state
+  capture_alarms "phase2_test2_after_soft_stop"
+
  substep "Step 4: Verify alarms are still scheduled"
  local after system_after
  after="$(get_plugin_alarm_count)"
@@ -174,7 +268,11 @@ test2_force_stop_intact_alarms() {
  show_alarms

  if [[ "$after" -eq 0 ]]; then
-    warn "Alarms appear cleared after soft stop; this environment may not distinguish TEST 2 well."
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      error "Alarms cleared after soft stop (strict mode: hard)"
+    else
+      warn "Alarms appear cleared after soft stop; this environment may not distinguish TEST 2 well."
+    fi
  fi

  pause
@@ -184,6 +282,11 @@ test2_force_stop_intact_alarms() {
  launch_app
  sleep 5

+  # Capture after recovery state
+  capture_alarms "phase2_test2_after_recovery"
+  capture_logcat "phase2_test2_after_recovery" "DNP-REACTIVATION" 250
+  capture_screenshot "phase2_test2_after_recovery"
+
  info "Collecting recovery logs..."
  local logs
  logs="$(get_recovery_logs)"
@@ -205,16 +308,49 @@ test2_force_stop_intact_alarms() {
  echo "  errors     = ${errors}"
  echo

+  # Determine verdict based on STRICTNESS policy
+  local test2_passed=false
+  local test2_message=""
+
  if [[ "$errors" -gt 0 ]]; then
    error "Recovery reported errors>0 (errors=$errors)"
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      test2_message="Recovery reported errors (errors=$errors)"
+    else
+      test2_message="Recovery reported errors but continuing (errors=$errors, strictness=soft)"
+    fi
  fi

  if [[ "$after" -gt 0 && "$rescheduled" -eq 0 && "$scenario" != "$FORCE_STOP_SCENARIO_VALUE" ]]; then
    ok "TEST 2 PASSED: Alarms remained intact, and FORCE_STOP scenario did not run (scenario=$scenario, rescheduled=0)."
+    test2_passed=true
+    test2_message="Alarms remained intact, FORCE_STOP scenario did not run (scenario=$scenario, rescheduled=0)"
  else
-    warn "TEST 2: Verify that FORCE_STOP recovery didn't misfire when alarms were intact."
-    info "Scenario=${scenario:-<none>}, rescheduled=$rescheduled, after_count=$after"
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      if [[ "$after" -eq 0 ]]; then
+        test2_message="Alarms were cleared but should have remained (strict mode: hard)"
+      elif [[ "$rescheduled" -gt 0 ]]; then
+        test2_message="FORCE_STOP recovery ran when alarms were intact (strict mode: hard, scenario=$scenario, rescheduled=$rescheduled)"
+      else
+        test2_message="Unexpected state (strict mode: hard, scenario=$scenario, rescheduled=$rescheduled, after=$after)"
+      fi
+    else
+      warn "TEST 2: Verify that FORCE_STOP recovery didn't misfire when alarms were intact."
+      info "Scenario=${scenario:-<none>}, rescheduled=$rescheduled, after_count=$after"
+      test2_message="FORCE_STOP recovery may have misfired (scenario=$scenario, rescheduled=$rescheduled, after=$after, strictness=soft)"
+    fi
  fi
+
+  # Emit verdict
+  if [[ "$test2_passed" == "true" ]]; then
+    verdict_pass "phase2_test2_force_stop_intact" "$test2_message"
+  elif [[ "$STRICTNESS" == "hard" ]]; then
+    verdict_fail "phase2_test2_force_stop_intact" "$test2_message"
+  else
+    verdict_warn "phase2_test2_force_stop_intact" "$test2_message"
+  fi
+
+  evidence_block "phase2_test2_force_stop_intact"
 }

 # ------------------------------------------------------------------------------
@@ -224,10 +360,16 @@ test2_force_stop_intact_alarms() {
 test3_first_launch_no_schedules() {
  section "TEST 3: First Launch / No Schedules Safeguard"

-  echo "Purpose: Ensure force-stop recovery is NOT triggered when DB is empty or plugin isn't configured."
+  info "Purpose: Ensure force-stop recovery is NOT triggered when DB is empty or plugin isn't configured."
+  info "Expected time: 3-5 minutes"
+  info "Automatable: Yes"
+  echo ""

  pause

+  # Capture initial state (before uninstall)
+  capture_alarms "phase2_test3_initial"
+
  substep "Step 1: Uninstall app to clear DB/state"
  set +e
  $ADB_BIN uninstall "$APP_ID" >/dev/null 2>&1
@@ -243,7 +385,7 @@ test3_first_launch_no_schedules() {
  fi

  info "Clearing logcat..."
-  $ADB_BIN logcat -c
+  clear_logs
  ok "Logs cleared"

  pause
@@ -252,6 +394,11 @@ test3_first_launch_no_schedules() {
  launch_app
  sleep 5

+  # Capture after first launch state
+  capture_alarms "phase2_test3_after_first_launch"
+  capture_logcat "phase2_test3_after_first_launch" "DNP-REACTIVATION" 250
+  capture_screenshot "phase2_test3_after_first_launch"
+
  substep "Step 4: Collect logs and ensure no force-stop recovery ran"
  local logs
  logs="$(get_recovery_logs)"
@@ -267,15 +414,45 @@ test3_first_launch_no_schedules() {
  echo "  rescheduled= ${rescheduled}"
  echo

+  # Determine verdict based on STRICTNESS policy
+  local test3_passed=false
+  local test3_message=""
+
  if [[ -z "$logs" ]]; then
    ok "TEST 3 PASSED: No force-stop recovery logs on first launch."
+    test3_passed=true
+    test3_message="No force-stop recovery logs on first launch (expected behavior)"
  elif [[ "$scenario" == "$NONE_SCENARIO_VALUE" && "$rescheduled" -eq 0 ]]; then
    ok "TEST 3 PASSED: NONE scenario logged with rescheduled=0 on first launch."
+    test3_passed=true
+    test3_message="NONE scenario logged with rescheduled=0 on first launch (expected behavior)"
  elif [[ "$rescheduled" -gt 0 ]]; then
-    warn "TEST 3: rescheduled>0 on first launch / empty DB. Check that force-stop recovery isn't misfiring."
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      test3_message="rescheduled>0 on first launch / empty DB - force-stop recovery misfired (strict mode: hard, rescheduled=$rescheduled)"
+    else
+      warn "TEST 3: rescheduled>0 on first launch / empty DB. Check that force-stop recovery isn't misfiring."
+      test3_message="rescheduled>0 on first launch (rescheduled=$rescheduled, strictness=soft)"
+    fi
  else
-    info "TEST 3: Logs present but no rescheduling; review scenario handling to ensure it's explicit about NONE / FIRST_LAUNCH."
+    if [[ "$STRICTNESS" == "hard" ]]; then
+      test3_message="Logs present but scenario unclear (strict mode: hard, scenario=${scenario:-<none>}, rescheduled=$rescheduled)"
+    else
+      info "TEST 3: Logs present but no rescheduling; review scenario handling to ensure it's explicit about NONE / FIRST_LAUNCH."
+      test3_passed=true  # Not a failure, just needs review
+      test3_message="Logs present but no rescheduling (scenario=${scenario:-<none>}, rescheduled=$rescheduled, strictness=soft)"
+    fi
  fi
+
+  # Emit verdict
+  if [[ "$test3_passed" == "true" ]]; then
+    verdict_pass "phase2_test3_first_launch_no_schedules" "$test3_message"
+  elif [[ "$STRICTNESS" == "hard" ]]; then
+    verdict_fail "phase2_test3_first_launch_no_schedules" "$test3_message"
+  else
+    verdict_warn "phase2_test3_first_launch_no_schedules" "$test3_message"
+  fi
+
+  evidence_block "phase2_test3_first_launch_no_schedules"
 }

 # ------------------------------------------------------------------------------
@@ -285,26 +462,34 @@ test3_first_launch_no_schedules() {
 main() {
  # Allow selecting specific tests: e.g. `./test-phase2.sh 1 3`
  if [[ "$#" -gt 0 && ( "$1" == "-h" || "$1" == "--help" ) ]]; then
-    echo "Usage: $0 [TEST_IDS...]"
+    echo "Usage: [STRICTNESS=soft|hard] $0 [TEST_IDS...]"  # STRICTNESS is read from the environment; every positional arg is treated as a test ID
    echo
    echo "If no TEST_IDS are given, all tests (1, 2, 3) will run."
+    echo
+    echo "STRICTNESS policy (P3.2):"
+    echo "  soft (default): minor device quirks = warn"
+    echo "  hard: any unexpected alarm loss / missed schedule = fail"
+    echo
    echo "Examples:"
-    echo "  $0          # run all tests"
-    echo "  $0 1        # run only TEST 1"
-    echo "  $0 2 3      # run only TEST 2 and TEST 3"
+    echo "  $0                    # run all tests (soft mode)"
+    echo "  $0 1                  # run only TEST 1 (soft mode)"
+    echo "  $0 2 3                # run only TEST 2 and TEST 3 (soft mode)"
+    echo "  STRICTNESS=hard $0    # run all tests (hard mode, release gating)"
    return 0
  fi

  SELECTED_TESTS=("$@")

-  echo
-  echo "========================================"
-  echo "Phase 2 Testing Script – Force Stop Recovery"
-  echo "========================================"
-  echo
-  echo "This script will guide you through Phase 2 tests."
-  echo "You'll be prompted when UI interaction is needed."
-  echo
+  section "Phase 2 Testing Script – Force Stop Recovery"
+  
+  info "Mode: Standard"
+  info "Strictness: ${STRICTNESS} (soft=warn on quirks, hard=fail on issues)"
+  info "Run ID: ${RUN_ID}"
+  info "Evidence directory: $(get_run_dir)"
+  echo ""
+  info "This script will guide you through Phase 2 tests."
+  info "You'll be prompted when UI interaction is needed."
+  echo ""

  pause

@@ -328,25 +513,23 @@ main() {

  section "Testing Complete"

-  echo "Test Results Summary (see logs above for details):"
-  echo
-  echo "TEST 1: Force Stop – Alarms Cleared"
-  echo "  - Check logs for scenario=$FORCE_STOP_SCENARIO_VALUE and rescheduled>0"
-  echo
-  echo "TEST 2: Force Stop / Process Stop – Alarms Intact"
-  echo "  - Verify FORCE_STOP scenario is not incorrectly triggered when alarms are still present"
-  echo
-  echo "TEST 3: First Launch / No Schedules"
-  echo "  - Confirm that no force-stop recovery runs, or that NONE/FIRST_LAUNCH scenario is logged with rescheduled=0"
-  echo
+  info "Test Results Summary:"
+  echo ""
+  echo "All test verdicts are shown above with evidence locations."
+  echo "Review evidence in: $(get_run_dir)"
+  echo ""
+  echo "Strictness mode: ${STRICTNESS}"
+  echo "  - soft: Minor device quirks treated as warnings"
+  echo "  - hard: Any unexpected behavior treated as failures"
+  echo ""

  ok "Phase 2 testing script complete!"
-  echo
+  echo ""
  echo "Next steps:"
-  echo "  - Review logs above"
-  echo "  - Capture snippets into PHASE2-EMULATOR-TESTING.md"
-  echo "  - Update PHASE2-VERIFICATION.md and unified directive status matrix"
-  echo
+  echo "  - Review evidence in: $(get_run_dir)"
+  echo "  - Verify all test verdicts above"
+  echo "  - Update documentation with test results"
+  echo ""
 }

 main "$@"
--- a/test-apps/android-test-app/test-phase3.sh
+++ b/test-apps/android-test-app/test-phase3.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash

 set -euo pipefail
+IFS=$'\n\t'

 # ========================================
 # Phase 3 Testing Script – Boot Recovery
@@ -10,11 +11,22 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/alarm-test-lib.sh"

+# Initialize run directory (P1)
+ensure_run_dir || {
+  error "Failed to initialize run directory"
+  exit 1
+}
+
 # Phase 3 specific configuration
 # Log tags / patterns (matched to actual ReactivationManager logs)
 BOOT_SCENARIO_VALUE="BOOT"
 NONE_SCENARIO_VALUE="NONE"

+# Release gating config (P4.1)
+# 0 = advisory mode (default): failures become warnings, continue
+# 1 = release-blocking mode: failures exit with nonzero
+: "${RELEASE_GATE_PHASE3:=0}"
+
 # Allow selecting specific tests on the command line (e.g. ./test-phase3.sh 1 3)
 SELECTED_TESTS=()

@@ -39,16 +51,32 @@ extract_scenario_from_logs() {
 test1_boot_future_alarms() {
  section "TEST 1: Boot with Future Alarms"

-  echo "Purpose: Verify alarms are recreated on boot when schedules have future run times."
+  info "Purpose: Verify alarms are recreated on boot when schedules have future run times."
+  info "Expected time: 2-3 minutes (includes 30-60s reboot)"
+  info "Automatable: Partial (requires manual reboot confirmation)"
+  info "If you see: 'Boot recovery not detected' → Check boot receiver registration and BOOT_COMPLETED permission"
+  echo ""

  pause

+  # Capture initial state
+  capture_alarms "phase3_test1_initial"
+  capture_logcat "phase3_test1_initial" "DNP" 50
+
  substep "Step 1: Launch app & check plugin status"
  launch_app

-  ui_prompt "In the app UI, verify plugin status:\n\n  ⚙️ Plugin Settings: ✅ Configured\n  🔌 Native Fetcher: ✅ Configured\n\nIf either shows ❌ or 'Not configured', click 'Configure Plugin', wait until both are ✅, then press Enter."
+  ui_prompt "1) In the app UI, verify plugin status:

-  ui_prompt "Now schedule at least one future notification (e.g., click 'Test Notification' to schedule for a few minutes in the future)."
+  ⚙️ Plugin Settings: ✅ Configured
+  🔌 Native Fetcher: ✅ Configured
+
+  If either shows ❌ or 'Not configured', click 'Configure Plugin', wait until both are ✅, then press Enter."
+
+  ui_prompt "2) Now schedule at least one future notification (e.g., click 'Test Notification' to schedule for a few minutes in the future)."
+
+  # Capture before reboot state
+  capture_alarms "phase3_test1_before_reboot"

  substep "Step 2: Verify alarms are scheduled"
  show_alarms
@@ -76,6 +104,12 @@ test1_boot_future_alarms() {
  substep "Step 4: Collect boot recovery logs"
  info "Collecting recovery logs from boot..."
  sleep 2  # Give recovery a moment to complete
+
+  # Capture after reboot state
+  capture_alarms "phase3_test1_after_reboot"
+  capture_logcat "phase3_test1_after_reboot" "DNP-REACTIVATION" 250
+  capture_screenshot "phase3_test1_after_reboot"
+
  local logs
  logs="$(get_recovery_logs)"
  echo "$logs"
@@ -96,8 +130,29 @@ test1_boot_future_alarms() {
  echo "  errors     = ${errors}"
  echo

+  # Determine verdict
+  local test1_passed=false
+  local test1_message=""
+
  if [[ "$errors" -gt 0 ]]; then
    error "Recovery reported errors>0 (errors=$errors)"
+    test1_message="Recovery reported errors (errors=$errors)"
+  fi
+
+  if [[ "$scenario" == "$BOOT_SCENARIO_VALUE" && "$rescheduled" -gt 0 ]]; then
+    ok "TEST 1 PASSED: Boot recovery detected and alarms rescheduled (scenario=$scenario, rescheduled=$rescheduled)."
+    test1_passed=true
+    test1_message="Boot recovery detected and alarms rescheduled (scenario=$scenario, rescheduled=$rescheduled)"
+  elif grep -qi "boot recovery" <<<"$logs"; then  # here-string, not echo|grep: under pipefail an early grep -q exit can SIGPIPE echo and yield a false negative
+    if [[ "$rescheduled" -gt 0 ]]; then
+      ok "TEST 1 PASSED: Boot recovery ran and alarms rescheduled (rescheduled=$rescheduled)."
+      test1_passed=true
+      test1_message="Boot recovery ran and alarms rescheduled (rescheduled=$rescheduled)"
+    else
+      test1_message="Boot recovery ran but rescheduled=0. Check implementation or logs."
+    fi
+  else
+    test1_message="Boot recovery not clearly detected. Review logs and boot receiver implementation (scenario=${scenario:-<none>}, rescheduled=$rescheduled)"
  fi

  substep "Step 5: Verify alarms were recreated"
@@ -108,18 +163,22 @@ test1_boot_future_alarms() {
  info "Plugin alarms after boot: $after_count (expected: 1)"
  info "System/other alarms: $system_after (for context)"

-  if [[ "$scenario" == "$BOOT_SCENARIO_VALUE" && "$rescheduled" -gt 0 ]]; then
-    ok "TEST 1 PASSED: Boot recovery detected and alarms rescheduled (scenario=$scenario, rescheduled=$rescheduled)."
-  elif echo "$logs" | grep -qi "Starting boot recovery\|boot recovery"; then
-    if [[ "$rescheduled" -gt 0 ]]; then
-      ok "TEST 1 PASSED: Boot recovery ran and alarms rescheduled (rescheduled=$rescheduled)."
-    else
-      warn "TEST 1: Boot recovery ran but rescheduled=0. Check implementation or logs."
-    fi
-  else
-    warn "TEST 1: Boot recovery not clearly detected. Review logs and boot receiver implementation."
-    info "Scenario detected: ${scenario:-<none>}, rescheduled=$rescheduled"
+  if [[ "$after_count" -eq 0 && "$test1_passed" == "true" ]]; then
+    warn "Alarms were not recreated despite recovery success. Check alarm scheduling logic."
+    test1_message="Boot recovery succeeded but alarms not recreated (rescheduled=$rescheduled, after_count=$after_count)"
+    test1_passed=false
+  elif [[ "$after_count" -gt 0 && "$test1_passed" == "true" ]]; then
+    ok "Alarms successfully recreated after boot (after_count=$after_count)"
  fi
+
+  # Emit verdict
+  if [[ "$test1_passed" == "true" ]]; then
+    verdict_pass "phase3_test1_boot_future_alarms" "$test1_message"
+  else
+    verdict_fail "phase3_test1_boot_future_alarms" "$test1_message"
+  fi
+
+  evidence_block "phase3_test1_boot_future_alarms"
 }

 # ------------------------------------------------------------------------------
@@ -129,22 +188,44 @@ test1_boot_future_alarms() {
 test2_boot_past_alarms() {
  section "TEST 2: Boot with Past Alarms"

-  echo "Purpose: Verify missed alarms are detected and next occurrence is scheduled on boot."
+  info "Purpose: Verify missed alarms are detected and next occurrence is scheduled on boot."
+  info "Expected time: 5-6 minutes (includes 3min wait + 30-60s reboot)"
+  info "Automatable: Partial (requires manual time advancement or wait)"
+  info "If you see: 'No missed alarms detected' → Verify alarm time actually passed before reboot"
+  info "Automation hint: Use 'adb shell date' to check current time, advance if needed"
+  echo ""

  pause

+  # Capture initial state
+  capture_alarms "phase3_test2_initial"
+  capture_logcat "phase3_test2_initial" "DNP" 50
+
  substep "Step 1: Launch app & ensure plugin configured"
  launch_app

-  ui_prompt "In the app UI, verify plugin status:\n\n  ⚙️ Plugin Settings: ✅ Configured\n  🔌 Native Fetcher: ✅ Configured\n\nIf needed, click 'Configure Plugin', then press Enter."
+  ui_prompt "1) In the app UI, verify plugin status:

-  ui_prompt "Click 'Test Notification' to schedule a notification for 2 minutes in the future.\n\nAfter scheduling, we'll wait for the alarm time to pass, then reboot."
+  ⚙️ Plugin Settings: ✅ Configured
+  🔌 Native Fetcher: ✅ Configured
+
+  If needed, click 'Configure Plugin', then press Enter."
+
+  ui_prompt "2) Click 'Test Notification' to schedule a notification for 2 minutes in the future.
+
+  After scheduling, we'll wait for the alarm time to pass, then reboot."
+
+  # Capture before wait state
+  capture_alarms "phase3_test2_before_wait"

  substep "Step 2: Wait for alarm time to pass"
  info "Waiting 3 minutes for scheduled alarm time to pass..."
  warn "You can manually advance system time if needed (requires root/emulator)"
  sleep 180  # Wait 3 minutes

+  # Capture after wait state
+  capture_alarms "phase3_test2_after_wait"
+
  substep "Step 3: Verify alarm time has passed"
  info "Alarm time should now be in the past"
  show_alarms
@@ -159,6 +240,12 @@ test2_boot_past_alarms() {
  substep "Step 5: Collect boot recovery logs"
  info "Collecting recovery logs from boot..."
  sleep 2
+
+  # Capture after reboot state
+  capture_alarms "phase3_test2_after_reboot"
+  capture_logcat "phase3_test2_after_reboot" "DNP-REACTIVATION" 250
+  capture_screenshot "phase3_test2_after_reboot"
+
  local logs
  logs="$(get_recovery_logs)"
  echo "$logs"
@@ -182,17 +269,33 @@ test2_boot_past_alarms() {
  echo "  errors     = ${errors}"
  echo

+  # Determine verdict
+  local test2_passed=false
+  local test2_message=""
+
  if [[ "$errors" -gt 0 ]]; then
    error "Recovery reported errors>0 (errors=$errors)"
+    test2_message="Recovery reported errors (errors=$errors)"
  fi

  if [[ "$missed" -ge 1 && "$rescheduled" -ge 1 ]]; then
    ok "TEST 2 PASSED: Past alarms detected and next occurrence scheduled (missed=$missed, rescheduled=$rescheduled)."
+    test2_passed=true
+    test2_message="Past alarms detected and next occurrence scheduled (missed=$missed, rescheduled=$rescheduled)"
  elif [[ "$missed" -ge 1 ]]; then
-    warn "TEST 2: Past alarms detected (missed=$missed) but rescheduled=$rescheduled. Check reschedule logic."
+    test2_message="Past alarms detected (missed=$missed) but rescheduled=$rescheduled. Check reschedule logic."
  else
-    warn "TEST 2: No missed alarms detected. Verify alarm time actually passed before reboot."
+    test2_message="No missed alarms detected. Verify alarm time actually passed before reboot (missed=$missed, rescheduled=$rescheduled)"
  fi
+
+  # Emit verdict
+  if [[ "$test2_passed" == "true" ]]; then
+    verdict_pass "phase3_test2_boot_past_alarms" "$test2_message"
+  else
+    verdict_fail "phase3_test2_boot_past_alarms" "$test2_message"
+  fi
+
+  evidence_block "phase3_test2_boot_past_alarms"
 }

 # ------------------------------------------------------------------------------
@@ -202,10 +305,17 @@ test2_boot_past_alarms() {
 test3_boot_no_schedules() {
  section "TEST 3: Boot with No Schedules"

-  echo "Purpose: Verify boot recovery handles empty database gracefully."
+  info "Purpose: Verify boot recovery handles empty database gracefully."
+  info "Expected time: 2-3 minutes (includes 30-60s reboot)"
+  info "Automatable: Yes"
+  info "If you see: 'rescheduled>0 on first launch' → Check that boot recovery isn't misfiring"
+  echo ""

  pause

+  # Capture initial state (before uninstall)
+  capture_alarms "phase3_test3_initial"
+
  substep "Step 1: Uninstall app to clear DB/state"
  set +e
  $ADB_BIN uninstall "$APP_ID" >/dev/null 2>&1
@@ -221,7 +331,7 @@ test3_boot_no_schedules() {
  fi

  info "Clearing logcat..."
-  $ADB_BIN logcat -c
+  clear_logs
  ok "Logs cleared"

  pause
@@ -235,6 +345,12 @@ test3_boot_no_schedules() {
  substep "Step 4: Collect boot recovery logs"
  info "Collecting recovery logs from boot..."
  sleep 2
+
+  # Capture after reboot state
+  capture_alarms "phase3_test3_after_reboot"
+  capture_logcat "phase3_test3_after_reboot" "DNP-REACTIVATION" 250
+  capture_screenshot "phase3_test3_after_reboot"
+
  local logs
  logs="$(get_recovery_logs)"
  echo "$logs"
@@ -251,20 +367,37 @@ test3_boot_no_schedules() {
  echo "  missed    = ${missed}"
  echo

+  # Determine verdict
+  local test3_passed=false
+  local test3_message=""
+
  if [[ -z "$logs" ]]; then
    ok "TEST 3 PASSED: No recovery logs when there are no schedules (safe behavior)."
-    return
-  fi
-
-  if echo "$logs" | grep -qiE "No schedules found|No schedules present"; then
+    test3_passed=true
+    test3_message="No recovery logs when there are no schedules (safe behavior)"
+  elif grep -qiE "No schedules found|No schedules present" <<<"$logs"; then  # here-string, not echo|grep: under pipefail an early grep -q exit can SIGPIPE echo and yield a false negative
    ok "TEST 3 PASSED: Explicit 'No schedules found' message logged with no rescheduling."
+    test3_passed=true
+    test3_message="Explicit 'No schedules found' message logged with no rescheduling"
  elif [[ "$scenario" == "$NONE_SCENARIO_VALUE" && "$rescheduled" -eq 0 ]]; then
    ok "TEST 3 PASSED: NONE scenario detected with no rescheduling."
+    test3_passed=true
+    test3_message="NONE scenario detected with no rescheduling (scenario=$scenario, rescheduled=$rescheduled)"
  elif [[ "$rescheduled" -gt 0 ]]; then
-    warn "TEST 3: rescheduled>0 on first launch / empty DB. Check that boot recovery isn't misfiring."
+    test3_message="rescheduled>0 on first launch / empty DB. Check that boot recovery isn't misfiring (rescheduled=$rescheduled)"
  else
-    info "TEST 3: Logs present but no rescheduling; review scenario handling to ensure it's explicit about NONE / NO_SCHEDULES."
+    test3_passed=true  # Not a failure, just needs review
+    test3_message="Logs present but no rescheduling; review scenario handling to ensure it's explicit about NONE / NO_SCHEDULES (scenario=${scenario:-<none>}, rescheduled=$rescheduled)"
  fi
+
+  # Emit verdict
+  if [[ "$test3_passed" == "true" ]]; then
+    verdict_pass "phase3_test3_boot_no_schedules" "$test3_message"
+  else
+    verdict_fail "phase3_test3_boot_no_schedules" "$test3_message"
+  fi
+
+  evidence_block "phase3_test3_boot_no_schedules"
 }

 # ------------------------------------------------------------------------------
@@ -274,16 +407,32 @@ test3_boot_no_schedules() {
 test4_silent_boot_recovery() {
  section "TEST 4: Silent Boot Recovery (App Never Opened)"

-  echo "Purpose: Verify boot recovery occurs even when the app is never opened after reboot."
+  info "Purpose: Verify boot recovery occurs even when the app is never opened after reboot."
+  info "Expected time: 2-3 minutes (includes 30-60s reboot)"
+  info "Automatable: Partial (requires manual verification that app was not opened)"
+  info "If you see: 'Boot recovery not detected' → Verify boot receiver is registered and has BOOT_COMPLETED permission"
+  echo ""

  pause

+  # Capture initial state
+  capture_alarms "phase3_test4_initial"
+  capture_logcat "phase3_test4_initial" "DNP" 50
+
  substep "Step 1: Launch app & ensure plugin configured"
  launch_app

-  ui_prompt "In the app UI, verify plugin status:\n\n  ⚙️ Plugin Settings: ✅ Configured\n  🔌 Native Fetcher: ✅ Configured\n\nIf needed, click 'Configure Plugin', then press Enter."
+  ui_prompt "1) In the app UI, verify plugin status:

-  ui_prompt "Click 'Test Notification' to schedule a notification for a few minutes in the future."
+  ⚙️ Plugin Settings: ✅ Configured
+  🔌 Native Fetcher: ✅ Configured
+
+  If needed, click 'Configure Plugin', then press Enter."
+
+  ui_prompt "2) Click 'Test Notification' to schedule a notification for a few minutes in the future."
+
+  # Capture before reboot state
+  capture_alarms "phase3_test4_before_reboot"

  substep "Step 2: Verify alarms are scheduled"
  show_alarms
@@ -312,6 +461,12 @@ test4_silent_boot_recovery() {
  substep "Step 4: Collect boot recovery logs (without opening app)"
  info "Collecting recovery logs from boot (app was NOT opened)..."
  sleep 2
+
+  # Capture after reboot state (without opening app)
+  capture_alarms "phase3_test4_after_reboot"
+  capture_logcat "phase3_test4_after_reboot" "DNP-REACTIVATION" 250
+  capture_screenshot "phase3_test4_after_reboot"
+
  local logs
  logs="$(get_recovery_logs)"
  echo "$logs"
@@ -340,15 +495,37 @@ test4_silent_boot_recovery() {
  info "Plugin alarms after boot (app never opened): $after_count (expected: 1)"
  info "System/other alarms: $system_after (for context)"

+  # Determine verdict
+  local test4_passed=false
+  local test4_message=""
+
+  # Recovery errors take precedence: a run that reported errors must not be
+  # given a passing verdict, whatever the alarm/reschedule counts say.
+  if [[ "$errors" -gt 0 ]]; then
+    error "Recovery reported errors>0 (errors=$errors)"
+    test4_message="Recovery reported errors (errors=$errors, rescheduled=$rescheduled, after_count=$after_count)"
-  if [[ "$after_count" -gt 0 && "$rescheduled" -gt 0 ]]; then
+  elif [[ "$after_count" -gt 0 && "$rescheduled" -gt 0 ]]; then
    ok "TEST 4 PASSED: Boot recovery occurred silently and alarms were recreated (rescheduled=$rescheduled) without app launch."
+    test4_passed=true
+    test4_message="Boot recovery occurred silently and alarms were recreated (rescheduled=$rescheduled, after_count=$after_count) without app launch"
  elif [[ "$rescheduled" -gt 0 ]]; then
    ok "TEST 4 PASSED: Boot recovery occurred silently (rescheduled=$rescheduled), but alarm count check unclear."
+    test4_passed=true
+    test4_message="Boot recovery occurred silently (rescheduled=$rescheduled), but alarm count unclear (after_count=$after_count)"
  elif echo "$logs" | grep -qi "Starting boot recovery\|boot recovery"; then
-    warn "TEST 4: Boot recovery ran but alarms may not have been recreated. Check logs and implementation."
+    test4_message="Boot recovery ran but alarms may not have been recreated. Check logs and implementation (rescheduled=$rescheduled, after_count=$after_count)"
  else
-    warn "TEST 4: Boot recovery not detected. Verify boot receiver is registered and has BOOT_COMPLETED permission."
+    test4_message="Boot recovery not detected. Verify boot receiver is registered and has BOOT_COMPLETED permission (scenario=${scenario:-<none>}, rescheduled=$rescheduled)"
  fi
+
+  # Emit verdict
+  if [[ "$test4_passed" == "true" ]]; then
+    verdict_pass "phase3_test4_silent_boot_recovery" "$test4_message"
+  else
+    verdict_fail "phase3_test4_silent_boot_recovery" "$test4_message"
+  fi
+
+  evidence_block "phase3_test4_silent_boot_recovery"
 }

 # ------------------------------------------------------------------------------
@@ -356,32 +533,63 @@ test4_silent_boot_recovery() {
 # ------------------------------------------------------------------------------

 main() {
-  # Allow selecting specific tests: e.g. `./test-phase3.sh 1 3`
-  if [[ "$#" -gt 0 && ( "$1" == "-h" || "$1" == "--help" ) ]]; then
-    echo "Usage: $0 [TEST_IDS...]"
-    echo
-    echo "If no TEST_IDS are given, all tests (1, 2, 3, 4) will run."
-    echo "Examples:"
-    echo "  $0          # run all tests"
-    echo "  $0 1        # run only TEST 1"
-    echo "  $0 2 3      # run only TEST 2 and TEST 3"
-    echo "  $0 4        # run only TEST 4 (silent boot recovery)"
-    return 0
+  # Parse CLI args for --gate-phase3 flag
+  local gate_phase3=0
+  local test_args=()
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -h|--help)
+        echo "Usage: $0 [--gate-phase3] [TEST_IDS...]"
+        echo
+        echo "If no TEST_IDS are given, all tests (1, 2, 3, 4) will run."
+        echo
+        echo "Options:"
+        echo "  --gate-phase3    Enable release gating (failures exit with non-zero)"
+        echo "                   Equivalent to: RELEASE_GATE_PHASE3=1 $0"
+        echo
+        echo "Environment:"
+        echo "  RELEASE_GATE_PHASE3=0|1  Release gating mode (default: 0)"
+        echo "                           0 = advisory (warn and continue)"
+        echo "                           1 = release-blocking (fail and exit)"
+        echo
+        echo "Examples:"
+        echo "  $0                    # run all tests (advisory mode)"
+        echo "  $0 1                  # run only TEST 1 (advisory mode)"
+        echo "  $0 --gate-phase3     # run all tests (release-blocking mode)"
+        echo "  $0 --gate-phase3 2 3  # run TEST 2 and 3 (release-blocking mode)"
+        echo "  RELEASE_GATE_PHASE3=1 $0  # same as --gate-phase3"
+        return 0
+        ;;
+      --gate-phase3)
+        gate_phase3=1
+        shift
+        ;;
+      *)
+        test_args+=("$1")
+        shift
+        ;;
+    esac
+  done
+
+  # Set RELEASE_GATE_PHASE3 if flag was provided
+  if [[ "$gate_phase3" -eq 1 ]]; then
+    RELEASE_GATE_PHASE3=1
  fi

-  SELECTED_TESTS=("$@")
+  SELECTED_TESTS=(${test_args[@]+"${test_args[@]}"})  # '+' guard: an empty array counts as unbound under 'set -u' on bash < 4.4

-  echo
-  echo "========================================"
-  echo "Phase 3 Testing Script – Boot Recovery"
-  echo "========================================"
-  echo
-  echo "This script will guide you through Phase 3 tests."
-  echo "You'll be prompted when UI interaction is needed."
-  echo
-  echo "⚠️  WARNING: This script will reboot the emulator multiple times."
-  echo "    Each reboot takes 30-60 seconds."
-  echo
+  section "Phase 3 Testing Script – Boot Recovery"
+
+  info "Mode: ${RELEASE_GATE_PHASE3:-0} (0=advisory, 1=release-blocking)"
+  info "Run ID: ${RUN_ID}"
+  info "Evidence directory: $(get_run_dir)"
+  echo ""
+  info "This script will guide you through Phase 3 tests."
+  info "You'll be prompted when UI interaction is needed."
+  echo ""
+  warn "⚠️  WARNING: This script will reboot the emulator multiple times."
+  info "    Each reboot takes 30-60 seconds."
+  echo ""

  pause

@@ -410,28 +618,23 @@ main() {

  section "Testing Complete"

-  echo "Test Results Summary (see logs above for details):"
-  echo
-  echo "TEST 1: Boot with Future Alarms"
-  echo "  - Check logs for scenario=$BOOT_SCENARIO_VALUE and rescheduled>0"
-  echo
-  echo "TEST 2: Boot with Past Alarms"
-  echo "  - Check that missed>=1 and rescheduled>=1"
-  echo
-  echo "TEST 3: Boot with No Schedules"
-  echo "  - Check that no recovery runs, or NONE scenario is logged with rescheduled=0"
-  echo
-  echo "TEST 4: Silent Boot Recovery"
-  echo "  - Check that boot recovery occurred and alarms were recreated without app launch"
-  echo
+  info "Test Results Summary:"
+  echo ""
+  echo "All test verdicts are shown above with evidence locations."
+  echo "Review evidence in: $(get_run_dir)"
+  echo ""
+  echo "Release gating mode: ${RELEASE_GATE_PHASE3:-0}"
+  echo "  - 0 (advisory): Failures become warnings, script continues"
+  echo "  - 1 (release-blocking): Failures cause script to exit with non-zero"
+  echo ""

  ok "Phase 3 testing script complete!"
-  echo
+  echo ""
  echo "Next steps:"
-  echo "  - Review logs above"
-  echo "  - Capture snippets into PHASE3-EMULATOR-TESTING.md"
-  echo "  - Update PHASE3-VERIFICATION.md and unified directive status matrix"
-  echo
+  echo "  - Review evidence in: $(get_run_dir)"
+  echo "  - Verify all test verdicts above"
+  echo "  - Update documentation with test results"
+  echo ""
 }

 main "$@"