diff --git a/.github/CI_CD_WORKFLOWS.md b/.github/CI_CD_WORKFLOWS.md new file mode 100644 index 0000000..3d443e6 --- /dev/null +++ b/.github/CI_CD_WORKFLOWS.md @@ -0,0 +1,567 @@ +# CI/CD Workflows - Automated Testing & Validation + +**Purpose**: GitHub Copilot can scan code and provide recommendations but **cannot execute shell commands**. This document defines CI/CD workflows to automate the validation steps from `copilot-instructions.md`. + +--- + +## GitHub Copilot Capabilities vs CI/CD Requirements + +### What Copilot CAN Do +- ✅ Static code analysis and pattern detection +- ✅ YAML/JSON/code syntax validation +- ✅ Security pattern detection (hardcoded secrets, weak TLS, etc.) +- ✅ Best practice recommendations +- ✅ Documentation completeness checks +- ✅ Code style and convention validation + +### What Copilot CANNOT Do +- ❌ Execute shell commands (`helm lint`, `kubectl apply --dry-run`) +- ❌ Run container vulnerability scans (`trivy image`) +- ❌ Execute test suites (unit, integration, E2E) +- ❌ Deploy to Kubernetes clusters +- ❌ Perform dynamic security testing +- ❌ Generate performance benchmarks + +### Solution: Hybrid Approach +**Copilot** → Scan and recommend in PR reviews +**CI/CD** → Execute commands and enforce quality gates + +--- + +## Recommended CI/CD Pipeline Architecture + +### GitHub Actions Workflow Template + +**File**: `.github/workflows/validation.yml` + +```yaml +name: Code Validation & Security + +on: + pull_request: + branches: [main, maintenance] + push: + branches: [main, maintenance] + +permissions: + contents: read + pull-requests: write + security-events: write + +jobs: + lint: + name: Lint & Syntax Validation + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: YAML Lint + uses: ibiqlik/action-yamllint@2576378a8e339169678f9939646ee3ee325e845c # v3.1.1 + with: + file_or_dir: . 
+ config_file: .yamllint.yml + + - name: Helm Lint + run: | + helm lint ./*/helm 2>&1 | tee helm-lint.log + if grep -q "ERROR" helm-lint.log; then + echo "::error::Helm lint failed" + exit 1 + fi + + - name: Shell Script Lint + uses: ludeeus/action-shellcheck@00b27aa7cb85167568cb48a3838b75f4265f2bca # v2.0.0 + with: + scandir: './scripts' + + security: + name: Security Scanning + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Secret Detection + uses: trufflesecurity/trufflehog@4b0d468b4a67df0f6b86db2db182c992fb2cbb4e # v3.82.13 + with: + path: ./ + base: ${{ github.event.repository.default_branch }} + head: HEAD + + - name: Trivy Config Scan + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # v0.24.0 + with: + scan-type: 'config' + scan-ref: '.' + format: 'sarif' + output: 'trivy-config.sarif' + severity: 'HIGH,CRITICAL' + exit-code: '1' + + - name: Upload Trivy Results + uses: github/codeql-action/upload-sarif@b8d3b6e8af63cde30bdc382c0bc28114f4346c88 # v2 + if: always() + with: + sarif_file: 'trivy-config.sarif' + + kubernetes: + name: Kubernetes Validation + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Helm Template Validation + run: | + for chart in */helm; do + echo "Validating $chart" + helm template test ./$chart --debug + done + + - name: Kubernetes Dry-Run + run: | + for chart in */helm; do + echo "Dry-run validation: $chart" + helm template test ./$chart | kubectl apply --dry-run=server -f - + done + + - name: Kubeval Validation + uses: instrumenta/kubeval-action@831e8d7618bee0555ef06c4a7c1635c6e9130339 # v0.4.0 + with: + files: ./*/helm/templates/*.yaml + + compliance: + name: Compliance Validation + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: SOC2 Checklist 
Validation + run: | + # Check for required security controls + + # 1. NetworkPolicy exists + find . -name "networkpolicy.yaml" -o -name "network-policy.yaml" | grep -q . || { + echo "::error::Missing NetworkPolicy - SOC2 requirement" + exit 1 + } + + # 2. No hardcoded secrets (exclude comments, examples, and proper secret injection) + if grep -RInE '^[[:space:]]*[^#]*password[^:]*[:=][[:space:]]*[^[:space:]#]+' --include="*.yaml" --include="*.yml" . | grep -Ev "valueFrom|secretKeyRef|envFrom:|example|sample"; then + echo "::error::Hardcoded secrets detected - SOC2 violation" + exit 1 + fi + + # 3. TLS 1.3 enforcement (check Ingress resources specifically) + ingress_files=$(find . -type f \( -name "*.yaml" -o -name "*.yml" \) -exec grep -l "kind: *Ingress" {} \;) + if [ -z "$ingress_files" ]; then + echo "::warning::No Ingress resources found to validate TLS 1.3 enforcement" + elif ! grep -l "TLSv1.3" $ingress_files >/dev/null 2>&1; then + echo "::error::TLS 1.3 not enforced in Ingress resources - SOC2 requirement" + exit 1 + fi + + # 4. RBAC configured + find . -name "role.yaml" -o -name "rolebinding.yaml" | grep -q . || { + echo "::error::Missing RBAC - SOC2 requirement" + exit 1 + } + + - name: ISO/IEC 42001 AI Management Validation + if: contains(github.event.head_commit.message, 'ai') + run: | + # AI-specific compliance checks + + # 1. Check for AI risk assessment documentation + if [ ! -f "docs/AI_RISK_ASSESSMENT.md" ]; then + echo "::warning::Missing AI risk assessment documentation" + fi + + # 2. Check for model versioning (ensure models have version tracking) + model_matches=$(grep -r "model" --include="*.yaml" . || true) + if [ -n "$model_matches" ] && ! 
echo "$model_matches" | grep -q "version"; then + echo "::warning::AI models should have version tracking" + fi + + documentation: + name: Documentation Validation + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Check Required Files + run: | + required_files=( + "README.md" + "CHANGELOG.md" + ) + + for file in "${required_files[@]}"; do + if [ ! -f "$file" ]; then + echo "::error::Missing required file: $file" + exit 1 + fi + done + + - name: Markdown Lint + uses: nosborn/github-action-markdown-cli@9b5e871c11cc0649c5ac2526af22e23525fa344d # v3.3.0 + with: + files: . + config_file: .markdownlint.json + + - name: Version Consistency Check + run: | + # Check each Chart.yaml version is mentioned in its corresponding CHANGELOG.md + shopt -s nullglob + chart_files=( */helm/Chart.yaml ) + + if [ ${#chart_files[@]} -eq 0 ]; then + echo "No Chart.yaml files found under */helm, skipping version consistency check." + exit 0 + fi + + failed=0 + for chart_file in "${chart_files[@]}"; do + chart_dir=$(dirname "$chart_file") + service_dir=$(dirname "$chart_dir") + changelog_file="$service_dir/CHANGELOG.md" + + if [ ! -f "$changelog_file" ]; then + echo "::error::Missing CHANGELOG.md for chart at $chart_file (expected $changelog_file)" + failed=1 + continue + fi + + chart_version=$(grep "^version:" "$chart_file" | awk '{print $2}' | head -1) + if [ -z "$chart_version" ]; then + echo "::error::Unable to determine version from $chart_file" + failed=1 + continue + fi + + if ! 
grep -qi "$chart_version" "$changelog_file"; then + echo "::error::Chart version $chart_version from $chart_file not documented in $changelog_file" + failed=1 + fi + done + + if [ "$failed" -ne 0 ]; then + exit 1 + fi + + versioning: + name: WeOwnVer Validation + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Validate WeOwnVer Format + run: | + # Extract version from Chart.yaml + version=$(grep "^version:" */helm/Chart.yaml | head -1 | awk '{print $2}') + + # Validate format: SEASON.WEEK[.DAY[.VERSION]] + if ! echo "$version" | grep -Eq '^[0-9]+\.[0-9]+(\.[0-9]+)?(\.[0-9]+)?$'; then + echo "::error::Invalid WeOwnVer format: $version" + echo "Expected: SEASON.WEEK[.DAY[.VERSION]] where all components are non-negative integers" + exit 1 + fi + + # Validate season/week/day ranges + season=$(echo "$version" | cut -d. -f1) + week=$(echo "$version" | cut -d. -f2) + day=$(echo "$version" | cut -d. -f3) + + # Season must be a positive, reasonable number (1+) + if [ "$season" -lt 1 ] || [ "$season" -gt 9999 ]; then + echo "::error::Season $season is out of allowed range (1-9999)" + exit 1 + fi + + # Week must be between 1 and 17 inclusive + if [ "$week" -lt 1 ] || [ "$week" -gt 17 ]; then + echo "::error::Week $week is out of allowed range (1-17)" + exit 1 + fi + + # If a day component is present, it must be between 0 and 7 inclusive + if [ -n "$day" ]; then + if [ "$day" -lt 0 ] || [ "$day" -gt 7 ]; then + echo "::error::Day $day is out of allowed range (0-7)" + exit 1 + fi + fi + + # If a version component is present (4th digit), it must be 0 or greater + version_num=$(echo "$version" | cut -d. -f4) + if [ -n "$version_num" ]; then + if [ "$version_num" -lt 0 ]; then + echo "::error::Version $version_num is out of allowed range (0+)" + exit 1 + fi + fi + + - name: Check Version References + run: | + # Ensure all documentation references WeOwnVer + if ! 
grep -Er "WeOwnVer|#WeOwnVer" README.md CHANGELOG.md; then + echo "::warning::Documentation should reference WeOwnVer system" + fi + + summary: + name: Validation Summary + runs-on: ubuntu-latest + needs: [lint, security, kubernetes, compliance, documentation, versioning] + if: always() + steps: + - name: Generate Summary + run: | + echo "## 🎯 Validation Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Check | Status |" >> $GITHUB_STEP_SUMMARY + echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| Lint | ${{ needs.lint.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Security | ${{ needs.security.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Kubernetes | ${{ needs.kubernetes.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Compliance | ${{ needs.compliance.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Documentation | ${{ needs.documentation.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Versioning | ${{ needs.versioning.result }} |" >> $GITHUB_STEP_SUMMARY +``` + +--- + +## Configuration Files + +### .yamllint.yml +```yaml +extends: default + +rules: + line-length: + max: 120 + level: warning + indentation: + spaces: 2 + indent-sequences: true + comments: + min-spaces-from-content: 1 + truthy: + allowed-values: ['true', 'false', 'on', 'off'] +``` + +### .markdownlint.json +```json +{ + "default": true, + "MD013": false, + "MD033": false, + "MD041": false +} +``` + +--- + +## Advanced Workflows + +### Container Image Scanning + +**File**: `.github/workflows/container-scan.yml` + +```yaml +name: Container Security Scan + +on: + pull_request: + paths: + - '**/Dockerfile*' + - '**/values.yaml' + +jobs: + scan: + name: Trivy Image Scan + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Build Test Images + run: | + # Build all Dockerfiles for scanning + find . 
-name "Dockerfile*" -exec dirname {} \; | sort -u | while read dir; do + docker build -t test:latest "$dir" + trivy image --exit-code 1 --severity HIGH,CRITICAL test:latest + done +``` + +### Performance Testing + +**File**: `.github/workflows/performance.yml` + +```yaml +name: Performance Testing + +on: + pull_request: + branches: [main] + +jobs: + lighthouse: + name: Lighthouse CI + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Run Lighthouse + uses: treosh/lighthouse-ci-action@2f8dda6cf4de7d73b29853c3f29e73a01e297bd8 # v10.1.0 + with: + urls: | + https://staging.example.com + uploadArtifacts: true + temporaryPublicStorage: true +``` + +### Dependency Scanning + +**File**: `.github/workflows/dependencies.yml` + +```yaml +name: Dependency Security + +on: + schedule: + - cron: '0 0 * * 0' # Weekly + pull_request: + paths: + - '**/package*.json' + - '**/requirements.txt' + - '**/go.mod' + +jobs: + scan: + name: Dependency Audit + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bca8484df8a183e9c3623645834c2b0 # v4.1.5 + + - name: Node.js Audit + if: hashFiles('**/package-lock.json') != '' + run: | + npm audit --audit-level=high + + - name: Python Safety Check + if: hashFiles('**/requirements.txt') != '' + run: | + pip install safety==3.2.11 + safety check --json + + - name: Go Vulnerability Check + if: hashFiles('**/go.mod') != '' + run: | + go install golang.org/x/vuln/cmd/govulncheck@latest + govulncheck ./... +``` + +--- + +## Integration with Copilot + +### Copilot's Role (PR Review) +1. **Scan code** for patterns and anti-patterns +2. **Recommend fixes** with specific file locations +3. **Reference** copilot-instructions.md requirements +4. **Flag violations** with severity levels + +### CI/CD's Role (Automated Enforcement) +1. **Execute** all validation commands +2. **Enforce** quality gates (fail on HIGH/CRITICAL) +3. 
**Generate** reports and artifacts +4. **Block** merges if checks fail + +### Workflow Integration +``` +1. Developer pushes to maintenance branch +2. GitHub Actions runs validation workflows +3. GitHub Copilot reviews code patterns +4. Both provide feedback in PR comments +5. Developer fixes issues +6. Push updates trigger re-validation +7. All checks pass → Human approves → Merge +``` + +--- + +## Quality Gates + +### Blocking (Must Pass) +- ❌ Helm lint errors +- ❌ Kubernetes dry-run failures +- ❌ HIGH/CRITICAL security vulnerabilities +- ❌ Hardcoded secrets detected +- ❌ Missing NetworkPolicy +- ❌ Missing RBAC configuration +- ❌ TLS 1.3 not enforced in Ingress resources +- ❌ WeOwnVer format violations + +### Warning (Review Required) +- ⚠️ No Ingress resources found for TLS 1.3 validation +- ⚠️ Documentation gaps +- ⚠️ Performance regressions +- ⚠️ Code style violations +- ⚠️ Missing AI risk assessments + +--- + +## Monitoring & Reporting + +### GitHub Actions Dashboard +- **Status badges** in README.md +- **Workflow run history** for trend analysis +- **Artifact storage** for scan reports +- **Notification integration** (Slack, email) + +### Metrics to Track +- ✅ CI/CD success rate +- ✅ Average validation time +- ✅ Security vulnerability trends +- ✅ Code quality score over time +- ✅ Deployment frequency + +--- + +## Maintenance + +### Weekly Tasks +- Review and update workflow configurations +- Update action versions to latest +- Review security scan findings +- Optimize workflow performance + +### Monthly Tasks +- Audit quality gate effectiveness +- Review blocked PRs for patterns +- Update compliance checklists +- Performance benchmark analysis + +--- + +## Implementation Checklist + +- [ ] Create `.github/workflows/validation.yml` +- [ ] Create `.yamllint.yml` configuration +- [ ] Create `.markdownlint.json` configuration +- [ ] Enable GitHub Actions in repository settings +- [ ] Configure required status checks in branch protection +- [ ] Set up notification integrations +- [ ] Train team on workflow usage +- [ ] Document 
workflow customizations + +--- + +**Last Updated**: 2026-01-26 (v2.5.0) +**Maintained By**: Roman Di Domizio (roman@weown.email) +**Compliance**: SOC2, ISO/IEC 42001 automated validation diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..789525e --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,855 @@ +# WeOwn AI Infrastructure - GitHub Copilot Code Review Instructions + +## Repository Overview + +**Purpose**: Enterprise-grade, production-ready AI infrastructure for WeOwn's decentralized agentic ecosystem. + +**Stack**: Kubernetes-native deployments (DigitalOcean K8s 1.33.1) with Helm charts, Docker containers, and zero-trust security architecture. + +**Applications**: AnythingLLM (AI assistant), WordPress (CMS), Matomo (analytics), Vaultwarden (secrets), n8n (automation) - all with SOC2/ISO/IEC 42001 compliance. + +**Deployment Philosophy**: Self-contained Helm charts, official images (no Bitnami dependencies), enterprise security by default, cohort replication ready. + +--- + +## Critical: SOC2 Compliance Requirements + +### Trust Services Criteria - ALL REQUIRED + +#### 1. 
Security Controls (MANDATORY) +- **Access Control**: RBAC configured for all K8s namespaces, ServiceAccounts with least privilege, no cluster-admin bindings +- **Network Security**: NetworkPolicy resources REQUIRED for all deployments (deny-all ingress + explicit allow rules) +- **Authentication**: 2FA/MFA for admin access, Machine Identity for service accounts (e.g., Infisical Universal Auth) +- **Encryption in Transit**: TLS 1.3 ONLY via cert-manager/Let's Encrypt, strong cipher suites configured in Ingress annotations +- **Encryption at Rest**: Kubernetes etcd encryption enabled, PVC encryption via storage class, secrets encrypted at rest +- **Vulnerability Management**: Container images scanned for CVEs, non-root users (UID 1000+), minimal base images (Alpine/distroless) +- **Intrusion Detection**: Pod Security Standards "restricted" profile enforced, readOnlyRootFilesystem where possible +- **Secret Management**: Never use --from-literal, always use $(mktemp) for temp files, Kubernetes secrets with proper RBAC + +#### 2. Availability Controls (MANDATORY) +- **Service Level Guarantees**: Resource requests/limits defined, HPA for production workloads, PodDisruptionBudgets for critical services +- **Failover**: Multi-replica deployments for stateless workloads, StatefulSets for databases with persistent storage +- **Health Checks**: livenessProbe and readinessProbe REQUIRED for all containers, proper grace periods configured +- **Backup & Recovery**: CronJob-based backups with 30-day retention minimum, documented restore procedures, tested recovery + +#### 3. 
Processing Integrity Controls (MANDATORY) +- **Data Validation**: Input sanitization in all user-facing applications, content security policies, CSRF protection +- **Completeness**: Audit logs for all administrative actions, immutable log storage, retention policies documented +- **Accuracy**: Automated testing (unit, integration, E2E) before production deployment, validation scripts in CI/CD +- **Timeliness**: Monitoring and alerting for processing delays, SLO/SLI tracking, incident response procedures + +#### 4. Confidentiality Controls (MANDATORY) +- **Data Classification**: Secrets vs ConfigMaps properly segregated, PII identified and encrypted, data flow diagrams maintained +- **Access Restrictions**: Namespace isolation, service mesh policies (if applicable), no cross-namespace access without justification +- **Secure Transmission**: No plain HTTP, all inter-service communication over TLS, DNS over TLS where supported + +#### 5. Privacy Controls (IF APPLICABLE) +- **GDPR/CCPA**: Data minimization, right to erasure, consent management, privacy policies documented +- **Data Retention**: Automatic PVC cleanup after retention period, backup rotation policies, secure deletion procedures +- **Third-Party Sharing**: DPA agreements with cloud providers, data processing addendums, vendor risk assessments + +### SOC2 Audit Evidence Requirements +- **90-day audit logs**: Centralized logging (e.g., Elasticsearch/Loki), tamper-proof storage, compliance reports generated +- **Change management**: Git-based deployments only, PRs required for main branch, approval workflows, rollback procedures +- **Incident response**: Documented procedures, escalation paths, post-mortem reports, corrective actions tracked +- **Access reviews**: Quarterly RBAC audits, ServiceAccount cleanup, SSH key rotation, credential rotation schedules + +--- + +## Critical: ISO/IEC 42001 AI Management System Requirements + +### Annex A: AI Risk Management Controls + +#### AI System Lifecycle (ISO 5338) 
+- **Design Phase**: Impact assessments (ISO 42005), ethical considerations (ISO 24368), bias mitigation strategies +- **Development**: Model versioning, training data lineage, reproducibility requirements, validation datasets +- **Deployment**: Canary releases, A/B testing, gradual rollouts, monitoring for drift +- **Monitoring**: Performance metrics, accuracy tracking, fairness metrics, model degradation alerts +- **Retirement**: Decommissioning procedures, data retention policies, model archival + +#### AI-Specific Security Controls +- **Model Security**: Adversarial robustness testing, input validation, rate limiting on inference APIs +- **Data Governance**: Training data provenance, bias audits, data lineage tracking, GDPR compliance for training data +- **Transparency**: Model cards, explainability requirements, decision audit trails, user consent for AI processing +- **Human Oversight**: Human-in-the-loop validation, override mechanisms, escalation procedures, appeal processes + +#### Risk Assessment (ISO 23894 + ISO 31000) +- **AI Risk Sources** (Annex C): + - Data quality issues (poisoning, drift, bias) + - Model failures (overfitting, hallucinations, confidence miscalibration) + - Privacy violations (membership inference, data leakage) + - Security threats (adversarial attacks, model extraction) + - Ethical concerns (discrimination, fairness, accountability) +- **Risk Mitigation**: Document risk register, implement controls, monitor effectiveness, periodic reviews + +### ISO/IEC 42001 Documentation Requirements +- **AI Management Policy**: Defined objectives, scope, governance structure, roles/responsibilities +- **Risk Management Framework**: Risk assessment procedures, risk treatment plans, residual risk acceptance +- **Impact Assessments**: Societal impact, ethical implications, environmental considerations, stakeholder analysis +- **Performance Monitoring**: KPIs defined, dashboards implemented, periodic reviews, continuous improvement +- **Compliance 
Tracking**: Gap analysis documented, corrective actions tracked, audit readiness maintained + +### AI Governance (ISO 38500/38507) +- **Board Oversight**: AI strategy alignment, resource allocation, risk appetite definition, policy approval +- **Accountability**: Clear ownership, decision authority, escalation paths, liability allocation +- **Vendor Management**: Third-party AI services vetted, contracts reviewed, SLAs enforced, exit strategies +- **Continuous Learning**: Training programs, competency frameworks, knowledge sharing, lessons learned + +--- + +## Security Best Practices - ENFORCE STRICTLY + +### Secrets Management (CRITICAL) +```bash +# ✅ CORRECT: Use mktemp for temporary files +AUTH_FILE="$(mktemp)" +trap 'rm -f "$AUTH_FILE"' EXIT +cat > "$AUTH_FILE" << 'EOF' +clientId=VALUE +clientSecret=VALUE +EOF +kubectl create secret generic NAME --from-env-file="$AUTH_FILE" + +# ❌ WRONG: Never write secrets to a fixed path in /tmp (predictable name, world-readable by default) +cat > /tmp/secrets.env # REJECT THIS IN CODE REVIEW + +# ❌ WRONG: Never use --from-literal (shell history exposure) +kubectl create secret --from-literal=key=value # REJECT THIS +``` + +### Kubernetes RBAC (REQUIRED) +```yaml +# ✅ CORRECT: Least privilege ServiceAccount +apiVersion: v1 +kind: ServiceAccount +metadata: + name: app-sa +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: app-role +rules: +- apiGroups: [""] + resources: ["configmaps", "secrets"] + verbs: ["get", "list"] # ONLY what's needed +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: app-binding +subjects: +- kind: ServiceAccount + name: app-sa +roleRef: + kind: Role + name: app-role + apiGroup: rbac.authorization.k8s.io +``` + +### Pod Security Standards (MANDATORY) +```yaml +# ✅ CORRECT: Restricted profile +securityContext: + runAsNonRoot: true + runAsUser: 1000 # Or appropriate UID (33 for www-data, 999 for mysql) + runAsGroup: 1000 + fsGroup: 1000 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true # 
Prefer this, use false only if required + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + +# ❌ WRONG: Running as root +securityContext: + runAsUser: 0 # REJECT THIS + privileged: true # REJECT THIS +``` + +### NetworkPolicy (REQUIRED FOR ALL DEPLOYMENTS) +```yaml +# ✅ CORRECT: Deny-all ingress + explicit allow +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: app-netpol +spec: + podSelector: + matchLabels: + app: myapp + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + name: ingress-nginx + ports: + - protocol: TCP + port: 80 + egress: + - to: + - namespaceSelector: + matchLabels: + name: kube-system + ports: + - protocol: UDP + port: 53 # DNS only (UDP is the primary DNS transport) + - protocol: TCP + port: 53 # DNS only (TCP fallback for large responses) +``` + +### TLS Configuration (MANDATORY) +```yaml +# ✅ CORRECT: Strong TLS 1.3 with cipher suites +metadata: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + nginx.ingress.kubernetes.io/ssl-protocols: "TLSv1.3" + nginx.ingress.kubernetes.io/ssl-ciphers: "ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305" +``` + +--- + +## Helm Chart Best Practices - ENFORCE + +### Templating Standards +```yaml +# ✅ CORRECT: Use .Values.global.namespace pattern for consistency +namespace: {{ .Values.global.namespace | default .Release.Namespace }} + +# ✅ CORRECT: Always provide defaults and required checks +replicas: {{ .Values.replicaCount | default 1 }} +image: {{ .Values.image.repository | required "image.repository is required" }} + +# ✅ CORRECT: Conditional rendering with proper hasKey checks +{{- if .Values.mariadbOfficial }} +{{- if hasKey .Values.mariadbOfficial "enabled" }} +{{- if .Values.mariadbOfficial.enabled }} +# ... 
resource definition +{{- end }} +{{- end }} +{{- end }} + +# ❌ WRONG: No nil pointer checks +{{ .Values.optional.field }} # REJECT if .Values.optional might not exist +``` + +### Chart Structure +``` +helm/ +├── Chart.yaml # Version 2.x.x, appVersion, dependencies +├── values.yaml # All configurable values with comments +├── templates/ +│ ├── deployment.yaml +│ ├── service.yaml +│ ├── ingress.yaml +│ ├── networkpolicy.yaml # REQUIRED +│ ├── serviceaccount.yaml # REQUIRED +│ ├── role.yaml # REQUIRED if not cluster-admin +│ ├── rolebinding.yaml # REQUIRED +│ ├── secrets.yaml # NEVER hardcode values +│ ├── configmap.yaml +│ ├── _helpers.tpl # Reusable templates +│ └── tests/ +│ └── test-connection.yaml +``` + +### Testing & Validation +```bash +# ✅ ALWAYS run before committing Helm changes +helm lint ./helm +helm template test ./helm --debug +helm template test ./helm | kubectl apply --dry-run=client -f - + +# ✅ Test with different values files +helm template test ./helm -f values-staging.yaml +helm template test ./helm -f values-prod.yaml +``` + +--- + +## Docker Best Practices - ENFORCE + +### Multi-Stage Builds +```dockerfile +# ✅ CORRECT: Multi-stage with minimal final image +FROM node:20-alpine AS builder +WORKDIR /app +COPY package*.json ./ +RUN npm ci --only=production + +FROM node:20-alpine +RUN addgroup -g 1000 appuser && adduser -D -u 1000 -G appuser appuser +WORKDIR /app +COPY --from=builder --chown=appuser:appuser /app/node_modules ./node_modules +COPY --chown=appuser:appuser . . 
+USER appuser +EXPOSE 3000 +CMD ["node", "server.js"] +``` + +### Security Hardening +```dockerfile +# ✅ CORRECT: Non-root user, minimal packages, security scanning +RUN apk add --no-cache REQUIRED_PACKAGES && rm -rf /var/cache/apk/* +USER 1000:1000 +HEALTHCHECK --interval=30s --timeout=3s CMD wget --no-verbose --tries=1 --spider http://localhost:3000/health || exit 1 + +# ❌ WRONG: Running as root, unnecessary packages +USER root # REJECT THIS +RUN apt-get install -y * # REJECT excessive packages +``` + +--- + +## Testing & Validation Requirements + +### Pre-Commit Checks (MUST PASS) +```bash +# Helm validation +helm lint ./helm + +# YAML syntax +yamllint -c .yamllint.yml . + +# Security scanning +trivy image IMAGE:TAG +trivy config ./helm + +# Kubernetes manifest validation +kubectl apply --dry-run=server -f MANIFEST.yaml +``` + +### Integration Testing +```bash +# Deploy to staging namespace +helm upgrade --install test ./helm -n staging --create-namespace + +# Verify readiness +kubectl wait --for=condition=ready pod -l app=myapp -n staging --timeout=300s + +# Run smoke tests +kubectl run smoke-test --image=curlimages/curl --rm -it -- curl http://service.staging.svc.cluster.local + +# Cleanup +helm uninstall test -n staging +``` + +### Performance Testing +- Load testing with k6/Locust before production +- Resource usage monitoring (CPU, memory, disk I/O) +- Database query optimization (no N+1 queries) +- Caching strategies validated + +--- + +## Documentation Requirements + +### README.md (REQUIRED) +- Installation instructions with prerequisites +- Configuration examples with explanations +- Upgrade procedures (preserving existing config) +- Troubleshooting common issues +- Security considerations +- License and compliance information + +### CHANGELOG.md (REQUIRED - Keep a Changelog format) +```markdown +## [X.Y.Z] - YYYY-MM-DD + +### Added +- New features + +### Changed +- Modifications to existing features + +### Deprecated +- Features to be removed in future + +### Removed +- Deleted features + 
+### Fixed +- Bug fixes + +### Security +- Security patches and improvements +``` + +### Inline Documentation +- Helm templates: Comments explaining complex logic +- Scripts: Usage examples, parameter descriptions +- Configuration: Purpose of each value, valid ranges, examples + +--- + +## Version Management - #WeOwnVer Ecosystem Versioning + +**Official Specification**: See `/docs/VERSIONING_WEOWNVER.md` for complete details + +### #WeOwnVer Format: SEASON.WEEK.DAY.VERSION + +**Current Context** (date handling for reviews): +- **Today**: February 1, 2026 (Sunday, Season 2, Week 5, Day 7) +- **Season Calendar**: Season 2 (Oct 2025-Feb 2026), Season 3 (Feb-May 2026), Season 4 (Jun-Aug 2026) +- **NOTE**: WEEK values should be validated against dates provided in PR context (commit messages, file contents). Focus on enforcing SEASON.WEEK.DAY.VERSION format and internal consistency with existing versioned files. + +### Chart Version (Chart.yaml) + +**Weekly Summary Releases** (3-digit format): +```yaml +# SEASON.WEEK.0 - Week rollup, no specific day +version: 2.5.0 # Season 2, Week 5, weekly summary (current) +``` + +**Daily Releases** (4-digit format): +```yaml +# SEASON.WEEK.DAY.VERSION - Multiple releases same day +version: 2.5.7.1 # Season 2, Week 5, Sunday, 1st release +version: 2.5.7.2 # Season 2, Week 5, Sunday, 2nd release +``` + +**Version Increment Rules**: +- **New week starts** → Increment WEEK (2.5.0 → 2.6.0) +- **Same week, new day** → Increment DAY (2.5.0 → 2.5.1.1) +- **Same day, hotfix** → Increment VERSION (2.5.7.1 → 2.5.7.2) +- **New season starts** → Increment SEASON (2.x.x → 3.1.0) + +### Day Values (DAY position) +```yaml +0: Summary (week rollup, no daily) +1: Monday +2: Tuesday +3: Wednesday +4: Thursday +5: Friday +6: Saturday +7: Sunday +``` + +### Application Version (Chart.yaml) +```yaml +appVersion: "1.9.1" # Upstream application version (not #WeOwnVer) +``` + +**Sync with upstream**: Track official releases, test before upgrading, document breaking changes + +### 
Date/Time Awareness for Copilot AI + +**CRITICAL**: Always determine current date/time before version recommendations: + +1. **Use web search** to find current ISO week and date +2. **Map ISO week to Season/Week** using Season Calendar in `/docs/VERSIONING_WEOWNVER.md` +3. **Determine day number** (1-7 for Monday-Sunday; 0 is reserved for weekly summary releases) based on current day of week +4. **Recommend version** in SEASON.WEEK.DAY.VERSION format + +**Example Logic**: +``` +Current Date: Feb 1, 2026 (Sunday) +ISO Week: W05 +Season: Season 2 (Oct 2025-Feb 2026) +Day: Sunday = 7 + +For Daily Release: 2.5.7.1 (Season 2, Week 5, Day 7, Version 1) +For Weekly Summary: 2.5.0 (Season 2, Week 5, no daily - currently used in Chart.yaml) + +NOTE: Chart.yaml currently uses 2.5.0 (weekly summary format). + Use 4-digit format (2.5.7.1) only for multiple releases on same day. + WEEK methodology will be clarified in future update. +``` + +### Documentation Standards + +**CHANGELOG.md Entry Template**: +```markdown +## [2.5.7.1] - 2026-02-01 + +### Added +- Feature description + +### Changed +- Modification description +``` + +**Version References**: +- Always link to `/docs/VERSIONING_WEOWNVER.md` when documenting versioning +- Use format: "Chart Version: 2.5.0 (#WeOwnVer format)" +- NOTE: WEEK methodology will be clarified in future update + +--- + +## Breaking Changes & Migration Plans + +### When Breaking Changes Are Unavoidable +1. **Document** in CHANGELOG with "BREAKING CHANGE:" prefix +2. **Provide migration guide** with step-by-step instructions +3. **Include rollback procedure** if migration fails +4. **Test migration** in staging before production +5. **Communicate** to all stakeholders before deployment + +### Example Migration Plan +```markdown +## Migration from v2.x to v3.0 + +### Breaking Changes +- Environment slug changed from "production" to "prod" +- InfisicalSecret namespace pattern changed + +### Migration Steps +1. Export current values: `helm get values app -o yaml > values.yaml` +2. 
Update values.yaml: + - Change `envSlug: "production"` to `envSlug: "prod"` +3. Backup PVCs: `kubectl get pvc -n namespace -o yaml > pvc-backup.yaml` +4. Upgrade: `helm upgrade app ./helm -f values.yaml` +5. Verify: `kubectl get pods -n namespace` + +### Rollback Procedure +`helm rollback app [REVISION]` +``` + +--- + +## Vulnerability Screening - ENFORCE + +### Container Image Scanning +```bash +# ✅ Run before every deployment +trivy image --severity HIGH,CRITICAL <image>:<tag> + +# ✅ Fail CI/CD if HIGH/CRITICAL vulnerabilities found +trivy image --exit-code 1 --severity HIGH,CRITICAL <image>:<tag> +``` + +### Dependency Scanning +```bash +# Node.js +npm audit --audit-level=high + +# Python +safety check --json + +# Go +govulncheck ./... +``` + +### Kubernetes Configuration Scanning +```bash +# ✅ Scan Helm charts +trivy config ./helm + +# ✅ Check for misconfigurations +kube-bench run --targets master,node +``` + +--- + +## DevOps & CI/CD Best Practices + +### GitOps Workflow +1. **Feature Branch**: All changes in branches (feature/*, fix/*, docs/*) +2. **Pull Request**: Required for main branch, CI/CD runs automatically +3. **Code Review**: Copilot AI + human approval required +4. **Merge to Main**: Triggers production deployment pipeline +5. **Tag Release**: Create Git tag for version tracking + +### CI/CD Pipeline Stages +1. **Lint**: YAML, Helm, shell scripts, Dockerfiles +2. **Security Scan**: Trivy, container scanning, secret detection +3. **Build**: Docker images with SHA tags +4. **Test**: Unit, integration, E2E tests +5. **Staging Deploy**: Automated deployment to staging +6.
**Production Deploy**: Manual approval required + +### Deployment Strategy +```yaml +# ✅ CORRECT: Rolling update with surge +strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 # Zero-downtime deployments +``` + +--- + +## Kubernetes-Specific Best Practices + +### Resource Management +```yaml +# ✅ CORRECT: Always define requests and limits +resources: + requests: + cpu: 100m # Guaranteed CPU + memory: 256Mi # Guaranteed memory + limits: + cpu: 500m # Maximum CPU + memory: 1Gi # Maximum memory +``` + +### Probes Configuration +```yaml +# ✅ CORRECT: Proper health checks +livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + +readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 2 +``` + +### Labels & Annotations (STANDARDIZE) +```yaml +# ✅ CORRECT: Consistent labeling +metadata: + labels: + app.kubernetes.io/name: myapp + app.kubernetes.io/instance: prod + app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/component: web + app.kubernetes.io/part-of: weown-ai + app.kubernetes.io/managed-by: Helm +``` + +--- + +## Code Review Checklist - Copilot AI MUST VALIDATE ALL + +### Security Review +- [ ] No hardcoded credentials, API keys, or tokens +- [ ] Secrets use Kubernetes secrets or external vault (Infisical) +- [ ] TLS 1.3 enforced, strong cipher suites configured +- [ ] Pod Security Standards "restricted" profile applied +- [ ] NetworkPolicy defined with deny-all + explicit allow +- [ ] RBAC configured with least privilege +- [ ] Container runs as non-root user (UID 1000+) +- [ ] readOnlyRootFilesystem enabled where possible +- [ ] All capabilities dropped except required ones +- [ ] No privileged containers or hostPath volumes + +### Compliance Review (SOC2/ISO/IEC 42001) +- [ ] Audit logging enabled and retention configured +- [ ] Backup procedures documented 
and tested +- [ ] Data encryption at rest and in transit +- [ ] Access control policies defined +- [ ] Incident response procedures documented +- [ ] Change management process followed +- [ ] AI risk assessment completed (if applicable) +- [ ] Privacy impact assessment (if PII processed) +- [ ] Third-party vendor risk assessment +- [ ] Compliance evidence collected and stored + +### Code Quality Review +- [ ] Code follows project style guide +- [ ] No linter warnings or errors +- [ ] Functions have clear purpose and single responsibility +- [ ] Error handling comprehensive and logged +- [ ] No commented-out code blocks +- [ ] Dependencies up-to-date and vulnerability-free +- [ ] Performance implications assessed +- [ ] Resource usage optimized + +### Testing Review +- [ ] Unit tests written for new code (>80% coverage) +- [ ] Integration tests pass +- [ ] E2E tests pass in staging +- [ ] Load testing performed for high-traffic endpoints +- [ ] Security testing (OWASP Top 10) completed +- [ ] Regression testing confirms no breaks + +### Documentation Review +- [ ] README updated with new features/changes +- [ ] CHANGELOG entry added (Keep a Changelog format) +- [ ] API documentation updated +- [ ] Inline code comments explain complex logic +- [ ] Architecture decision records (ADRs) created +- [ ] Migration guide provided (if breaking changes) + +### Infrastructure Review (Helm/K8s/Docker) +- [ ] Helm chart lints successfully +- [ ] helm template renders correctly +- [ ] kubectl apply --dry-run validates +- [ ] Resource requests/limits defined +- [ ] Health checks (liveness/readiness) configured +- [ ] Labels and annotations consistent +- [ ] Dockerfile uses multi-stage builds +- [ ] Base images minimal and security-scanned +- [ ] Image tags specific (not "latest") + +### Versioning Review +- [ ] Chart version incremented (#WeOwnVer format) +- [ ] appVersion updated if upstream changed +- [ ] Git tags created for releases +- [ ] Breaking changes documented +- [ ] 
Migration plan provided (if needed) +- [ ] Rollback procedure tested + +--- + +## Common Pitfalls - REJECT IN CODE REVIEW + +### ❌ Security Anti-Patterns +```yaml +# REJECT: Hardcoded secrets +env: +- name: API_KEY + value: "sk-1234567890" # NEVER do this + +# REJECT: Running as root +securityContext: + runAsUser: 0 + privileged: true + +# REJECT: No NetworkPolicy +# Missing networkpolicy.yaml file + +# REJECT: Weak TLS +annotations: + nginx.ingress.kubernetes.io/ssl-protocols: "TLSv1.0 TLSv1.1" +``` + +### ❌ Configuration Anti-Patterns +```yaml +# REJECT: Missing resource limits +resources: {} # Always define requests/limits + +# REJECT: No health checks +# Missing livenessProbe and readinessProbe + +# REJECT: Using "latest" tag +image: myapp:latest # Always use specific versions +``` + +### ❌ Helm Anti-Patterns +```yaml +# REJECT: No defaults +value: {{ .Values.required }} # Use "default" or "required" + +# REJECT: Hardcoded namespaces +namespace: production # Use templating + +# REJECT: No nil checks +{{ .Values.optional.nested.field }} # Check hasKey first +``` + +--- + +## Emergency Procedures + +### Security Incident Response +1. **Immediate**: Isolate affected pods (`kubectl scale deployment --replicas=0`) +2. **Investigate**: Collect logs (`kubectl logs`, `kubectl describe`) +3. **Rotate**: All potentially compromised secrets +4. **Patch**: Apply security fixes +5. **Document**: Post-mortem report, lessons learned +6. 
**Communicate**: Stakeholders, affected users, compliance team + +### Production Rollback +```bash +# Check revision history +helm history <release> -n <namespace> + +# Rollback to previous version +helm rollback <release> -n <namespace> + +# Verify rollback +kubectl get pods -n <namespace> +kubectl logs -n <namespace> deployment/<deployment> +``` + +--- + +## Additional Resources + +### WeOwn-Specific Guidelines +- **Namespace Naming**: `<app>-<tenant>` (e.g., `wordpress-romandid`) +- **Storage**: DigitalOcean block storage, ReadWriteOnce access mode +- **Networking**: NGINX Ingress controller in `ingress-nginx` namespace +- **Certificates**: cert-manager with Let's Encrypt prod issuer +- **Secrets**: Infisical integration for production, Kubernetes secrets for staging + +### External Standards +- **SOC2**: AICPA Trust Services Criteria +- **ISO/IEC 42001**: AI Management System +- **ISO/IEC 27001**: Information Security +- **CIS Kubernetes Benchmark**: Security hardening +- **NIST Cybersecurity Framework**: Risk management + +--- + +## Copilot AI Review Enforcement + +### Copilot Capabilities & Limitations + +**What GitHub Copilot CAN Do** (Static Analysis): +- ✅ Scan code for security anti-patterns (hardcoded secrets, weak TLS, root users) +- ✅ Validate YAML/JSON/code syntax +- ✅ Detect missing files (NetworkPolicy, RBAC, secrets) +- ✅ Check documentation completeness +- ✅ Verify naming conventions and style +- ✅ Identify configuration violations +- ✅ Recommend specific fixes with file locations + +**What GitHub Copilot CANNOT Do** (Dynamic Execution): +- ❌ Execute shell commands (`helm lint`, `kubectl apply --dry-run`) +- ❌ Run vulnerability scanners (`trivy image`, `trivy config`) +- ❌ Execute test suites (unit, integration, E2E) +- ❌ Deploy to Kubernetes clusters +- ❌ Build Docker images +- ❌ Perform performance testing + +### CI/CD Integration Required + +**For command execution and automated enforcement**, see `.github/CI_CD_WORKFLOWS.md`: +- Automated validation workflows (lint, security, K8s validation) +- Quality gates and blocking checks +-
Compliance automation (SOC2, ISO/IEC 42001) +- Performance and dependency scanning + +### Review Process + +**Copilot's Role**: +1. **Scan** all code changes against this instruction file +2. **Identify** violations with severity (CRITICAL, HIGH, MEDIUM, LOW) +3. **Recommend** specific fixes with file paths and line numbers +4. **Reference** relevant sections from this file +5. **Suggest** CI/CD workflow additions if needed + +**User's Role**: +1. **Review** Copilot comments and recommendations +2. **Execute** validation commands locally (helm lint, kubectl dry-run) +3. **Run** security scans (trivy) before pushing +4. **Complete** human-in-the-loop checklist in PR +5. **Verify** CI/CD workflows pass before merge + +**Rejection Criteria**: Any violation of MANDATORY requirements (marked with REQUIRED, CRITICAL, ENFORCE) must result in code review failure with specific remediation steps. + +**Approval Criteria**: +- ✅ All Copilot static analysis checks passed +- ✅ All CI/CD workflows succeeded +- ✅ Documentation complete +- ✅ Security validated +- ✅ Compliance confirmed +- ✅ Human-in-the-loop checklist completed + +**Final Human Validation**: Human-in-the-loop review checklist in auto-generated PR body must be completed before merge. 
+ +--- + +**Last Updated**: 2026-01-26 (v2.5.0) +**Maintained By**: Roman Di Domizio (roman@weown.email) +**Compliance Standards**: SOC2 Type II, ISO/IEC 42001:2023 diff --git a/.github/workflows/auto-pr-maintenance.yml b/.github/workflows/auto-pr-maintenance.yml deleted file mode 100644 index 7036b7e..0000000 --- a/.github/workflows/auto-pr-maintenance.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Auto-Create PR from Maintenance - -on: - push: - branches: - - maintenance - -permissions: - contents: read - pull-requests: write - -jobs: - create-pr: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Create Pull Request - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Check if PR already exists - existing_pr=$(gh pr list --base main --head maintenance --json number --jq '.[0].number') - - if [ -n "$existing_pr" ]; then - echo "PR #$existing_pr already exists, skipping creation" - exit 0 - fi - - # Create PR body file to safely handle special characters - cat > /tmp/pr-body.txt << 'PRBODY' - 🤖 Automated Pull Request - - This PR was automatically created when changes were pushed to the maintenance branch. - - Please review and merge when ready. 
- PRBODY - - # Create new PR - gh pr create \ - --base main \ - --head maintenance \ - --title "Auto-PR: Merge maintenance → main" \ - --body-file /tmp/pr-body.txt - - # Cleanup - rm -f /tmp/pr-body.txt diff --git a/.github/workflows/auto-pr-to-main.yml b/.github/workflows/auto-pr-to-main.yml new file mode 100644 index 0000000..3daf65e --- /dev/null +++ b/.github/workflows/auto-pr-to-main.yml @@ -0,0 +1,158 @@ +name: Auto-Create PR to Main + +on: + push: + branches: + - 'maintenance' + - 'feature/*' + - 'fix/*' + - 'docs/*' + - 'hotfix/*' + - '!main' + - '!experimental/*' + +permissions: + contents: read + pull-requests: write + +jobs: + create-pr: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App Token + id: generate-token + uses: actions/create-github-app-token@d72941d797fd3113feb6b93fd0dec494b13a2547 # v1 + with: + app-id: ${{ secrets.APP_ID }} + private-key: ${{ secrets.APP_PRIVATE_KEY }} + owner: ${{ github.repository_owner }} + repositories: ${{ github.event.repository.name }} + + - name: Checkout repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + fetch-depth: 0 + token: ${{ steps.generate-token.outputs.token }} + + - name: Create Pull Request + id: create-pr + env: + GITHUB_TOKEN: ${{ steps.generate-token.outputs.token }} + BRANCH_NAME: ${{ github.ref_name }} + run: | + # BRANCH_NAME comes from the step env; never inline github.ref_name into this script (branch names are attacker-controlled and would allow shell injection) + + # Check if PR already exists + existing_pr=$(gh pr list --base main --head "$BRANCH_NAME" --json number --jq '.[0].number') + + if [ -n "$existing_pr" ]; then + echo "PR #$existing_pr already exists, new commits will be added automatically" + echo "pr_number=$existing_pr" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Create PR body and title files with cleanup trap + # Set trap first to ensure cleanup even if second mktemp fails + trap 'rm -f "$PR_BODY" "$PR_TITLE"' EXIT + PR_BODY=$(mktemp) + PR_TITLE=$(mktemp) + + # Generate dynamic title from the most recent commit unique to this branch (relative to main when available) + TARGET_BRANCH="main" +
if git rev-parse --verify "origin/$TARGET_BRANCH" >/dev/null 2>&1; then + FIRST_COMMIT=$(git log --format=%s -1 "$BRANCH_NAME" ^"origin/$TARGET_BRANCH") + else + FIRST_COMMIT=$(git log --format=%s -1 "$BRANCH_NAME") + fi + + # Fallback if no unique commits are found or subject is empty + if [ -z "$FIRST_COMMIT" ]; then + # Determine commit count compared to target branch when possible + if git rev-parse --verify "origin/$TARGET_BRANCH" >/dev/null 2>&1; then + COMMIT_COUNT=$(git rev-list --count "$BRANCH_NAME" ^"origin/$TARGET_BRANCH" 2>/dev/null || echo "") + else + COMMIT_COUNT=$(git rev-list --count "$BRANCH_NAME" 2>/dev/null || echo "") + fi + + # Use latest commit message on the branch as an additional hint + LATEST_SUBJECT=$(git log --format=%s -1 "$BRANCH_NAME" 2>/dev/null || echo "") + + if [ -n "$LATEST_SUBJECT" ]; then + FIRST_COMMIT="Merge $BRANCH_NAME into $TARGET_BRANCH - $LATEST_SUBJECT" + elif [ -n "$COMMIT_COUNT" ]; then + FIRST_COMMIT="Merge $BRANCH_NAME into $TARGET_BRANCH ($COMMIT_COUNT commits)" + else + FIRST_COMMIT="Merge $BRANCH_NAME into $TARGET_BRANCH" + fi + fi + + # Create title: "Auto-PR: <commit subject>" + echo "Auto-PR: $FIRST_COMMIT" > "$PR_TITLE" + + { + echo "🤖 Automated Pull Request" + echo "" + echo "## 📋 Human-in-the-Loop Review Checklist" + echo "" + echo "**Review the following before approving this PR:**" + echo "" + echo "### Security & Compliance" + echo "- [ ] All GitHub Copilot AI code review comments addressed" + echo "- [ ] SOC2/ISO/IEC 42001 compliance requirements validated" + echo "- [ ] Security best practices followed (no hardcoded secrets, proper RBAC, etc.)" + echo "- [ ] No sensitive data in commits" + echo "- [ ] TLS 1.3 configured where applicable" + echo "" + echo "### Code Quality & Testing" + echo "- [ ] Code follows established conventions and style guides" + echo "- [ ] All automated tests passing" + echo "- [ ] No breaking changes (or migration plan documented)" + echo "- [ ] Performance implications assessed" + echo "-
[ ] Error handling adequate" + echo "" + echo "### Documentation & Versioning" + echo "- [ ] Documentation updated (README, CHANGELOG, inline comments)" + echo "- [ ] Version numbers incremented appropriately" + echo "- [ ] API changes documented" + echo "- [ ] Architecture Decision Records (ADRs) created if applicable" + echo "" + echo "### Infrastructure & DevOps" + echo "- [ ] Helm chart best practices followed" + echo "- [ ] Kubernetes manifests validated (helm lint, kubectl dry-run)" + echo "- [ ] Docker best practices followed (multi-stage builds, security)" + echo "- [ ] Resource limits and requests configured" + echo "- [ ] Deployment tested in staging" + echo "" + echo "## 📝 Recent Commits" + echo "" + # Show last 5 commits on current branch (handle missing main branch) + if git rev-parse --verify origin/main >/dev/null 2>&1; then + git log --oneline --no-decorate -5 "$BRANCH_NAME" ^origin/main + else + git log --oneline --no-decorate -5 "$BRANCH_NAME" + fi + echo "" + echo "---" + echo "" + echo "**🔍 Copilot AI Review**: Automated compliance and security validation will run on this PR." + echo "" + echo "**📚 Guidelines**: See \`.github/copilot-instructions.md\` for complete review criteria."
+ echo "" + echo "**Auto-generated by** \`.github/workflows/auto-pr-to-main.yml\`" + } > "$PR_BODY" + + # Create PR with dynamic title and body + pr_url=$(gh pr create \ + --base main \ + --head "$BRANCH_NAME" \ + --title "$(cat "$PR_TITLE")" \ + --body-file "$PR_BODY") + + # Extract PR number from URL + pr_number=$(echo "$pr_url" | grep -oE '[0-9]+$') + echo "pr_number=$pr_number" >> "$GITHUB_OUTPUT" + echo "Created PR #$pr_number" + echo "Note: Copilot auto-review will be triggered by Repository Ruleset" + + # Note: cleanup of PR_BODY and PR_TITLE temp files is handled by the 'trap ... EXIT' registered earlier in this step + # Trap is set before mktemp calls to ensure cleanup even if subsequent operations fail diff --git a/.gitignore b/.gitignore index 2e5aa57..744f3bd 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,9 @@ Icon? servers/ knowledge-cache/ +# Windsurf/AI Assistant workspace rules (local only) +.windsurf/rules/ + # Development artifacts getMessage **/getMessage @@ -103,3 +106,6 @@ helm/Chart.lock # Temporary Helm values values-*.yaml.tmp custom-values-*.yaml + +# Windsurf workspace rules (local configuration, never commit) +.windsurf/rules/ diff --git a/anythingllm/CHANGELOG.md b/anythingllm/CHANGELOG.md index ca3f69a..5f761e7 100644 --- a/anythingllm/CHANGELOG.md +++ b/anythingllm/CHANGELOG.md @@ -3,9 +3,18 @@ All notable changes to the AnythingLLM Kubernetes deployment will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +and this project adheres to [#WeOwnVer](/docs/VERSIONING_WEOWNVER.md) (Season.Week.Day.Version).
-## [2.1.0] - 2026-01-25 +## [2.5.0] - 2026-01-26 + +### Changed - Versioning System +- **Adopted #WeOwnVer**: Transitioned from Semantic Versioning to WeOwn ecosystem versioning +- **Version Format**: SEASON.WEEK.DAY.VERSION (2.5.0 = Season 2, Week 5, summary) +- **Documentation**: Added reference to `/docs/VERSIONING_WEOWNVER.md` for versioning standards +- **Chart Version**: Updated to align with WeOwn ecosystem rhythm (Season 2, Week 5) +- **Migration Note**: `2.5.0` is the direct successor to `2.1.0`. Versions before `2.5.0` used SemVer, and versions `2.5.0` and later use #WeOwnVer, so version numbers across this change are not directly comparable by their numeric components. + +## [2.1.0] - 2026-01-25 (Legacy SemVer) ### Added - Enterprise Secrets Management (Infisical Integration) diff --git a/anythingllm/README.md b/anythingllm/README.md index 09a757b..385a4c7 100644 --- a/anythingllm/README.md +++ b/anythingllm/README.md @@ -272,6 +272,22 @@ Agent skills can execute code on your system. The default setting (`"1"`) only a To disable completely, remove the `COMMUNITY_HUB_BUNDLE_DOWNLOADS_ENABLED` variable from values.yaml. +### ⚙️ Helm Value Management + +For comprehensive guidance on safely updating configuration values in production: + +**📖 See: [`/docs/HELM_VALUE_MANAGEMENT.md`](/docs/HELM_VALUE_MANAGEMENT.md)** + +This guide covers: +- ✅ **Safe upgrade strategies** (`--reuse-values` vs `--reset-values` vs `--values`) +- ✅ **Live deployment updates** without downtime +- ✅ **Common pitfalls** and how to avoid them (database connection failures, lost configuration) +- ✅ **GUI tools** (Lens, Portainer) and their limitations +- ✅ **Deploy script integration** for secure value updates +- ✅ **Emergency recovery** procedures + +**Critical Rule:** Always use `--reuse-values` with stateful applications (AnythingLLM, WordPress, Matomo). Never use `--reset-values` as it regenerates all values including passwords, breaking database connections. 
+ ### 🔑 API Key Management & Rotation #### Manual Secret Management (Current Process) @@ -589,9 +605,10 @@ The script generates secure admin credentials for: ### 🔄 **Updates & Maintenance** #### **Version Information** -- **Current Version**: 1.9.1 (January 2026) -- **Chart Version**: 2.0.7 -- **Image**: `mintplexlabs/anythingllm:1.9.1` +- **Current Version**: 1.10.0 (January 2026) +- **Chart Version**: 2.5.0 (#WeOwnVer: Season 2, Week 5) +- **Versioning System**: [#WeOwnVer](/docs/VERSIONING_WEOWNVER.md) (Season.Week.Day.Version) +- **Image**: `mintplexlabs/anythingllm:1.10.0` - **Update Strategy**: Rolling updates with zero downtime #### **Manual Upgrade Commands** diff --git a/anythingllm/docs/INFISICAL_INTEGRATION.md b/anythingllm/docs/INFISICAL_INTEGRATION.md index f375968..4237902 100644 --- a/anythingllm/docs/INFISICAL_INTEGRATION.md +++ b/anythingllm/docs/INFISICAL_INTEGRATION.md @@ -614,5 +614,7 @@ infisical: --- **Last Updated**: January 2026 -**Version**: 2.1.0 +**Version**: 2.5.0 (#WeOwnVer format) **Maintainer**: WeOwn Development Team + +**Note**: This document follows the #WeOwnVer versioning system. See `/docs/VERSIONING_WEOWNVER.md` for details. The exact WEEK value methodology will be clarified in a future update. 
diff --git a/anythingllm/helm/Chart.yaml b/anythingllm/helm/Chart.yaml index 11d964a..17ea691 100644 --- a/anythingllm/helm/Chart.yaml +++ b/anythingllm/helm/Chart.yaml @@ -8,11 +8,12 @@ description: | type: application # Chart version - increment when making changes to chart templates -# Follows Semantic Versioning (https://semver.org/) -version: 2.1.0 +# Follows #WeOwnVer (Season.Week.Day.Version) - see /docs/VERSIONING_WEOWNVER.md +# Current: Season 2, Week 5 (Jan 26-Feb 1, 2026), Day 7 (Sunday, Feb 1) +version: 2.5.0 # Application version - AnythingLLM version being deployed -appVersion: "1.9.1" +appVersion: "1.10.0" # Keywords for chart discovery keywords: diff --git a/anythingllm/helm/values.yaml b/anythingllm/helm/values.yaml index 8591d70..7df5831 100644 --- a/anythingllm/helm/values.yaml +++ b/anythingllm/helm/values.yaml @@ -1,6 +1,6 @@ -# values.yaml for AnythingLLM - WeOwn Enterprise Security MVP-0.2 +# values.yaml for AnythingLLM # SECURITY NOTE: No secrets should be stored in this file! -# All sensitive values are injected from Kubernetes secrets. +# All sensitive values are injected from Infisical and Kubernetes secrets. # Enterprise-grade security: Argon2id hashing, rate limiting, security headers, zero-trust networking # Global configuration diff --git a/docs/HELM_VALUE_MANAGEMENT.md b/docs/HELM_VALUE_MANAGEMENT.md new file mode 100644 index 0000000..1aa95e5 --- /dev/null +++ b/docs/HELM_VALUE_MANAGEMENT.md @@ -0,0 +1,719 @@ +# Helm Value Management & Safe Upgrade Strategies + +**Version**: 2.5.0 +**Last Updated**: January 26, 2026 +**Applies To**: WordPress, Matomo, AnythingLLM, n8n, Vaultwarden, Nextcloud + +--- + +## 🚨 Critical Warning + +**NEVER use `--reset-values` with stateful applications!** This regenerates ALL values including passwords, breaking database connections and losing all configuration. 
+ +--- + +## Understanding Helm Value Precedence + +### `--reuse-values` (✅ Recommended for Stateful Apps) + +```bash +helm upgrade myapp ./chart --reuse-values +``` + +**Behavior:** +- Keeps ALL existing values from previous deployment +- Does NOT pick up new or changed chart defaults; values introduced by a chart update must be supplied explicitly with `--set`/`--values` +- Preserves passwords, domains, secrets, and all configuration + +**Use Cases:** +- ✅ WordPress, Matomo, AnythingLLM (any app with databases) +- ✅ When you want to change 1-2 specific values +- ✅ Production upgrades where safety is critical + +**Advantages:** +- Zero risk of password regeneration +- Database connections remain intact +- Configuration persists across upgrades + +**Disadvantages:** +- May miss important chart default changes +- Requires explicit `--set` flags for new values + +**Example:** +```bash +# Safe upgrade with single non-sensitive value change (OK to use --set) +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --set ingress.domain="newdomain.com" + +# Safe upgrade updating secrets via a temporary values file (recommended) +SECRET_VALUES=$(mktemp) +trap 'rm -f "$SECRET_VALUES"' EXIT +cat > "$SECRET_VALUES" << 'EOF' +anythingllm: + openRouterKey: "new-key" + jwtSecret: "new-jwt" +EOF + +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --values "$SECRET_VALUES" +``` + +--- + +### `--reset-values` (❌ Dangerous for Stateful Apps) + +```bash +helm upgrade myapp ./chart --reset-values +``` + +**Behavior:** +- ❌ **DISCARDS ALL existing values** +- Regenerates everything from chart defaults +- Creates NEW random passwords for placeholders + +**Use Cases:** +- Only for complete redeployment +- Only for stateless applications with no persistent data +- When you explicitly want to wipe configuration + +**Dangers:** +- ⚠️ **Database connection failures** - MariaDB has old password, app gets new password +- ⚠️ **Lost configuration** - domains, emails, API keys all regenerated +- ⚠️ **Downtime** -
requires manual secret patching to recover + +**The Incident (WordPress application version 3.2.5):** +``` +1. Deployed WordPress → Password: WUOgATZwjcTICvkoBhoO7cd3W +2. Upgraded with --reset-values → NEW password generated +3. MariaDB PVC still has OLD password (persistent data) +4. WordPress tries to connect with NEW password → Access denied +5. Site shows: "Error establishing a database connection" +``` + +**Never Use With:** +- WordPress, Matomo, AnythingLLM, Nextcloud (databases) +- n8n, Vaultwarden (persistent storage) +- Any app with StatefulSets or PVCs + +--- + +### `--values` (Merge Strategy) + +```bash +helm upgrade myapp ./chart --values custom-values.yaml +``` + +**Behavior:** +- Merges your values file with chart defaults +- Chart defaults take precedence for unspecified values +- Predictable, version-controlled configuration + +**Use Cases:** +- When you maintain a complete values file +- GitOps workflows with values in version control +- Multi-environment deployments (staging, production) + +**Advantages:** +- Version-controlled configuration +- Repeatable deployments +- Easy to review changes (git diff) + +**Disadvantages:** +- Must keep values file in sync with chart updates +- Requires maintaining separate values file per deployment + +**Example:** +```bash +# Extract current values +helm get values anythingllm -n anything-llm > anythingllm-values.yaml + +# Modify the file +vim anythingllm-values.yaml + +# Apply changes +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --values anythingllm-values.yaml +``` + +--- + +### Best Practice: Extract → Modify → Apply + +**Recommended workflow for safe upgrades:** + +```bash +# 0. Create a secure temporary file and ensure it is cleaned up +if ! VALUES_FILE="$(mktemp --suffix=.yaml)"; then + echo "Error: Failed to create temporary values file" >&2 + exit 1 +fi +trap 'rm -f "$VALUES_FILE"' EXIT + +# 1. 
Extract current values +helm get values anythingllm -n anything-llm > "$VALUES_FILE" + +# 2. Review and modify +cat "$VALUES_FILE" +# Edit only what you need to change, e.g.: +# "${EDITOR:-nano}" "$VALUES_FILE" + +# 3. Apply with layered approach +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --values "$VALUES_FILE" +``` + +**Why this works:** +- `--reuse-values` preserves all existing values +- `--values` overlays your specific changes +- Zero risk of losing critical configuration + +--- + +## Live Deployment Value Updates + +### Method 1: Helm Upgrade with Values File (✅ Recommended for Secrets) + +```bash +# Non-sensitive values can use --set +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --set ingress.domain="newdomain.com" + +# Secrets should use temporary values file to avoid shell history exposure +SECRET_VALUES=$(mktemp) +trap 'rm -f "$SECRET_VALUES"' EXIT +cat > "$SECRET_VALUES" << EOF +anythingllm: + openRouterKey: "sk-or-v1-xxx" + jwtSecret: "$(openssl rand -hex 32)" +EOF + +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --values "$SECRET_VALUES" +``` + +**Advantages:** +- ✅ **Persistent** - Changes saved in Helm release +- ✅ **Survives pod restarts** and cluster maintenance +- ✅ **Audit trail** in Helm history +- ✅ **Rollback capable** with `helm rollback` + +**Disadvantages:** +- Requires helm command access +- Values visible in shell history (use temp files for secrets) + +--- + +### Method 2: kubectl patch Secret (⚠️ Temporary) + +```bash +# Base64 encode your new value +NEW_VALUE=$(echo -n "new-api-key" | base64) + +# Patch the secret +kubectl patch secret anythingllm-secrets \ + -n anything-llm \ + --type='json' \ + -p='[{"op":"replace","path":"/data/OPENROUTER_API_KEY","value":"'$NEW_VALUE'"}]' + +# Restart pods to pick up change +kubectl rollout restart deployment anythingllm -n anything-llm +``` + +**Advantages:** +- ✅ **Fast** - 
Immediate change without helm upgrade +- ✅ **No helm required** - Works with kubectl only + +**Disadvantages:** +- ❌ **NOT persistent** - Next helm upgrade overwrites +- ❌ **Manual pod restart** required +- ❌ **No audit trail** in Helm history +- ❌ **Not recommended for production** + +--- + +### Method 3: AnythingLLM UI (❌ Not Persistent) + +**Location:** AnythingLLM UI → Settings → LLM Preferences → API Keys + +**Problems:** +- ❌ Changes stored in SQLite database, NOT Kubernetes secrets +- ❌ **Lost on pod restart** unless using persistent volume +- ❌ **Not synchronized** with Helm values +- ❌ **Not recommended** for production + +**When to use:** +- Testing API keys before committing to Helm +- Temporary configuration changes +- Non-production environments + +--- + +### Method 4: GUI Tools (Lens, Portainer, k9s) + +#### Lens Desktop (Best GUI Option) + +```bash +# Install from: https://k8slens.dev + +# Workflow: +1. Connect to cluster +2. Navigate: Workloads → Secrets → anythingllm-secrets +3. Click "Edit" → Modify values +4. Navigate: Workloads → Deployments → anythingllm +5. Click "Restart" to apply changes +``` + +**Advantages:** +- ✅ **User-friendly GUI** for Kubernetes management +- ✅ **Real-time validation** of YAML/JSON +- ✅ **Visual diff** of changes + +**Disadvantages:** +- ❌ **Still not persistent** - Helm will overwrite on next upgrade +- ❌ Desktop application required + +#### Portainer (Web UI) + +**Location:** Already deployed in monitoring stack + +```bash +# Workflow: +1. Navigate to: https://portainer.{CLUSTER_DOMAIN} +2. Go to: Kubernetes → Secrets → anythingllm-secrets +3. Click "Edit" → Modify values +4. Go to: Kubernetes → Deployments → anythingllm +5. 
Click "Redeploy" to apply changes +``` + +**Same persistence limitations as Lens** + +--- + +### Method 5: Deploy Script Integration (✅ Best for Production) + +**Production-tested implementation:** + +The AnythingLLM `deploy.sh` script includes a complete, secure configuration update feature with proper temporary file cleanup and error handling. + +**Usage:** +```bash +cd /path/to/anythingllm +./deploy.sh +# Select existing deployment → Choose configuration update option +``` + +**Features implemented in deploy.sh:** +- Secure temporary file creation with `mktemp` +- Proper trap cleanup for ALL temporary files +- Interactive menu for quick updates (API keys, JWT secrets, etc.) +- Full values file editing with `$EDITOR` +- Error handling and rollback on failure +- `--reuse-values` to preserve existing configuration + +**See:** `/anythingllm/deploy.sh` for the complete, production-ready implementation with all security best practices applied. + +--- + +## Comparison Matrix + +| Method | Persistent | GUI | Fast | Prod-Safe | Audit Trail | +|--------|-----------|-----|------|-----------|-------------| +| **helm upgrade --reuse-values --set** | ✅ | ❌ | ⚠️ | ✅ | ✅ | +| **helm upgrade --values** | ✅ | ❌ | ⚠️ | ✅ | ✅ | +| **kubectl patch secret** | ❌ | ❌ | ✅ | ❌ | ❌ | +| **AnythingLLM UI** | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Lens/Portainer GUI** | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Deploy script function** | ✅ | ✅ | ✅ | ✅ | ✅ | + +--- + +## Common Scenarios + +### Scenario 1: Update API Key Only + +```bash +# Recommended: Helm upgrade with temporary values file (avoids shell history exposure) +SECRET_VALUES=$(mktemp) +trap 'rm -f "$SECRET_VALUES"' EXIT +cat > "$SECRET_VALUES" << 'EOF' +anythingllm: + openRouterKey: "sk-or-v1-new-key" +EOF + +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --values "$SECRET_VALUES" +``` + +### Scenario 2: Rotate JWT Secret + +```bash +# Generate new secret and apply via temporary values file +SECRET_VALUES=$(mktemp) +trap 
'rm -f "$SECRET_VALUES"' EXIT +cat > "$SECRET_VALUES" << EOF +anythingllm: + jwtSecret: "$(openssl rand -hex 32)" +EOF + +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --values "$SECRET_VALUES" + +# Note: All users will be logged out (expected behavior) +``` + +### Scenario 3: Change Domain + +```bash +# Update domain and regenerate TLS certificate +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --set ingress.domain="newdomain.com" + +# Update DNS A record to point to cluster external IP +kubectl get svc -n ingress-nginx ingress-nginx-controller -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +### Scenario 4: Upgrade Chart Version + +```bash +# Extract current values first into a secure temporary file +BACKUP_FILE="$(mktemp "${TMPDIR:-/tmp}/anythingllm-backup-XXXXXX.yaml")" +helm get values anythingllm -n anything-llm > "$BACKUP_FILE" +echo "Backup saved to: $BACKUP_FILE" + +# Upgrade chart with reused values +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values + +# If something breaks, rollback +helm rollback anythingllm -n anything-llm +``` + +### Scenario 5: Bulk Configuration Changes + +```bash +# Create a secure temporary file for current values +TMP_VALUES_FILE="$(mktemp)" +trap 'rm -f "$TMP_VALUES_FILE"' EXIT + +# Extract current values +helm get values anythingllm -n anything-llm > "${TMP_VALUES_FILE}" + +# Edit multiple values +"${EDITOR:-vim}" "${TMP_VALUES_FILE}" + +# Apply all changes at once +helm upgrade anythingllm ./helm \ + --namespace anything-llm \ + --reuse-values \ + --values "${TMP_VALUES_FILE}" + +# Cleanup handled by trap EXIT +``` + +--- + +## Troubleshooting + +### Issue: "Error establishing database connection" after upgrade + +**Cause:** Used `--reset-values` which regenerated passwords + +**Solution:** +```bash +# Get old password from Helm history +helm get values anythingllm -n anything-llm --revision 5 | grep 
mariadbPassword + +# Base64-encode the recovered password (replace OLD_PASSWORD with the value found above) +OLD_PASSWORD_BASE64=$(printf '%s' "OLD_PASSWORD" | base64) + +# Patch secret with correct password (use the base64-encoded value) +kubectl patch secret anythingllm-secrets \ + -n anything-llm \ + --type='json' \ + -p='[{"op":"replace","path":"/data/MARIADB_PASSWORD","value":"'"$OLD_PASSWORD_BASE64"'"}]' + +# Restart pods +kubectl rollout restart deployment anythingllm -n anything-llm +``` + +### Issue: Changes not persisting after pod restart + +**Cause:** Changes were made via kubectl or the application UI instead of Helm + +**Solution:** Always use `helm upgrade` with `--reuse-values` or `--values` + +### Issue: Can't remember what values were used + +```bash +# View current values +helm get values anythingllm -n anything-llm + +# View all values (including defaults) +helm get values anythingllm -n anything-llm --all + +# View values from specific revision +helm get values anythingllm -n anything-llm --revision 3 +``` + +--- + +## Security Best Practices + +### Never Expose Secrets in Shell History + +```bash +# ❌ BAD: Secret visible in shell history +helm upgrade app ./chart --set password="MySecret123" + +# ✅ GOOD: Use temporary file +AUTH_FILE="$(mktemp)" +trap 'rm -f "$AUTH_FILE"' EXIT + +# Quoting the heredoc delimiter ('EOF') disables variable expansion - the content is written literally +cat > "$AUTH_FILE" << 'EOF' +password: MySecret123 +apiKey: sk-xxx +EOF + +helm upgrade app ./chart --reuse-values --values "$AUTH_FILE" + +# Cleanup handled by trap EXIT +``` + +### Use Secure Secret Generation + +```bash +# Generate cryptographically secure secrets +openssl rand -hex 32 # JWT secrets +openssl rand -base64 32 # API tokens +pwgen -s 32 1 # Passwords (if pwgen installed) +``` + +### Audit Helm Changes + +```bash +# View Helm history +helm history anythingllm -n anything-llm + +# View specific revision details +helm get values anythingllm -n anything-llm --revision 10 + +# Compare two revisions +diff <(helm get values app -n ns --revision 1) \ +
<(helm get values app -n ns --revision 2) +``` + +--- + +## Related Documentation + +- [`VERSIONING_WEOWNVER.md`](./VERSIONING_WEOWNVER.md) - WeOwn versioning system +- [`/anythingllm/README.md`](../anythingllm/README.md) - AnythingLLM deployment guide +- [`/anythingllm/docs/INFISICAL_INTEGRATION.md`](../anythingllm/docs/INFISICAL_INTEGRATION.md) - Automated secret rotation + +--- + +## Quick Reference + +### Safe Upgrade Commands + +```bash +# Standard upgrade (safe for all apps) +helm upgrade APP ./helm --namespace NS --reuse-values + +# Upgrade with single value change (non-secret values only; --set arguments end up in shell history) +helm upgrade APP ./helm --namespace NS --reuse-values --set key=value + +# Upgrade with multiple changes +helm upgrade APP ./helm --namespace NS --reuse-values \ + --set key1=value1 \ + --set key2=value2 + +# Upgrade with values file (using a secure temporary file) +VALUES_FILE="$(mktemp)" +helm get values APP -n NS > "$VALUES_FILE" +# Edit "$VALUES_FILE" +helm upgrade APP ./helm --namespace NS --reuse-values --values "$VALUES_FILE" +rm -f "$VALUES_FILE" + +# Rollback if needed +helm rollback APP -n NS +``` + +### Emergency Recovery + +```text +# If deployment is broken after upgrade: +1. Check what changed (requires the helm-diff plugin): helm diff revision APP 1 2 -n NS +2. View old values: helm get values APP -n NS --revision 1 +3. Rollback: helm rollback APP -n NS +4. Verify: kubectl get pods -n NS +``` + +--- + +## Enterprise Secrets Management + +### 🔐 **Best Practice: External Secret Managers** + +**Recommended Approach**: Use **Infisical Kubernetes Operator** or **HashiCorp Vault** instead of raw Kubernetes secrets for production deployments. + +#### **Why External Secret Managers?** + +**Security Benefits:** +- ✅ Centralized secret rotation without pod restarts +- ✅ Audit trails for secret access +- ✅ Automatic secret sync across clusters +- ✅ Reduced manual intervention +- ✅ Enterprise compliance (SOC2/ISO42001) +- ✅ Secret versioning and rollback + +**vs.
Native Kubernetes Secrets:** +- ❌ Manual rotation requires pod restarts +- ❌ Limited audit capabilities +- ❌ No cross-cluster sync +- ❌ Secrets visible in etcd (even if encrypted) +- ❌ Process listing exposure with `--set` flags + +#### **Infisical Kubernetes Operator Setup** + +```bash +# 1. Install Infisical Operator (version pinned for supply chain security) +helm repo add infisical https://infisical.github.io/helm-charts +helm install infisical-secrets-operator infisical/secrets-operator \ + --version 0.9.0 \ + -n infisical \ + --create-namespace + +# 2. Create InfisicalSecret resource +cat < "$SECRET_VALUES" < + - identity: {} +``` + +**3. Restrict Secret Access with RBAC:** +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: secret-reader + namespace: anything-llm +rules: +- apiGroups: [""] + resources: ["secrets"] + resourceNames: ["anythingllm-secrets"] + verbs: ["get"] +``` + +**4. Rotate Secrets Every 90 Days:** +```bash +# Use secure temp file method from above +# Track rotation dates in compliance documentation +``` + +**5. Audit Secret Access:** +```bash +# Enable Kubernetes audit logging +kubectl logs -n kube-system kube-apiserver-* | grep "secrets/anythingllm-secrets" +``` + +#### **Migration Path: Kubernetes Secrets → Infisical** + +```bash +# 1. Export existing secrets to secure temporary file +# WARNING: This backup contains sensitive data in plain text +BACKUP_FILE="$(mktemp --suffix=.json)" +kubectl get secret anythingllm-secrets -n anything-llm -o json > "$BACKUP_FILE" +echo "⚠️ SECURITY WARNING: Backup file $BACKUP_FILE contains secrets in plain text" +echo " Delete immediately after migration or encrypt with: gpg -c $BACKUP_FILE" + +# 2. Import to Infisical via CLI or dashboard (using stdin to avoid CLI exposure) +kubectl get secret anythingllm-secrets -n anything-llm -o jsonpath='{.data.OPENROUTER_API_KEY}' | base64 -d | infisical secrets set OPENROUTER_API_KEY - + +# 3. 
Deploy InfisicalSecret resource (shown above) + +# 4. Verify sync +kubectl get secret anythingllm-secrets -n anything-llm -o yaml + +# 5. Delete the plaintext backup +rm -f "$BACKUP_FILE" +echo "✅ Backup file deleted" +``` + +**Status**: Infisical integration planned for WeOwn cohort deployments. Current deployments use encrypted Kubernetes secrets with RBAC restrictions. + +--- + +**Remember:** Always use `--reuse-values` for stateful applications. Never use `--reset-values` in production unless you explicitly want to wipe configuration. diff --git a/docs/VERSIONING_WEOWNVER.md b/docs/VERSIONING_WEOWNVER.md new file mode 100644 index 0000000..5111fce --- /dev/null +++ b/docs/VERSIONING_WEOWNVER.md @@ -0,0 +1,190 @@ +# VERSIONING_WEOWNVER.md + +> Official version nomenclature for ♾️ WeOwnNet 🌐 + +## Document Info + +| Field | Value | +|-------|-------| +| Title | #WeOwnVer Specification | +| Version | v2.5.0 | +| Status | ✅ APPROVED (rollout ON HOLD - see notice below) | +| Approved by | yonks.box\|🤖🏛️🪙\|Jason Younker ♾️ | +| Effective | Season 2 Week 5 (Jan 2026) | + +> **⚠️ CRITICAL: VERSIONING SYSTEM ON HOLD** +> +> The #WeOwnVer versioning system is **ON HOLD** and **NOT FINALIZED**. The specification, season/week mappings, date boundaries, and implementation details are still under active review and discussion by leadership. +> +> **DO NOT:** +> - Enforce strict date/week alignment +> - Report calendar boundary inconsistencies +> - Expect all Helm charts to use #WeOwnVer +> - Block work due to versioning discrepancies +> +> **Current Status**: Reference implementation only. Ignore all versioning inconsistencies until the specification is officially finalized and a migration plan is published. + +--- + +## 1. FORMAT + +`SEASON.WEEK.DAY.VERSION` + +| Position | Name | Range | Description | +|----------|------|-------|-------------| +| 1st | SEASON | 1+ | Ecosystem season number | +| 2nd | WEEK | 1-17 | Week within season | +| 3rd | DAY | 0-7 | 0=summary, 1=Mon → 7=Sun | +| 4th | VERSION | 0+ | Release within day | + +--- + +## 2.
DAY VALUES + +| Value | Day | Note | +|-------|-----|------| +| 0 | Summary | Week rollup / no daily | +| 1 | Monday | | +| 2 | Tuesday | | +| 3 | Wednesday | | +| 4 | Thursday | | +| 5 | Friday | | +| 6 | Saturday | | +| 7 | Sunday | | + +--- + +## 3. EXAMPLES + +| Version | Decode | +|---------|--------| +| 3.1.1.1 | Season 3, Week 1, Monday, 1st release | +| 3.2.2.2 | Season 3, Week 2, Tuesday, 2nd release | +| 3.3.3.3 | Season 3, Week 3, Wednesday, 3rd release | +| 3.4.0 | Season 3, Week 4, Day 0 (weekly rollup) | +| 3.2.5.3 | Season 3, Week 2, Friday, 3rd release | + +> **Note**: `3.4.0` is a weekly rollup written in the 3-part shorthand format `SEASON.WEEK.DAY`, where the third component is `DAY=0` (summary). In the full 4-part format `SEASON.WEEK.DAY.VERSION`, this corresponds to `SEASON=3`, `WEEK=4`, `DAY=0` and an implicit `VERSION=0` (the trailing `.0` for `VERSION` is not shown for week summaries). + +--- + +## 4. MULTIPLE RELEASES (SAME DAY) + +| Release | Version | Decode | +|---------|---------|--------| +| 1st | 3.2.2.1 | Season 3, Week 2, Tuesday, 1st | +| 2nd | 3.2.2.2 | Season 3, Week 2, Tuesday, 2nd | +| 3rd | 3.2.2.3 | Season 3, Week 2, Tuesday, 3rd | + +--- + +## 5. SEASON CALENDAR + +| Season | Start | End | ISO Weeks | Months | +|--------|-------|-----|-----------|--------| +| 1 | 2025-06-01 | 2025-09-30 | W23-W40 | Jun-Sep 2025 | +| 2 | 2025-10-01 | 2026-02-01 | 2025-W40–2026-W05 | Oct 2025-Feb 2026 | +| 3 | 2026-02-02 | 2026-05-31 | W06-W22 | Feb-May 2026 | +| 4 | 2026-06-01 | 2026-08-31 | W23-W35 | Jun-Aug 2026 | + +**NOTE**: The exact methodology for determining the WEEK value in SEASON.WEEK.DAY.VERSION will be addressed and clarified in a future update. Until then, refer to existing versioned documents in the repository for current week values. 
+ +### ISO Week Reference (2026) + +| ISO Week | Dates | +|----------|-------| +| W03 | Jan 12-18, 2026 | +| W04 | Jan 19-25, 2026 | +| W05 | Jan 26-Feb 1, 2026 | +| W06 | Feb 2-8, 2026 | +| W07 | Feb 9-15, 2026 | + +--- + +## 6. ARTIFACT SCOPE + +| Artifact Type | Apply #WeOwnVer | Example | +|---------------|-----------------|---------| +| #SharedKernel | ✅ YES | SHARED-KERNEL_v3.1.1.1.md | +| GUIDES | ✅ YES | GUIDE_GAME-MECHANICS_v3.1.1.1.md | +| GOV policies | ✅ YES | GOV-001_v3.1.1.1.md | +| TEMPLATES | ✅ YES | TEMPLATE_ADD-CONTEXT_v3.1.1.1.md | +| RAG uploads | ✅ YES | filename_v3.1.1.1.md | +| Code releases | ✅ YES | v3.1.1.1 tag | +| Helm charts | ✅ YES | Chart version: 2.5.0 (Season 2, Week 5, summary) | +| CCC-IDs | ❌ NO | Keep `CCC_YYYY-WXX_NNN` | +| Session logs | ❌ NO | Keep timestamp-based | + +--- + +## 7. FILENAME CONVENTION + +### Pattern + +`NAME_vSEASON.WEEK.DAY.VERSION.md` + +**Note**: Uppercase terms (NAME, SEASON, WEEK, DAY, VERSION) are placeholders and are not part of the actual filename. For example, use `SHARED-KERNEL_v3.1.1.1.md`, not `NAME_v3.1.1.1.md`. + +### Examples + +| Filename | Decode | +|----------|--------| +| SHARED-KERNEL_v3.1.1.1.md | Season 3, Week 1, Monday, 1st | +| GUIDE_GAME-MECHANICS_v3.2.0.md | Season 3, Week 2, summary | +| GOV-001_v3.3.5.2.md | Season 3, Week 3, Friday, 2nd | + +--- + +## 8. HELM CHART VERSIONING + +For Helm charts and code releases, use the simplified 3-part format for weekly releases: + +| Format | Example | Meaning | +|--------|---------|---------| +| SEASON.WEEK.0 | 2.5.0 | Season 2, Week 5, summary | +| SEASON.WEEK.DAY.VERSION | 2.5.7.1 | Season 2, Week 5, Sunday, 1st release | + +**When to use 3-part vs 4-part:** +- **3-part (SEASON.WEEK.0)**: Weekly rollup releases, no specific day +- **4-part (SEASON.WEEK.DAY.VERSION)**: Day-specific releases, including multiple releases in the same day. Note: Helm requires the `Chart.yaml` `version` field to be valid SemVer 2 (three numeric parts), so use the 4-part form only for git tags or `appVersion`, not as the chart `version` + +--- + +## 9.
TRANSITION PLAN + +| Phase | When | Version Format | +|-------|------|----------------| +| LEGACY | W03-W04 (Jan 2026) | v2.4.x (SemVer) | +| CURRENT | W05 (Jan 26-Feb 1, 2026) | 2.5.0 (#WeOwnVer) | +| ONGOING | W06+ (Feb 2026+) | All new artifacts use #WeOwnVer | + +--- + +## 10. COMPARISON + +| System | Format | Example | Notes | +|--------|--------|---------|-------| +| SemVer | MAJOR.MINOR.PATCH | 2.4.1 | No time context | +| CalVer | YYYY.MM.DD | 2026.01.16 | No semantic meaning | +| **#WeOwnVer** | SEASON.WEEK.DAY.VERSION | 3.1.4.2 | Time + rhythm + semantic | + +--- + +## 11. SPECIAL CASES + +| Pattern | Meaning | +|---------|---------| +| `x.x.0` | Week summary: `SEASON.WEEK.DAY` where `DAY = 0` (VERSION component omitted) | +| `x.x.x.0` | Day summary: `SEASON.WEEK.DAY.VERSION` where `VERSION = 0` | +| `x.x.x.1` | First release of day: `SEASON.WEEK.DAY.VERSION` where `VERSION = 1` | + +--- + +## Version History + +**Note**: This specification document itself uses #WeOwnVer versioning as a reference implementation, even though the broader system is ON HOLD and NOT FINALIZED for other artifacts. + +| Version | Date | Changes | +|---------|------|---------| +| v2.4.0 | 2026-01-16 | Initial #WeOwnVer specification | +| v2.5.0 | 2026-01-26 | Added Helm chart versioning, transitioned to #WeOwnVer |