# robots.txt for https://smarter-systems.com
# Last updated: 2026-05-14
# Companion: /llms.txt
#
# Explicit allow stanzas for major AI / generative-search crawlers.
# SEAKT (Dimension T) checks for per-bot handling rather than relying
# on the wildcard. Grouped by vendor for readability.

# ---------- AI crawlers explicitly allowed ----------

# All AI crawlers below share ONE group. Under RFC 9309 a crawler obeys
# only the most specific group that names it and ignores "User-agent: *"
# completely, so every rule or signal these bots should honour must be
# stated in this group; nothing is inherited from the fallback group.
# Each bot is still named explicitly (per-bot handling); stacking several
# User-agent lines on one rule set is standard robots.txt syntax.
# No blank lines inside the group: some legacy parsers end a group at the
# first blank line.
#
# OpenAI
User-agent: GPTBot
User-agent: OAI-SearchBot
User-agent: ChatGPT-User
# Anthropic
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: Claude-Web
# Perplexity
User-agent: PerplexityBot
User-agent: Perplexity-User
# Google: Gemini training / grounding control token. It does not affect
# Google Search or AI Overviews; those follow the regular Googlebot rules.
User-agent: Google-Extended
# Common Crawl (training-data backbone for many LLMs)
User-agent: CCBot
# Cohere
User-agent: cohere-ai
# Meta
User-agent: Meta-ExternalAgent
# Apple Intelligence
User-agent: Applebot-Extended
# ByteDance (Doubao)
User-agent: Bytespider
# Same usage grant as the fallback group, repeated here because the bots
# named above never read the fallback group.
Content-Signal: search=yes, ai-train=yes, ai-input=yes
Allow: /
# Contact-form endpoint is POST-only (returns 405 on GET); nothing to crawl.
# The fallback group's legacy-WordPress rules target search-engine
# re-crawling and are intentionally not repeated here.
Disallow: /api/

# ---------- Fallback ----------
# Applies only to crawlers that are NOT named in a group elsewhere in this
# file: a crawler with its own group ignores this one entirely (RFC 9309
# most-specific-match rule). Keep the whole group free of blank lines so
# that legacy parsers, which end a group at the first blank line, do not
# orphan the later rules.
User-agent: *
# Content Signals (contentsignals.org / IETF draft-romm-aipref):
# We grant all three usage modes. search=yes so AI search engines
# may cite us; ai-train=yes so LLMs may use our content as training
# material; ai-input=yes so RAG systems may retrieve us as context.
# Rationale: maximum visibility is the goal; the content here is
# marketing material we want amplified.
Content-Signal: search=yes, ai-train=yes, ai-input=yes
Allow: /
# Companion files. Already covered by "Allow: /"; listed explicitly so the
# intent survives if a broader Disallow is ever added to this group.
Allow: /llms.txt
Allow: /llms-full.txt
# Contact form is a POST-only Cloudflare Pages Function; no value
# crawling it (returns 405 on GET). Saves crawl budget.
Disallow: /api/
# Legacy WordPress paths: the site migrated to Astro and these endpoints
# no longer exist. Disallow stops compliant crawlers from requesting them,
# which saves crawl budget.
# NOTE(review): blocking a URL does not de-index it. A blocked crawler
# never sees the 404/410 response, so URLs that are already indexed can
# linger. If faster removal from Google is the goal, consider dropping
# these rules so the 404/410 responses can be crawled.
Disallow: /wp-admin/
Disallow: /wp-content/
Disallow: /wp-includes/
Disallow: /wp-json/
Disallow: /?s=
Disallow: /?p=

# Sitemap is independent of User-agent groups: it is read by every crawler
# regardless of which group that crawler matched.
Sitemap: https://smarter-systems.com/sitemap.xml
# LLMs-txt is a proposed standard, not a recognised robots.txt directive
# (Lighthouse flags non-standard directives as errors), so we keep the
# pointer as a comment. Crawlers that support llmstxt.org find /llms.txt
# at its conventional path regardless.
# NOTE(review): Content-Signal, used earlier in this file, is likewise a
# non-standard directive; confirm whether Lighthouse flags it and whether
# that is an accepted trade-off.
# llms.txt location: https://smarter-systems.com/llms.txt