# =====================================================================
# AnchorSOL® Wall - robots.txt
# Sitemap: https://anchorsolwall.com/sitemap.xml
# LLM/AI summary: https://anchorsolwall.com/llms.txt
# =====================================================================

# Default group: applies to every crawler that does not have a more specific
# User-agent group of its own. Everything is crawlable except the paths below.
#
# NOTE: there are deliberately no blank lines inside this group. Parsers that
# follow the original robots.txt record format treat a blank line as the end
# of the record and would ignore every rule after it. Use "#" lines as
# visual separators instead.
User-agent: *
#
# Development / working folders
Disallow: /research/
Disallow: /extracted/
Disallow: /tools/
Disallow: /handoff/
Disallow: /originals/
#
# Legacy WordPress paths - the site used to run on WordPress.
# These URLs no longer exist on the current static site, but crawlers still
# try to fetch them. Disallowing them stops crawl budget being spent on URLs
# that will only ever return 404.
Disallow: /wp-admin/
Disallow: /wp-content/
Disallow: /wp-includes/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Disallow: /wp-json/
Disallow: /?p=
Disallow: /?page_id=
Disallow: /feed/
Disallow: /*/feed/
Disallow: /comments/feed/
Disallow: /tag/
Disallow: /category/
Disallow: /author/
Disallow: /industries-category/
Disallow: /industries/
Disallow: /faqs/
#
# Old WordPress spam/injected posts are intentionally NOT disallowed:
#   /for-the-wealthy-work-is-the-new-retirement/
#   /top-5-tips-for-solving-the-email-security-problem/
# Disallow only blocks fetching; it does not remove a URL from a search index.
# A blocked URL can never be re-fetched, so the crawler never sees the 404 and
# may keep the bare URL listed. Leaving these crawlable lets the 404 (ideally
# a 410) drop them from the index.
#
# Explicit catch-all, kept LAST on purpose: longest-match parsers are
# unaffected by rule order, but first-match parsers would stop at an
# "Allow: /" placed first and never apply the Disallow rules above.
Allow: /

# Named crawlers, explicit welcome.
#   Search engines: Googlebot, Bingbot, DuckDuckBot, Slurp, Baiduspider,
#                   YandexBot, Yeti, SeznamBot
#   AI / LLM:       GPTBot, ClaudeBot, PerplexityBot, Google-Extended
#                   (a structured summary for these lives at /llms.txt)
#
# A crawler obeys ONLY the most specific group that matches its name and
# ignores the "User-agent: *" group entirely. A named group that contains just
# "Allow: /" therefore opens up every path, including the dev folders and the
# legacy WordPress URLs. To prevent that, all named crawlers share this one
# group, and the group carries the path rules itself.
# Keep the Disallow list below in step with the "User-agent: *" group when
# paths are added or removed. No blank lines inside the group.
User-agent: Googlebot
User-agent: Bingbot
User-agent: DuckDuckBot
User-agent: Slurp
User-agent: Baiduspider
User-agent: YandexBot
User-agent: Yeti
User-agent: SeznamBot
User-agent: GPTBot
User-agent: ClaudeBot
User-agent: PerplexityBot
User-agent: Google-Extended
#
# Development / working folders
Disallow: /research/
Disallow: /extracted/
Disallow: /tools/
Disallow: /handoff/
Disallow: /originals/
#
# Legacy WordPress paths (no longer exist; would only return 404)
Disallow: /wp-admin/
Disallow: /wp-content/
Disallow: /wp-includes/
Disallow: /wp-login.php
Disallow: /xmlrpc.php
Disallow: /wp-json/
Disallow: /?p=
Disallow: /?page_id=
Disallow: /feed/
Disallow: /*/feed/
Disallow: /comments/feed/
Disallow: /tag/
Disallow: /category/
Disallow: /author/
Disallow: /industries-category/
Disallow: /industries/
Disallow: /faqs/
#
# The old spam-post URLs (/for-the-wealthy-work-is-the-new-retirement/ and
# /top-5-tips-for-solving-the-email-security-problem/) are left crawlable for
# these bots on purpose: a search engine has to fetch them and see the 404
# before it will drop them from its index.
#
# Everything else is welcome. Kept last so first-match parsers still apply
# the Disallow rules above.
Allow: /

# Sitemap reference. This line is not part of any User-agent group, so it
# applies to every crawler that supports the Sitemap directive.
Sitemap: https://anchorsolwall.com/sitemap.xml