November 26, 2024

Robots.txt for npr.org

# robots.txt for www.npr.org
# Changes are tracked in www-render

# Default rules: apply to every crawler that is not named by a more specific
# User-agent group elsewhere in this file. All rules are Disallow, so their
# order does not affect matching; they are grouped by purpose for readability.
# (Separators are comment lines, not blank lines, so the group stays contiguous.)
User-agent: *
#
# --- Site paths and endpoints (prefix matches) ---
Disallow: /mpx/
Disallow: /cgi-bin
Disallow: /ramfiles/
Disallow: /oauth2/
Disallow: /account/
Disallow: /proxy/
Disallow: /proxy/*
Disallow: /player/*
Disallow: /get/*
Disallow: /geolocation
Disallow: /sureroute
Disallow: /templates/search/*
Disallow: /*/partials*
#
# --- PHP entry points when called with a query string ---
Disallow: /stations/force/force_localization.php?
Disallow: /rundowns/segment.php?
#
# --- Any URL that contains a "?" ---
Disallow: /*?*
#
# --- File extensions (patterns are not $-anchored, so they match the
# --- extension text anywhere in the URL, not only at the end) ---
Disallow: /*.smil
Disallow: /*.asx
Disallow: /*.ram
Disallow: /*.wav
Disallow: /*.rmm
Disallow: /*.js
Disallow: /*.au
#
# --- Individual story pages and transcripts (transcripts are keyed by story ID) ---
Disallow: /2013/03/21/174840895/
Disallow: /sections/ombudsman/2008/01/frequently_asked_questions_1.html
Disallow: /sections/health-shots/2013/03/11/173816690/new-voices-for-the-voiceless-synthetic-speech-gets-an-upgrade
Disallow: /transcripts/470280334*
Disallow: /2015/07/04/419570939/chasing-memories-in-their-refugee-camp-40-years-after-they-fled-vietnam
Disallow: /transcripts/419570939*
Disallow: /sections/parallels/2016/08/15/480128005/for-french-teens-smoking-still-has-more-allure-than-stigma
Disallow: /transcripts/480128005*
Disallow: /2020/04/08/830237502/episode-989-what-if-no-one-pays-rent
Disallow: /transcripts/830237502*
Disallow: /sections/goatsandsoda/2015/06/09/406744975/a-gender-revolution-hits-the-streets-two-wheels-at-a-time
Disallow: /transcripts/406744975*

# Disallowing the OpenAI web crawler (whole site)
User-agent: GPTBot
Disallow: /

# Disallowing OpenAI plugins (requests identifying as ChatGPT-User; whole site)
User-agent: ChatGPT-User
Disallow: /

# Disallowing Common Crawl (whole site)
User-agent: CCBot
Disallow: /

# Disallowing Google Bard and Vertex AI web crawlers (whole site)
# NOTE(review): Google-Extended is presumably a control token rather than a
# separate crawler, and Bard has since been renamed Gemini - confirm against
# Google's crawler documentation and update this wording.
User-agent: Google-Extended
Disallow: /

# Disallowing various bots. Each agent below gets its own group with a
# site-wide "Disallow: /", so it is blocked from every path rather than
# falling back to the narrower "User-agent: *" rules.
# NOTE(review): presumably these are AI / data-harvesting crawlers like the
# commented groups earlier in this file - confirm when adding or removing entries.
User-agent: anthropic-ai
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Claude-Web
Disallow: /

User-agent: cohere-ai
Disallow: /

User-agent: Diffbot
Disallow: /

User-agent: FacebookBot
Disallow: /

User-agent: omgili
Disallow: /

User-agent: omgilibot
Disallow: /

User-agent: PerplexityBot
Disallow: /

# Allow Google Search Console for sitemap crawling.
# A crawler obeys only the most specific group that names it, so every agent
# listed here is exempt from the "User-agent: *" Disallow rules and may fetch
# the whole site.
# The image and video crawlers are addressed by the tokens Google publishes
# (Googlebot-Image / Googlebot-Video); "Google-Image" and "Google-Video" are
# not tokens any Google crawler matches.
User-agent: Google-InspectionTool
Allow: /
User-agent: Googlebot-Image
Allow: /
User-agent: Googlebot-Video
Allow: /
User-agent: Googlebot
Allow: /

# Explicitly allowing the TTD contextual crawler.
# Groups are matched on a crawler's product token, not on its full HTTP
# User-Agent header, so the token "TTD-Content" is listed on its own line.
# The full header string is kept as a second User-agent line for any parser
# that compares it verbatim.
User-agent: TTD-Content
User-agent: Mozilla/5.0 (compatible; TTD-Content; +https://www.thetradedesk.com/general/ttd-content)
Allow: /

# Ensures that crawlers use the correct sitemaps. Three of them are served from googlecrawl*.npr.org rather than www*.npr.org;
# that is OK because the crawler will only accept URLs in those sitemaps that match www*.npr.org.
Sitemap: https://googlecrawl.npr.org/standard/sitemap_index.xml
Sitemap: https://googlecrawl.npr.org/news/sitemap_news.xml
Sitemap: https://googlecrawl.npr.org/video/sitemap_video.xml
Sitemap: https://www.npr.org/live-updates/sitemap.xml