diff --git a/.env.example b/.env.example
index c46b9245..793b21f9 100644
--- a/.env.example
+++ b/.env.example
@@ -36,6 +36,9 @@ M3U8_SUPPORT=False
 
 # Enable Aria2 for downloads (True/False)
 ENABLE_ARIA2=False
 
+# Enable Cloudflare bypass for direct downloads (True/False)
+BYPASS_CLOUDFLARE=True
+
 # Path to Rclone executable
 RCLONE_PATH=
diff --git a/pyproject.toml b/pyproject.toml
index 5aefbb20..9b7b2c90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ description = "Default template for PDM package"
 authors = [
     {name = "Benny", email = "benny.think@gmail.com"},
 ]
-dependencies = ["tgcrypto>=1.2.5", "yt-dlp[curl-cffi,default]==2026.1.31", "APScheduler>=3.11.2", "ffmpeg-python>=0.2.0", "PyMySQL>=1.1.1", "filetype>=1.2.0", "beautifulsoup4>=4.14.3", "fakeredis>=2.33.0", "redis==6.4.0", "requests>=2.32.5", "tqdm==4.67.2", "token-bucket>=0.3.0", "python-dotenv>=1.0.1", "black>=24.10.0", "sqlalchemy>=2.0.36", "psutil==7.2.2", "ffpb>=0.4.1", "kurigram==2.2.18", "cryptography>=46.0.4", "greenlet==3.3.1"]
+dependencies = ["tgcrypto>=1.2.5", "yt-dlp[curl-cffi,default]==2026.1.31", "APScheduler>=3.11.2", "ffmpeg-python>=0.2.0", "PyMySQL>=1.1.1", "filetype>=1.2.0", "beautifulsoup4>=4.14.3", "fakeredis>=2.33.0", "redis==6.4.0", "requests>=2.32.5", "tqdm==4.67.2", "token-bucket>=0.3.0", "python-dotenv>=1.0.1", "black>=24.10.0", "sqlalchemy>=2.0.36", "psutil==7.2.2", "ffpb>=0.4.1", "kurigram==2.2.18", "cryptography>=46.0.4", "greenlet==3.3.1", "ai-cloudscraper>=3.8.4"]
 requires-python = ">=3.10"
 readme = "README.md"
 license = {text = "Apache2.0"}
diff --git a/requirements.txt b/requirements.txt
index 54783940..96cc9b20 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,5 @@ psutil>=7.2.2
 ffpb>=0.4.1
 cryptography>=46.0.4
 kurigram==2.2.18
-yt-dlp[default,curl-cffi]==2026.1.31
\ No newline at end of file
+yt-dlp[default,curl-cffi]==2026.1.31
+ai-cloudscraper>=3.8.4
\ No newline at end of file
diff --git a/src/config/config.py b/src/config/config.py
index ccf130f9..85abdca4 100644
--- a/src/config/config.py
+++ b/src/config/config.py
@@ -39,6 +39,7 @@ def get_env(name: str, default=None):
 AUDIO_FORMAT = get_env("AUDIO_FORMAT", "m4a")
 M3U8_SUPPORT = get_env("M3U8_SUPPORT")
 ENABLE_ARIA2 = get_env("ENABLE_ARIA2")
+BYPASS_CLOUDFLARE = get_env("BYPASS_CLOUDFLARE", True)
 RCLONE_PATH = get_env("RCLONE")
 
 
diff --git a/src/engine/direct.py b/src/engine/direct.py
index 0388ad45..8f2d6c73 100644
--- a/src/engine/direct.py
+++ b/src/engine/direct.py
@@ -15,8 +15,9 @@
 import filetype
 import requests
 
-from config import ENABLE_ARIA2, TMPFILE_PATH
+from config import BYPASS_CLOUDFLARE, ENABLE_ARIA2, TMPFILE_PATH
 from engine.base import BaseDownloader
+from utils.http_client import get_http_client
 
 
 class DirectDownload(BaseDownloader):
@@ -40,7 +41,8 @@ def _setup_formats(self) -> list | None:
 
     def _requests_download(self):
         logging.info("Requests download with url %s", self._url)
-        response = requests.get(self._url, stream=True)
+        client = get_http_client(bypass_enabled=BYPASS_CLOUDFLARE)
+        response = client.get(self._url, stream=True)
         response.raise_for_status()
         file = Path(self._tempdir.name).joinpath(uuid4().hex)
         with open(file, "wb") as f:
diff --git a/src/test_cloudflare_bypass.py b/src/test_cloudflare_bypass.py
new file mode 100644
index 00000000..e7221dd9
--- /dev/null
+++ b/src/test_cloudflare_bypass.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+# ytdlbot - test_cloudflare_bypass.py
+# Test script for Cloudflare bypass functionality using ai-cloudscraper
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+
+from utils.http_client import BypassHTTPClient, get_realistic_headers
+
+
+def test_cloudflare_bypass():
+    print("=" * 50)
+    print("Testing Cloudflare Bypass (ai-cloudscraper)")
+    print("=" * 50)
+
+    test_sites = [
+        ("https://nowsecure.nl/", "NowSecure (Cloudflare protected)"),
+        ("https://www.google.com/", "Google (no protection)"),
+    ]
+
+    client = BypassHTTPClient(bypass_enabled=True)
+
+    for url, description in test_sites:
+        print(f"\nTesting: {description}")
+        print(f"URL: {url}")
+
+        try:
+            resp = client.get(url, timeout=10)
+            print(f"Status: {resp.status_code}")
+            print(f"Content length: {len(resp.text)} chars")
+            print("Result: OK")
+        except Exception as e:
+            print(f"Error: {e}")
+            print("Result: FAILED")
+
+    client.close()
+    print("\n" + "=" * 50)
+    print("Test completed!")
+    print("=" * 50)
+
+
+def test_headers():
+    print("\n" + "=" * 50)
+    print("Testing Realistic Headers")
+    print("=" * 50)
+
+    headers = get_realistic_headers()
+
+    for key, value in headers.items():
+        print(f"{key}: {value}")
+
+    print("\nResult: OK")
+
+
+def test_direct_download_usage():
+    print("\n" + "=" * 50)
+    print("Testing DirectDownload Usage Pattern")
+    print("=" * 50)
+
+    from utils.http_client import get_http_client
+
+    client = get_http_client(bypass_enabled=True)
+
+    print("\nTesting single instance pattern:")
+    print(f"Client type: {type(client).__name__}")
+
+    try:
+        resp = client.get("https://httpbin.org/headers", timeout=5)
+        print(f"Status: {resp.status_code}")
+        print("Result: OK")
+    except Exception as e:
+        print(f"Error: {e}")
+        print("Result: FAILED")
+
+    print("\n" + "=" * 50)
+
+
+if __name__ == "__main__":
+    test_cloudflare_bypass()
+    test_headers()
+    test_direct_download_usage()
diff --git a/src/utils/http_client.py b/src/utils/http_client.py
new file mode 100644
index 00000000..dcee388b
--- /dev/null
+++ b/src/utils/http_client.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+# ytdlbot - http_client.py
+# HTTP client with Cloudflare bypass support using ai-cloudscraper
+
+__author__ = "yeshua-aguilar"
+
+import logging
+from typing import Optional
+
+import cloudscraper
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+
+class BypassHTTPClient:
+    """HTTP client that automatically bypasses Cloudflare protection."""
+
+    def __init__(self, bypass_enabled: bool = True, timeout: int = 30):
+        self._bypass_enabled = bypass_enabled
+        self._timeout = timeout
+        self._session: Optional[requests.Session] = None
+        self._scraper: Optional[cloudscraper.CloudScraper] = None
+
+        self._user_agent = (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+        )
+
+        self._headers = {
+            "User-Agent": self._user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate, br",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+        }
+
+    def _create_scraper(self) -> cloudscraper.CloudScraper:
+        scraper = cloudscraper.create_scraper(
+            browser={
+                "browser": "chrome",
+                "platform": "windows",
+                "desktop": True,
+            },
+            delay=10,
+        )
+        scraper.headers.update(self._headers)
+        return scraper
+
+    def _create_session(self) -> requests.Session:
+        session = requests.Session()
+        session.headers.update(self._headers)
+
+        retry_strategy = Retry(
+            total=3,
+            backoff_factor=1,
+            status_forcelist=[429, 500, 502, 503, 504],
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
+    def get(self, url: str, **kwargs) -> requests.Response:
+        """Make a GET request with Cloudflare bypass if needed."""
+        kwargs.setdefault("timeout", self._timeout)
+
+        if self._bypass_enabled:
+            return self._bypass_get(url, **kwargs)
+        return self._normal_get(url, **kwargs)
+
+    def _bypass_get(self, url: str, **kwargs) -> requests.Response:
+        """Try Cloudflare bypass first, fallback to normal request."""
+        try:
+            if self._scraper is None:
+                self._scraper = self._create_scraper()
+
+            logging.debug("Attempting Cloudflare bypass for %s", url)
+            response = self._scraper.get(url, **kwargs)
+
+            # Check the Server header instead of response.text: reading .text on
+            # a stream=True response would pull the whole body into memory.
+            if response.status_code == 403 and "cloudflare" in response.headers.get("Server", "").lower():
+                logging.warning("Cloudflare bypass failed, trying normal request")
+                return self._normal_get(url, **kwargs)
+
+            return response
+        except Exception as e:
+            logging.warning("Cloudflare bypass error: %s, falling back to normal request", e)
+            return self._normal_get(url, **kwargs)
+
+    def _normal_get(self, url: str, **kwargs) -> requests.Response:
+        """Make a normal GET request without bypass."""
+        if self._session is None:
+            self._session = self._create_session()
+
+        return self._session.get(url, **kwargs)
+
+    def close(self):
+        """Close all sessions."""
+        if self._session:
+            self._session.close()
+            self._session = None
+        if self._scraper:
+            self._scraper.close()
+            self._scraper = None
+
+
+_client_instance: Optional[BypassHTTPClient] = None
+
+
+def get_http_client(bypass_enabled: bool = True) -> BypassHTTPClient:
+    """Get or create a shared HTTP client instance.
+
+    The cached client is replaced (and the old one closed) when a caller
+    requests a different bypass mode, so the flag is never silently ignored.
+    """
+    global _client_instance
+    if _client_instance is None or _client_instance._bypass_enabled != bypass_enabled:
+        if _client_instance is not None:
+            _client_instance.close()
+        _client_instance = BypassHTTPClient(bypass_enabled=bypass_enabled)
+    return _client_instance
+
+
+def get_cloudflare_bypass_session() -> cloudscraper.CloudScraper:
+    """Get a CloudScraper session for yt-dlp or other libraries."""
+    scraper = cloudscraper.create_scraper(
+        browser={
+            "browser": "chrome",
+            "platform": "windows",
+            "desktop": True,
+        },
+        delay=10,
+    )
+    return scraper
+
+
+def get_realistic_headers() -> dict:
+    """Get realistic browser headers for manual use."""
+    return {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+        ),
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate, br",
+        "DNT": "1",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+    }