-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathstatic_site_generator.py
More file actions
415 lines (349 loc) · 16.3 KB
/
static_site_generator.py
File metadata and controls
415 lines (349 loc) · 16.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
#!/usr/bin/env python3
"""
New Static Site Generator for learncms
This script crawls https://learn.knightlab.com/ and generates static HTML files
that are fully portable with no external dependencies on S3 or other off-site media.
"""
import os
import sys
import json
import time
import re
import shutil
from pathlib import Path
import argparse
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
class StaticSiteGenerator:
    """Crawl a live learncms site and write a fully portable static copy.

    Pages are fetched over HTTP, media references are rewritten/validated so
    the output has no off-site dependencies, local static assets are copied
    into the output tree, and generation fails loudly (exit code 1) when
    portability cannot be guaranteed.
    """

    def __init__(self, base_url, output_dir, static_dir='static', timeout=30):
        """
        Args:
            base_url: Root URL of the live site; a trailing slash is stripped.
            output_dir: Directory the site is written to (created if absent).
            static_dir: Local directory containing the site's static assets.
            timeout: Default per-request timeout in seconds for fetch_url().
        """
        self.base_url = base_url.rstrip('/')
        self.domain = urlparse(base_url).netloc
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.static_dir = Path(static_dir)
        self.timeout = timeout
        self.crawled_urls = set()        # site-relative paths already fetched/saved
        self.failed_urls = []            # (url, error description) pairs
        self.off_site_dependencies = []  # error strings for disallowed external media
        self.media_files = set()         # absolute URLs of on-site media encountered
        self.missing_local_assets = []   # media.knightlab.com files not found locally

    def fetch_url(self, url, timeout=None):
        """Fetch a URL and return the content.

        Returns a ``(content_bytes, content_type)`` pair, or ``(None, None)``
        on any failure (the failure is also recorded in ``self.failed_urls``).
        HTML responses are transcoded to UTF-8 using the charset declared in
        the Content-Type header. *timeout* defaults to the instance-wide
        timeout configured in the constructor.
        """
        if timeout is None:
            timeout = self.timeout
        try:
            print(f"Fetching: {url}")
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
            }
            request = Request(url, headers=headers)
            with urlopen(request, timeout=timeout) as response:
                if response.status == 200:
                    content_type = response.headers.get('content-type', '').lower()
                    content = response.read()
                    # Normalize HTML to UTF-8 regardless of the server's charset.
                    if 'html' in content_type:
                        encoding = 'utf-8'  # default when no charset is declared
                        if 'charset=' in content_type:
                            encoding = content_type.split('charset=')[1].split(';')[0].strip()
                        content = content.decode(encoding, errors='replace')
                        return content.encode('utf-8'), content_type
                    return content, content_type
                else:
                    print(f"Warning: {url} returned status {response.status}")
                    return None, None
        except HTTPError as e:
            print(f"HTTP Error {e.code} for {url}: {e.reason}")
            self.failed_urls.append((url, f"HTTP {e.code}: {e.reason}"))
            return None, None
        except URLError as e:
            print(f"URL Error for {url}: {e.reason}")
            self.failed_urls.append((url, f"URL Error: {e.reason}"))
            return None, None
        except Exception as e:
            print(f"Unexpected error for {url}: {e}")
            self.failed_urls.append((url, f"Error: {e}"))
            return None, None

    def get_lessons_from_api(self):
        """Return the slugs of all published lessons from the /lesson.json API."""
        lessons_url = f"{self.base_url}/lesson.json"
        content, _ = self.fetch_url(lessons_url)
        if content:
            try:
                lessons_data = json.loads(content.decode('utf-8'))
                return [data["slug"] for title, data in lessons_data.items()
                        if data.get("status") == "published"]
            except json.JSONDecodeError as e:
                print(f"Error parsing lessons JSON: {e}")
        return []

    def validate_media_dependencies(self, html_content, page_url):
        """
        Validate that all media dependencies are on-site.
        Fail loudly if any off-site dependencies are found.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # Tags and the attributes on each that can reference external media.
        media_tags = {
            'img': ['src', 'data-src'],
            'script': ['src'],
            'link': ['href'],
            'source': ['src'],
            'video': ['src', 'poster'],
            'audio': ['src'],
            'object': ['data'],
            'embed': ['src'],
            'iframe': ['src'],
        }
        for tag_name, attrs in media_tags.items():
            for tag in soup.find_all(tag_name):
                for attr in attrs:
                    url = tag.get(attr)
                    if url:
                        self._check_media_url(url, page_url)

    def _check_media_url(self, url, page_url):
        """Classify a media URL as on-site, an allowed CDN, or an off-site dependency."""
        # Fragments and non-HTTP schemes cannot be media dependencies.
        if not url or url.startswith('#') or url.startswith('javascript:') or url.startswith('mailto:'):
            return
        # Protocol-relative URLs inherit https.
        if url.startswith('//'):
            url = 'https:' + url
        # Resolve relative URLs against the page they appeared on.
        if not url.startswith(('http://', 'https://')):
            url = urljoin(page_url, url)
        parsed_url = urlparse(url)
        # Check if this is an off-site dependency.
        if parsed_url.netloc and parsed_url.netloc != self.domain:
            # Special handling for media.knightlab.com - should be served locally.
            if parsed_url.netloc == 'media.knightlab.com':
                return self._check_knightlab_media(url, parsed_url.path, page_url)
            # Hosts that are acceptable to load from a CDN. The comparison is
            # an exact hostname match: the previous substring test would have
            # whitelisted hosts such as "code.jquery.com.evil.example".
            allowed_cdns = {
                'cdn.knightlab.com',
                'fonts.googleapis.com',
                'fonts.gstatic.com',
                'ajax.googleapis.com',
                'code.jquery.com',
                'maxcdn.bootstrapcdn.com',
                'cdnjs.cloudflare.com',
                'www.googletagmanager.com',  # Allow Google Tag Manager
            }
            if parsed_url.netloc not in allowed_cdns:
                dependency = f"Off-site media dependency found: {url} (referenced from {page_url})"
                print(f"ERROR: {dependency}")
                self.off_site_dependencies.append(dependency)
        else:
            # This is an on-site media file.
            self.media_files.add(url)

    def _check_knightlab_media(self, full_url, path, page_url):
        """Check if a media.knightlab.com asset exists locally; return its local path or None.

        Missing assets are recorded in ``self.missing_local_assets``.
        NOTE(review): this checks ``static_dir`` while _rewrite_knightlab_urls
        checks ``output_dir`` — presumably both trees hold the same copied
        assets by the time this runs; confirm against generate_site() ordering.
        """
        # Remove the /learncms/ prefix if present.
        if path.startswith('/learncms/'):
            local_path = path[len('/learncms/'):]
        else:
            local_path = path.lstrip('/')
        # Check whether the file exists in the static directory.
        local_file = self.static_dir / local_path
        if local_file.exists():
            print(f"Found local asset: {local_file} for {full_url}")
            return f"/{local_path}"  # local path usable for URL rewriting
        missing = f"Missing local asset: {local_file} for {full_url} (referenced from {page_url})"
        print(f"ERROR: {missing}")
        self.missing_local_assets.append(missing)
        return None

    def process_html_content(self, content, page_url):
        """Decode, rewrite, and validate an HTML page; return it as a str."""
        if isinstance(content, bytes):
            try:
                content = content.decode('utf-8')
            except UnicodeDecodeError:
                # latin-1 maps every byte, so this fallback cannot fail.
                content = content.decode('latin-1')
        # Rewrite media.knightlab.com URLs to local paths.
        content = self._rewrite_knightlab_urls(content, page_url)
        # Add a charset meta tag if missing (check is case-sensitive by design
        # of the original; pages emitted by learncms match this casing).
        if '<meta charset=' not in content and '<meta http-equiv="Content-Type"' not in content:
            content = content.replace('<head>', '<head>\n    <meta charset="utf-8">')
        # Validate media dependencies after rewriting.
        self.validate_media_dependencies(content, page_url)
        return content

    def _rewrite_knightlab_urls(self, content, page_url):
        """Rewrite media.knightlab.com URLs and fix bare relative image paths.

        *page_url* is currently unused; it is kept for interface stability.
        """
        # Full media.knightlab.com/learncms/... URLs become site-absolute
        # paths when the corresponding file already exists in the output tree.
        pattern = r'https?://media\.knightlab\.com/learncms/([^"\'\s>]+)'

        def replace_knightlab_url(match):
            path = match.group(1)
            local_file = self.output_dir / path
            if local_file.exists():
                return f"/{path}"
            # Keep the original URL; validation will flag it later.
            return match.group(0)

        content = re.sub(pattern, replace_knightlab_url, content)
        # Make bare relative paths like "images/learn/x.png" site-absolute.
        # The negative lookbehind skips paths that are already absolute
        # ("/images/learn/...") or embedded inside a longer path/URL; without
        # it the substitution produced broken "//images/learn/..." URLs.
        content = re.sub(r'(?<![\w/.-])images/learn/([^"\'\s>]+)', r'/images/learn/\1', content)
        return content

    def save_content(self, relative_path, content, content_type='text/html'):
        """Write fetched content under the output directory as UTF-8 text.

        ``''``/``'/'`` map to index.html; directory-style paths ("/foo/")
        become foo/index.html; anything else is written as a plain file.
        HTML is run through process_html_content() before writing.
        """
        if isinstance(content, bytes):
            try:
                content = content.decode('utf-8')
            except UnicodeDecodeError:
                # latin-1 maps every byte, so this fallback cannot fail.
                content = content.decode('latin-1')
        # Work out the destination file path.
        if relative_path == '' or relative_path == '/':
            file_path = self.output_dir / 'index.html'
        elif relative_path.endswith('/'):
            # Directory-style URLs get an index.html.
            dir_path = self.output_dir / relative_path.strip('/')
            dir_path.mkdir(parents=True, exist_ok=True)
            file_path = dir_path / 'index.html'
        else:
            # File-style URLs.
            file_path = self.output_dir / relative_path.lstrip('/')
            file_path.parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving: {file_path}")
        if 'html' in content_type:
            # Process HTML content (rewriting + validation).
            page_url = urljoin(self.base_url, relative_path)
            content = self.process_html_content(content, page_url)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def crawl_static_pages(self):
        """Crawl the fixed set of static template pages."""
        static_pages = [
            '/',          # homepage
            '/ask/',      # ask page
            '/404.html',  # 404 page
            '/500.html',  # 500 page
        ]
        for page in static_pages:
            if page not in self.crawled_urls:
                url = f"{self.base_url}{page}"
                content, content_type = self.fetch_url(url)
                if content:
                    self.save_content(page, content, content_type)
                    self.crawled_urls.add(page)

    def crawl_lesson_pages(self):
        """Crawl all published lesson pages listed by the JSON API."""
        lesson_slugs = self.get_lessons_from_api()
        print(f"Found {len(lesson_slugs)} published lessons")
        for slug in lesson_slugs:
            lesson_path = f"/lesson/{slug}/"
            if lesson_path not in self.crawled_urls:
                url = f"{self.base_url}{lesson_path}"
                content, content_type = self.fetch_url(url)
                if content:
                    self.save_content(lesson_path, content, content_type)
                    self.crawled_urls.add(lesson_path)

    def crawl_json_apis(self):
        """Crawl the JSON API endpoints the site exposes."""
        json_endpoints = [
            '/glossary.json',
            '/lesson.json',
            '/capsule.json',
        ]
        for endpoint in json_endpoints:
            if endpoint not in self.crawled_urls:
                url = f"{self.base_url}{endpoint}"
                content, content_type = self.fetch_url(url)
                if content:
                    self.save_content(endpoint, content, content_type)
                    self.crawled_urls.add(endpoint)

    def copy_all_static_assets(self):
        """Copy static/, components/images/, and learn-media/ into the output tree."""
        print("Copying static assets...")
        # Copy everything from the static/ directory verbatim.
        if self.static_dir.exists():
            for item in self.static_dir.rglob('*'):
                if item.is_file():
                    relative_path = item.relative_to(self.static_dir)
                    dest_path = self.output_dir / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_path)
                    print(f"Copied: {relative_path}")
        # Copy images from components/images to images/ in the output.
        components_images = Path('components/images')
        if components_images.exists():
            for item in components_images.rglob('*'):
                if item.is_file():
                    relative_path = item.relative_to(components_images)
                    dest_path = self.output_dir / 'images' / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_path)
                    print(f"Copied component image: {relative_path} -> images/{relative_path}")
        # Copy from the learn-media directory to imagelib/ in the output.
        learn_media_path = Path('learn-media')
        if learn_media_path.exists():
            for item in learn_media_path.rglob('*'):
                if item.is_file():
                    relative_path = item.relative_to(learn_media_path)
                    dest_path = self.output_dir / 'imagelib' / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_path)
                    print(f"Copied learn-media: {relative_path} -> imagelib/{relative_path}")

    def generate_site(self):
        """Generate the complete static site; exits with status 1 on portability errors."""
        print(f"Generating static site from {self.base_url}")
        print(f"Output directory: {self.output_dir.absolute()}")
        print(f"Target domain: {self.domain}")
        start_time = time.time()
        # Copy all static assets first so URL rewriting can find local files.
        self.copy_all_static_assets()
        # Crawl the different types of content.
        self.crawl_static_pages()
        self.crawl_lesson_pages()
        self.crawl_json_apis()
        # Report results.
        end_time = time.time()
        print(f"\n--- Generation Results ---")
        print(f"Total pages crawled: {len(self.crawled_urls)}")
        print(f"Time taken: {end_time - start_time:.2f} seconds")
        print(f"Media files found: {len(self.media_files)}")
        if self.failed_urls:
            print(f"\nFailed URLs ({len(self.failed_urls)}):")
            for url, error in self.failed_urls:
                print(f"  - {url}: {error}")
        # Check for off-site dependencies and missing local assets.
        has_errors = False
        if self.off_site_dependencies:
            print(f"\n❌ Off-site dependencies found ({len(self.off_site_dependencies)}):")
            for dependency in self.off_site_dependencies:
                print(f"  - {dependency}")
            has_errors = True
        if self.missing_local_assets:
            print(f"\n❌ Missing local assets ({len(self.missing_local_assets)}):")
            for missing in self.missing_local_assets:
                print(f"  - {missing}")
            has_errors = True
        if has_errors:
            print(f"\n❌ GENERATION FAILED ❌")
            print("The static site cannot be fully portable with these issues.")
            print("Please fix these dependencies before generating a static site.")
            sys.exit(1)
        else:
            print(f"\n✅ SUCCESS: No off-site dependencies found!")
            print(f"Static site is fully portable and ready for deployment.")
            print(f"\nStatic site generated in: {self.output_dir.absolute()}")
def main():
    """Parse CLI arguments and run the static site generation."""
    parser = argparse.ArgumentParser(description='Generate portable static site from learncms')
    parser.add_argument('--url', '-u',
                        default='https://learn.knightlab.com',
                        help='Base URL of the live site (default: https://learn.knightlab.com)')
    parser.add_argument('--output', '-o',
                        default='./static_site',
                        help='Output directory for static files (default: ./static_site)')
    parser.add_argument('--timeout', '-t',
                        type=int, default=30,
                        help='Request timeout in seconds (default: 30)')
    args = parser.parse_args()
    # Generate the static site.
    # NOTE(review): --timeout is parsed but not forwarded because the
    # generator's constructor does not currently accept it; wire it through
    # once StaticSiteGenerator grows a timeout parameter.
    generator = StaticSiteGenerator(args.url, args.output)
    generator.generate_site()


if __name__ == '__main__':
    main()