diff --git a/docs/archives/outputs/themis/2025-10-25-course-full-gen.json b/docs/archives/outputs/themis/2025-10-25-course-full-gen.json new file mode 100644 index 0000000..1704de2 --- /dev/null +++ b/docs/archives/outputs/themis/2025-10-25-course-full-gen.json @@ -0,0 +1,112 @@ +{ + "id": "75518c81-52aa-4a48-8436-3ccc90b36470", + "title": "AI Engineering Apprenticeship", + "description": "A one year course for currently employed developers that want to up-skill to use AI-enhanced workflows and build products that use AI with frontier techniques. Whilst the focus is on AI Engineering (as opposed to ML Engineering), some ML techniques are taught alongside these skills.", + "logistics": { + "totalWeeks": 5, + "daysPerWeek": 1, + "startDate": "2026-01-14" + }, + "learners": { + "cohortSize": 20, + "teamBased": true, + "prerequisites": "1. All participants are currently employed software developers that are released for a day of training by their employers\n2. All have good software development skills but few AI Engineering skills\n3. 
They will probably have little freedom of choice in their tech stacks and need to be able to apply techniques to their company's stack", + "experience": { + "prereq": ">= 4 years", + "focus": "limited experience" + }, + "teamSize": 4 + }, + "structure": "facilitated", + "arcs": [ + { + "id": "2db84cf6-a1f3-4794-8aeb-53faefe3ad95", + "order": 1, + "title": "Arc 1", + "description": "Describe the broad thematic focus of this arc", + "theme": "Help me decide", + "durationWeeks": 3, + "modules": [ + { + "id": "78a01c43-ad7f-4c36-8783-b513d01d6fbb", + "arcId": "2db84cf6-a1f3-4794-8aeb-53faefe3ad95", + "order": 1, + "title": "Module 1", + "description": "Help me decide", + "durationWeeks": 1, + "status": "complete", + "moduleData": { + "xmlContent": "\n\n \n \n AI-Generated\n claude-sonnet-4-5-20250929\n \n projects.xml\n skills.xml\n research.xml\n \n \n \n \n Created module description focused on foundational AI engineering skills for Module 1\n As the first module in an AI Engineering Apprenticeship, this needs to establish core competencies in prompt engineering, LLM interaction, and building simple AI applications. Research shows that AI engineering in 2025 emphasizes practical application of foundation models rather than training from scratch, making this an appropriate starting point.\n \n Chip Huyen's AI Engineering book defines AI engineering as focusing on building applications on top of foundation models, involving prompt engineering, context construction, and parameter-efficient finetuning\n Industry expert confirms AI engineering is about applying and deploying pre-trained models, not just machine learning theory\n \n \n \n Developed three core learning objectives focused on prompt engineering, LLM APIs, and building first AI applications\n Based on 2025 industry research, these represent the foundational skills for AI engineers. 
Prompt engineering remains critical despite model improvements, API interaction is standardized around OpenAI-compatible interfaces, and hands-on application building is essential for practical learning.\n \n The Prompt Report (2025) identifies 58 LLM prompting techniques and emphasizes prompt engineering as a fundamental discipline\n OpenAI-compatible APIs have become the industry standard in 2025, making API interaction skills transferable across providers\n Industry analysis confirms that learning to leverage AI APIs and prompt engineering are core skills for 2025 AI roles\n \n \n \n Created 7 primary research topics covering prompt engineering fundamentals, LLM capabilities, API integration, model parameters, and ethical considerations\n These topics reflect current best practices and emerging trends in AI engineering. Topics include both technical skills (APIs, parameters) and critical thinking skills (limitations, ethics) necessary for responsible AI development in 2025.\n \n OpenAI's official prompt engineering guide emphasizes iterative refinement and understanding model parameters like temperature\n 2025 prompt engineering best practices include clear instructions, context provision, and iterative refinement\n LLM APIs operate on request-response models with structured requests containing prompts and configuration parameters\n \n \n \n Designed two project briefs: AI-Powered Writing Assistant and Intelligent CLI Tool, both appropriate for beginners\n These projects allow learners to practice core skills while building useful tools. Writing assistants and CLI tools are common first AI projects that teach prompt engineering, API integration, and handling non-deterministic outputs. 
Both can be completed within a 1-week timeframe while demonstrating practical value.\n \n Collection of LLM apps shows writing assistants and CLI tools as common beginner-friendly applications\n First LLM projects typically focus on text generation with clear user inputs and outputs\n \n \n \n Provided diverse, substantially different examples for each project brief (3+ per brief)\n Examples span different domains (creative writing, technical writing, development tools, productivity) to help learners see the breadth of applications possible with foundational AI skills. Each example represents a meaningfully different use case.\n \n LLM applications span content creation, code generation, language translation, and sentiment analysis\n \n \n \n Created 3 conceptual twists that reframe problem spaces rather than adding technical features\n Following the twist guidelines, these are conceptual curveballs that change how learners think about AI applications. \"The Unreliable Narrator\" explores multiple perspectives, \"The Minimalist\" challenges verbosity assumptions, and \"The Time Traveler\" adds temporal context awareness. Each twist is implementation-agnostic and philosophically interesting.\n \n \n \n Organized skills into 4 categories: Python Development, API Integration, Development Workflow, and AI Engineering Fundamentals\n These skill categories reflect the technical foundation needed for AI engineering in 2025. 
Python remains the dominant language, API skills are essential for working with LLMs, version control is a baseline expectation, and understanding AI-specific concepts like tokens and temperature is critical for effective prompt engineering.\n \n Python proficiency is essential for AI engineers, as it's the most popular language with extensive libraries for AI development\n Understanding tokenization, context management, and API parameters is fundamental to LLM application development\n \n \n \n Added 3 stretch topics for advanced learners: cost optimization, local LLM deployment, and multi-modal capabilities\n These topics extend beyond Module 1 fundamentals but are relevant to learners who progress quickly. Cost optimization is increasingly important as applications scale, local deployment addresses privacy concerns, and multi-modal AI represents the direction of the field in 2025.\n \n Local LLM deployment is a viable, cost-effective, and privacy-preserving solution in 2025\n Cost optimization and provider selection are critical considerations for production AI applications\n \n \n \n \n \n \n
Projects/Briefs - Project scope should be validated against actual 1-week timeline with cohort of 20 learners
\n
ResearchTopics/StretchTopics - Stretch topics may need adjustment based on learner progression through primary topics
\n
\n
\n
\n\n This foundational module introduces you to AI Engineering through hands-on exploration of Large Language Models (LLMs). You'll learn to craft effective prompts, interact with LLM APIs, and build your first AI-powered applications. By the end of this week, you'll understand how to harness the power of foundation models to create practical tools that solve real problems, while developing a critical awareness of their capabilities and limitations.\n\n \n \n You'll be able to design, test, and iteratively refine prompts that reliably produce desired outputs from LLMs. This includes understanding prompt structure (instructions, context, examples), applying techniques like few-shot learning and chain-of-thought prompting, and recognizing when to adjust parameters like temperature and max tokens. You'll develop an intuition for how different phrasings influence model behavior and learn to debug prompts systematically when outputs don't meet expectations.\n \n \n You'll gain practical experience integrating LLM APIs into applications, understanding the request-response cycle, authentication mechanisms, and error handling strategies. You'll be comfortable working with OpenAI-compatible APIs (which most providers now support), managing API keys securely, handling rate limits, and processing both synchronous and streaming responses. You'll understand the cost implications of API calls and how to make informed decisions about token usage.\n \n \n You'll be able to conceptualize, design, and implement simple AI-powered applications that combine traditional programming logic with LLM capabilities. This includes structuring user interactions, handling the non-deterministic nature of LLM outputs, implementing basic validation and error handling, and creating interfaces (CLI or simple web) that make AI functionality accessible. 
You'll understand when AI adds value versus when traditional code is more appropriate, and how to combine both effectively.\n \n \n\n \n \n \n Explore the art and science of communicating effectively with LLMs. Start with OpenAI's official prompt engineering guide and the 2025 Prompt Report (arxiv.org/abs/2406.06608) to understand the taxonomy of techniques. Research zero-shot, few-shot, and chain-of-thought prompting. Investigate how prompt structure (role assignment, context provision, output format specification) affects results.\n \n Learn systematic approaches to improving prompts. Research how to test prompts across different inputs, identify failure modes, and refine instructions. Look into prompt versioning and documentation practices. If multiple people tackle this, one could focus on debugging techniques while another explores prompt testing frameworks and evaluation methods.\n \n \n Investigate sophisticated techniques like meta-prompting (using AI to improve prompts), prompt chaining (breaking complex tasks into steps), and constrained generation (forcing specific output formats). Research when each technique is appropriate and their trade-offs in terms of complexity, reliability, and token cost.\n \n \n \n Develop a realistic understanding of what current LLMs can and cannot do reliably. Research common failure modes: hallucinations, inconsistency, context window limitations, and knowledge cutoff dates. Investigate how different model sizes and architectures (GPT-4, Claude, Gemini, open-source models) compare in capabilities. Study the difference between reasoning models and standard models.\n \n Understand why LLMs produce different outputs for identical prompts and how to manage this in applications. Research temperature and top-p parameters, when to use deterministic settings (temperature=0), and strategies for ensuring consistent outputs when needed. 
Explore how to test applications that use stochastic components.\n \n \n \n Master the practical skills of API integration. Start with OpenAI's API documentation, then explore how other providers (Anthropic, Google, open-source via providers like Together.ai) use similar interfaces. Research authentication methods, request structure (messages array, system prompts, parameters), and response parsing. Understand the OpenAI-compatible standard that most providers now support.\n \n Deep dive into parameters that control model behavior: temperature, max_tokens, top_p, frequency_penalty, presence_penalty, and stop sequences. Research what each parameter does, when to adjust them, and how they interact. Create a reference guide with examples of parameter settings for different use cases (creative writing vs. structured data extraction).\n \n \n Investigate strategies for robust API integration: handling rate limits, implementing exponential backoff, managing timeouts, and gracefully handling API errors. Research monitoring and logging best practices. Look into fallback strategies when API calls fail and how to communicate errors to users effectively.\n \n \n \n Understand how LLMs tokenize text and why this matters for both cost and functionality. Research tokenization (how text is split into tokens, why \"strawberry\" has different token counts than \"hello\"), context window limits, and pricing models across providers. Learn to estimate costs before building and optimize prompts for token efficiency. Explore tools like tiktoken for counting tokens in different models.\n \n \n Research best practices for AI application development environments. Investigate how to manage API keys securely (environment variables, .env files, secrets management), structure projects for AI applications, and set up debugging workflows. Look into tools like LangSmith or other observability platforms for tracking LLM calls during development. 
Explore local development with free tiers and testing strategies that minimize API costs.\n \n \n Investigate responsible AI development practices. Research prompt injection attacks and how to defend against them, content filtering and moderation approaches, bias in LLM outputs, and privacy implications of sending user data to APIs. Study guidelines from AI providers about acceptable use cases and how to implement safety measures in applications. Consider how to communicate AI limitations to users transparently.\n \n \n Research the landscape of LLM providers in 2025. Compare OpenAI, Anthropic, Google, and open-source options on dimensions like cost, performance, context window size, and specific capabilities. Investigate the trend toward OpenAI-compatible APIs and what this means for portability. Look into when to use cloud APIs versus local models, and how to structure code to make switching providers easy.\n \n \n \n \n Investigate advanced techniques for reducing API costs while maintaining quality: prompt compression, caching strategies, using smaller models for simple tasks, batch processing, and implementing smart retry logic. Research when to use async/batch APIs that offer discounted rates.\n \n \n Explore running LLMs locally using tools like Ollama, llama.cpp, or LM Studio. Research the trade-offs between local and cloud deployment: privacy, cost, latency, and model quality. Understand quantization and how it enables running large models on consumer hardware.\n \n \n Investigate LLMs that can process images, audio, or other modalities alongside text. Research vision-language models and how to structure prompts that include images. 
Explore use cases where multi-modal AI adds value beyond text-only interactions.\n \n \n \n\n \n \n \n Build a command-line or simple web application that helps users improve their writing by providing suggestions, rewrites, or alternative phrasings based on their goals.\n Prompt engineering for writing tasks, handling user input and preferences, managing conversation context, and presenting AI-generated suggestions in a useful format. This project emphasizes iterative prompt refinement and understanding how to guide LLMs toward specific writing styles or goals.\n \n • Accepts user text input and a goal/instruction (e.g., \"make this more professional\" or \"simplify for a general audience\")\n • Uses well-crafted prompts to generate relevant writing suggestions\n • Provides multiple alternatives or iterative improvements\n • Handles edge cases gracefully (very short input, unclear instructions, API errors)\n • Includes clear user instructions and examples of what it can do\n • Demonstrates understanding of prompt engineering through documented prompt iterations\n • Manages API costs by being thoughtful about token usage\n \n \n \n • Craft prompts that specify writing style, tone, and audience clearly\n • Use few-shot examples to demonstrate desired transformations\n • Include constraints to prevent unwanted changes (e.g., \"preserve the main points\")\n • Test prompts with diverse writing samples to ensure consistency\n • Document what makes prompts effective or ineffective through iteration notes\n \n \n • Decide when to include original text, previous suggestions, or user feedback in prompts\n • Understand token limits and how much context you can reasonably include\n • Structure multi-turn interactions where users can refine suggestions\n • Balance context richness with API cost and response time\n \n \n • Present AI suggestions in a way that makes them easy to compare and evaluate\n • Provide clear feedback about what the AI is doing (\"Analyzing your 
text...\" \"Generating suggestions...\")\n • Handle the delay of API calls gracefully with appropriate loading states\n • Allow users to easily accept, reject, or request alternative suggestions\n • Consider how to show users what instructions they can give\n \n \n • Implement checks to ensure AI output meets basic quality standards\n • Detect and handle cases where the AI misunderstood the task\n • Provide fallback responses when API calls fail\n • Consider how to validate that suggestions actually improve the writing\n \n \n \n \n A tool that takes casual email drafts and transforms them into professional business correspondence. Users can specify the level of formality, and the tool suggests improvements to tone, structure, and word choice while preserving the core message.\n \n \n An application that rewrites technical or academic text for general audiences. It identifies jargon, complex sentences, and unclear explanations, then provides simplified alternatives that maintain accuracy while improving accessibility.\n \n \n A tool for fiction writers that analyzes passages and suggests improvements for specific elements: dialogue naturalness, descriptive language, pacing, or showing vs. telling. Users can focus on particular aspects of their craft.\n \n \n An application designed for non-native English speakers that identifies grammatical errors, awkward phrasings, and suggests more natural alternatives. It explains why suggestions improve the text, serving as a learning tool.\n \n \n \n \n Create a command-line tool that uses AI to perform a specific task more intelligently than traditional scripts could, demonstrating how LLMs can enhance developer productivity.\n Integrating AI into developer workflows, handling structured input/output, combining traditional programming logic with AI capabilities, and creating tools that are genuinely useful in daily work. 
This project emphasizes practical application and understanding when AI adds value.\n \n • Solves a real problem that developers encounter regularly\n • Combines traditional code logic with LLM capabilities effectively\n • Provides clear command-line interface with helpful documentation\n • Handles errors and edge cases gracefully\n • Produces output in a format that's immediately useful (not just raw AI responses)\n • Demonstrates understanding of when to use AI vs. traditional code\n • Includes examples and usage documentation\n • Is fast enough for regular use (considers API latency)\n \n \n \n • Use libraries like argparse or click to create intuitive command-line interfaces\n • Handle file input/output when needed\n • Provide helpful error messages and usage instructions\n • Structure code for maintainability and testing\n • Consider how to make the tool easy to install and run\n \n \n • Identify which parts of a task benefit from AI vs. deterministic code\n • Use traditional code for parsing, validation, and formatting\n • Use AI for tasks requiring understanding, generation, or judgment\n • Structure applications so AI components can be easily tested and replaced\n • Understand the cost/benefit trade-off of using AI for different subtasks\n \n \n • Design prompts that produce consistently formatted output (JSON, markdown, specific patterns)\n • Parse AI responses reliably even when format varies slightly\n • Validate that AI output matches expected structure before using it\n • Handle cases where AI doesn't follow format instructions\n • Consider using JSON mode or similar features when available\n \n \n • Make tools that integrate smoothly into existing workflows\n • Provide sensible defaults while allowing customization\n • Consider how output will be used (piped to other tools, copied, saved)\n • Think about performance and whether caching could help\n • Document the tool so others can use it without reading the code\n \n \n \n \n A tool that analyzes 
staged changes using git diff and generates meaningful commit messages following conventional commit format. It understands code changes and describes them clearly, saving developers time while improving commit history quality.\n \n \n An application that analyzes a codebase (file structure, main files, dependencies) and generates a comprehensive README with installation instructions, usage examples, and project description. It asks clarifying questions to fill in details it can't infer.\n \n \n A CLI tool that reviews code changes and provides feedback on potential issues, style improvements, and suggestions. It focuses on specific aspects like error handling, naming, or documentation based on user preferences.\n \n \n A tool that reads a function or class and generates unit test cases covering common scenarios, edge cases, and error conditions. It outputs tests in the appropriate format for the project's testing framework.\n \n \n An application that reads application logs and summarizes errors, identifies patterns, and suggests potential causes for issues. 
It can answer questions like \"why did this service crash?\" by analyzing log context.\n \n \n \n \n\n \n \n Build an application that intentionally generates multiple conflicting perspectives or interpretations of the same input, forcing users to think critically about AI outputs rather than accepting them as authoritative.\n \n A writing assistant that provides three different editorial perspectives (e.g., a harsh critic, an encouraging mentor, a technical editor) on the same text, each with valid but contradictory advice\n A code review tool that presents multiple valid but different approaches to solving the same problem, highlighting trade-offs rather than declaring one \"correct\" answer\n A documentation analyzer that generates explanations from different assumed expertise levels, showing how interpretation changes based on context\n \n \n \n Create a tool that uses AI to achieve maximum impact with minimum output—challenging the assumption that more AI-generated content is better.\n \n A writing assistant that only suggests deletions, helping users realize what they don't need to say\n A commit message generator that produces the shortest possible message that still conveys all essential information\n A summarizer that progressively compresses text to its absolute essence, showing what survives at each compression level\n \n \n \n Build an application that explicitly accounts for temporal context—when information was created, how it might age, or how perspectives change over time.\n \n A writing assistant that flags statements that might become dated and suggests more timeless phrasings\n A documentation tool that annotates code comments with \"freshness dates\" indicating when they might need review\n A CLI tool that provides different outputs based on how old the input data is, acknowledging that advice for fresh code differs from advice for legacy systems\n \n \n \n \n\n \n \n Python is the primary language for AI engineering due to its extensive 
ecosystem of AI/ML libraries and simple syntax. While you don't need to be a Python expert, you should be comfortable with core language features and common libraries for building applications.\n \n Variables, data types, control flow, functions, and basic object-oriented programming. Understanding of Python's common data structures (lists, dictionaries, sets) and how to work with them effectively.\n \n \n Using pip for package management, understanding virtual environments, reading library documentation, and integrating third-party packages into projects. Specific libraries: requests for API calls, python-dotenv for environment variables.\n \n \n Reading and writing files, string manipulation and formatting, working with JSON data, and handling different text encodings. These skills are essential for processing inputs and outputs in AI applications.\n \n \n Using try/except blocks appropriately, understanding common exception types, debugging with print statements or debuggers, and writing code that fails gracefully with helpful error messages.\n \n \n Using libraries like argparse or click to create professional CLI tools with flags, options, and subcommands. Understanding how to provide help text and handle user input validation.\n \n \n\n \n Working with LLM APIs is central to AI engineering. You need to understand HTTP requests, authentication, and how to interact with RESTful APIs programmatically. Most LLM providers use similar patterns, so these skills transfer across platforms.\n \n Understanding HTTP methods (GET, POST), request/response structure, headers, and status codes. Using the requests library or similar tools to make API calls programmatically.\n \n \n Managing API keys securely using environment variables, understanding authentication headers, and keeping credentials out of version control. 
Using .env files and python-dotenv for local development.\n \n \n Parsing JSON responses, constructing JSON request bodies, handling nested data structures, and converting between JSON and Python objects. Understanding how to extract specific fields from complex JSON structures.\n \n \n Understanding async/await in Python, making concurrent API calls, and handling multiple requests efficiently. This becomes important for applications that make many API calls or need responsive UIs.\n \n \n Implementing exponential backoff, handling rate limit errors gracefully, and designing applications that work within API quota constraints. Understanding when to retry vs. fail fast.\n \n \n\n \n Professional development practices help you build maintainable, shareable projects and collaborate effectively. These skills are especially important in a peer-led learning environment where you'll be reviewing each other's code.\n \n Basic git operations (commit, push, pull, branch), understanding when to commit and how to write meaningful commit messages, and using GitHub or similar platforms for sharing code. Understanding .gitignore for excluding sensitive files.\n \n \n Organizing code into logical modules, separating concerns (API logic, user interface, business logic), and structuring projects so others can understand them. Writing README files with setup instructions and usage examples.\n \n \n Using virtual environments (venv, conda) to isolate project dependencies, creating requirements.txt files, and documenting setup steps. Understanding why dependency management matters for reproducibility.\n \n \n Writing basic tests for non-AI components, validating inputs and outputs, and thinking about edge cases. Understanding that AI components require different testing approaches than traditional code.\n \n \n Writing clear code comments, documenting functions and their parameters, creating usage examples, and explaining design decisions. 
For AI applications, documenting prompt evolution and why certain approaches work is especially valuable.\n \n \n\n \n These concepts are specific to working with AI and LLMs. Understanding them helps you make better decisions about how to design prompts, manage costs, and debug issues when AI outputs don't meet expectations.\n \n Understanding how text is converted to tokens, why token count matters for both cost and context limits, and how to estimate tokens for different text lengths. Awareness that different models tokenize differently.\n \n \n Understanding what temperature, max_tokens, top_p, and other parameters do, when to adjust them, and how they interact. Being able to choose appropriate settings for different use cases (creative vs. deterministic tasks).\n \n \n Knowing how to structure effective prompts with clear instructions, appropriate context, and examples when needed. Understanding iterative refinement and how to systematically improve prompts. Recognizing common prompt patterns and when to use them.\n \n \n Understanding context window limits, how conversation history affects token usage, and strategies for working within constraints (summarization, selective context, chunking). Knowing when context limits will be a problem.\n \n \n Recognizing that AI outputs need validation, implementing checks for format and content, handling unexpected outputs gracefully, and knowing when to retry vs. accept imperfect results. Understanding that AI is non-deterministic.\n \n \n Calculating approximate costs for different usage patterns, identifying opportunities to reduce token usage without sacrificing quality, and making informed trade-offs between model capability and cost.\n \n \n \n\n \n For learners with limited Python experience: Focus on getting comfortable with basics first. You don't need to be a Python expert to build AI applications—many successful AI engineers learned Python specifically for this purpose. 
The official Python tutorial and resources like \"Automate the Boring Stuff with Python\" are great starting points.\n On choosing LLM providers: Most providers now support OpenAI-compatible APIs, meaning code written for one provider often works with others by just changing the base URL. Start with whichever provider offers the best free tier for learning (OpenAI, Anthropic, Google, or GitHub Models all have generous free tiers as of 2025). You can always switch later.\n About API costs during learning: Free tiers are usually sufficient for learning and small projects. To minimize costs: use cheaper models for experimentation (GPT-4o-mini instead of GPT-4), set max_tokens limits, and cache responses during development. Consider using local models via Ollama for unlimited free experimentation, though they're less capable than cloud models.\n On project scope: These projects are designed to be completable in a week while teaching core concepts. If you finish early, use the twists to explore creative variations. If you're struggling with scope, focus on core functionality first—a simple version that works is better than a complex version that doesn't. You can always enhance it later.\n Regarding the peer-led format: Take advantage of having peers working on similar problems. Share your prompts and discuss what works and what doesn't—prompt engineering improves faster with diverse perspectives. Code review is especially valuable for AI applications since there are often multiple valid approaches.\n \n
", + "generatedAt": "2025-10-24T22:06:27.951Z" + } + }, + { + "id": "0f87f3f5-179a-4f88-b3cf-f6f8e48ce5be", + "arcId": "2db84cf6-a1f3-4794-8aeb-53faefe3ad95", + "order": 2, + "title": "Module 2", + "description": "Help me decide", + "durationWeeks": 1, + "status": "complete", + "moduleData": { + "xmlContent": "\n\n \n \n AI-Generated\n claude-sonnet-4-5-20250929\n \n projects.xml\n skills.xml\n research.xml\n \n \n \n \n Created module description focusing on building production-grade LLM applications with proper engineering practices\n Based on course progression narrative, Module 2 should bridge foundational concepts from Module 1 to practical application development. Research shows that in 2025, AI engineering emphasizes building reliable, production-ready systems rather than just prototypes. The module focuses on LLM application architecture, prompt engineering techniques, context management, and basic agent patterns - essential skills before advancing to RAG systems in later modules.\n \n Agentic Design Patterns - emphasizes importance of building systems with proper architecture\n Anthropic's guidance on building effective AI agents with simple, composable patterns\n Step-by-step guide emphasizing iterative development and production considerations for LLM apps\n \n \n \n Defined four learning objectives covering LLM application development, prompt engineering, context management, and debugging\n These objectives align with 2025 industry standards where AI engineers need to understand both the technical implementation and the unique challenges of working with LLMs. Research indicates that successful AI engineering requires skills in prompt optimization, managing non-deterministic outputs, cost tracking, and debugging - all reflected in these objectives. 
The objectives build progressively from basic application structure to advanced techniques.\n \n Emphasizes LLM-native development requires hybrid skills including software engineering and research\n Microsoft's comprehensive prompt engineering techniques documentation\n The Prompt Report presenting taxonomy of 58 LLM prompting techniques\n \n \n \n Created 6 primary research topics covering LLM APIs, prompt engineering, context management, agent patterns, cost optimization, and observability\n These topics reflect current best practices in 2025 AI engineering. Research shows that production LLM applications require understanding of API integration, advanced prompting techniques (zero-shot, few-shot, chain-of-thought), context window management, basic agentic patterns, cost tracking, and debugging tools. Each topic includes specific guidance for learners to research effectively, with subdivisions to support collaborative learning in peer-led format.\n \n Comprehensive taxonomy of 58 prompting techniques from The Prompt Report\n Detailed strategies for context management in LLM applications\n Anthropic's patterns for agentic systems including workflows and autonomous agents\n \n \n \n Designed two project briefs: AI-Powered CLI Tool and Conversational Task Assistant\n These projects provide hands-on experience with core LLM application patterns. The CLI tool teaches structured interaction with LLMs, tool use, and output parsing - fundamental skills for any AI application. The conversational assistant introduces context management and multi-turn interactions. Both projects are scoped appropriately for a 1-week module with limited-experience learners (1-3 years), focusing on practical implementation rather than complex architectures. 
Examples are diverse and relevant to real-world use cases developers would encounter.\n \n Guidance on starting lean with LLM applications and iterative development\n Practical examples of using LLMs for code generation and tool building\n Best practices for moving from prototype to production\n \n \n \n Created 3 conceptual twists: The Socratic Debugger, The Unreliable Narrator, and The Time Traveler\n These twists follow the guideline of being conceptual curveballs that reframe the problem space rather than adding technical features. Each twist challenges learners to think differently about how AI applications interact with users: the Socratic Debugger teaches through questions rather than answers, the Unreliable Narrator generates multiple perspectives on the same information, and the Time Traveler adapts its communication style to different time periods. These encourage creative thinking about AI interaction design while remaining implementation-agnostic.\n \n Techniques for crafting prompts that elicit different behaviors and personas\n \n \n \n Defined skill categories for Python, LLM APIs, Development Tools, and Software Engineering Practices\n These skills align with the technical requirements for building LLM applications in 2025. Python remains the dominant language for AI development, with specific emphasis on async programming and environment management. Understanding LLM API concepts (tokens, temperature, streaming) is essential. Development tools like version control and debugging are fundamental. Most skills are marked as \"Recommended\" rather than \"Essential\" to avoid overwhelming learners, following best practices for peer-led learning where learners can support each other in acquiring skills.\n \n Python continues to dominate as the go-to language for AI development\n Emphasis on cost tracking, debuggability, and proper development practices\n \n \n \n \n \n \n
Projects/Briefs - Examples should be validated for relevance to target cohort
\n
Projects/Twists - Twists should be tested with learners to ensure they're appropriately challenging
\n
\n
\n
\n\n This module focuses on building your first production-grade LLM applications. You'll learn to architect reliable AI-powered tools, master prompt engineering techniques that actually work in production, manage context and conversation history effectively, and debug non-deterministic systems. By the end of this module, you'll have built working applications that integrate LLMs into real development workflows, understand the unique challenges of AI engineering (cost, latency, reliability), and have practical experience with the patterns that form the foundation of more complex AI systems.\n\n \n \n You'll understand how to structure LLM-powered applications from API integration through to user-facing functionality. This includes choosing appropriate orchestration patterns (single-call vs. multi-step workflows), managing state and context across interactions, implementing proper error handling for non-deterministic outputs, and designing applications that balance capability with cost and latency constraints. You'll be able to make informed architectural decisions about when to use simple prompt-response patterns versus more complex agentic workflows.\n \n \n You'll master practical prompt engineering techniques that go beyond basic prompting. This includes crafting effective system prompts, implementing few-shot learning for consistent outputs, using chain-of-thought prompting for complex reasoning tasks, structuring prompts for reliable output formatting, and iteratively testing and refining prompts based on real results. You'll understand the difference between prompts that work in demos and prompts that work in production.\n \n \n You'll learn to manage the LLM's context window as a precious resource. 
This includes strategies for maintaining conversation history without exceeding token limits, implementing summarization and compression techniques, deciding what context to include and exclude, managing multi-turn conversations effectively, and understanding the tradeoffs between context size, cost, and response quality. You'll be able to build applications that maintain coherent interactions over extended sessions.\n \n \n You'll develop skills for debugging AI systems that fail silently and non-deterministically. This includes implementing logging and tracing for LLM calls, tracking token usage and costs, testing prompts systematically, identifying when failures are due to prompts vs. model limitations, and using observability tools to understand what's happening inside your application. You'll understand that \"AI fails silently\" and know how to catch failures before they reach users.\n \n \n\n \n \n \n Understanding how to effectively integrate LLM APIs into applications is foundational. Research the major LLM providers (OpenAI, Anthropic, Google, open-source models via Hugging Face), their API structures, authentication patterns, and rate limiting. Look into how to handle API responses, manage streaming vs. batch processing, and implement proper error handling for API failures.\n \n Investigate the official SDKs for major LLM providers (openai, anthropic, google-generativeai Python packages). Compare their features, ease of use, and how they handle common patterns like retries and streaming. One learner could focus on OpenAI's SDK, another on Anthropic's, and compare findings. Look into whether to use official SDKs or make raw HTTP requests.\n \n \n Research patterns for making multiple LLM API calls efficiently. Look into Python's asyncio for concurrent requests, when to batch vs. stream, and how to manage rate limits across concurrent calls. Investigate libraries like aiohttp and httpx for async HTTP requests. 
One learner could implement a comparison of sync vs. async patterns.\n \n \n Study how to track token usage across API calls, estimate costs before making requests, and implement budgets or limits. Research tokenization (how text is converted to tokens), how different models price tokens differently, and tools for counting tokens (tiktoken). Create a simple cost calculator as part of your research.\n \n \n\n \n Prompt engineering in 2025 is a systematic discipline with established patterns. Research the taxonomy of prompting techniques including zero-shot, few-shot, and chain-of-thought prompting. Investigate how to structure system prompts vs. user prompts, use delimiters and formatting for reliable parsing, and implement role-based prompting. Look into recent research on prompt optimization and what makes prompts reliable in production.\n \n Research how to select and format examples for few-shot prompting. Investigate how many examples are optimal, how to structure them, and whether order matters. Look into techniques like semantic similarity for dynamic example selection. Test different few-shot patterns and document what works best for different task types.\n \n \n Study techniques that improve LLM reasoning: chain-of-thought (asking the model to think step-by-step), tree-of-thought, and self-consistency. Research when these techniques are worth the extra tokens/cost. Investigate ReAct pattern (reasoning + acting) and how it applies to tool-using agents. Create examples showing reasoning improvements.\n \n \n Investigate techniques for getting reliable structured output from LLMs: JSON formatting, using delimiters, function calling/tool use APIs, and constrained generation. Research how to validate and parse LLM outputs, handle malformed responses, and retry with corrections. 
Look into newer features like JSON mode in various providers.\n \n \n\n \n Context windows are limited (typically 4k-128k tokens depending on model), and managing them effectively is crucial for multi-turn applications. Research strategies for context compression (summarization, truncation), maintaining conversation history, implementing memory systems (short-term vs. long-term), and deciding what context to include. Look into sliding window approaches, vector-based memory retrieval, and context caching techniques.\n \n Research patterns for managing chat history: sliding windows (keeping only recent N messages), summarization (compressing older messages), and selective retention (keeping important messages). Investigate how to detect when context is getting too long and implement automatic compression. Compare different strategies with code examples.\n \n \n Study how to decide what context to include when space is limited. Research relevance scoring, semantic search for context selection, and priority-based inclusion. Look into techniques like context re-ranking and query-aware context assembly. Investigate how frameworks like LangChain handle context selection.\n \n \n\n \n Agents extend LLMs with the ability to use tools and take actions. Research basic agent patterns: ReAct (reason + act), tool use/function calling, and simple orchestration workflows. Understand the difference between workflows (predefined paths) and agents (dynamic decision-making). Look into when to use agents vs. simpler patterns, and how to implement tool calling with major LLM providers.\n \n Investigate how different LLM providers implement function/tool calling. Research how to define tools, parse tool calls from model outputs, execute tools safely, and return results to the model. Look into OpenAI's function calling, Anthropic's tool use, and generic implementations. 
Create examples of useful tools (web search, calculator, file operations).\n \n \n Study the ReAct pattern in depth: how it combines reasoning and action, the observation-thought-action loop, and when to stop iterating. Research implementations in frameworks vs. building from scratch. Investigate error handling and infinite loop prevention. Build a simple ReAct agent as part of your research.\n \n \n\n \n LLM API calls can be expensive and slow. Research strategies for optimizing costs and performance: model selection (smaller/cheaper models for simple tasks), prompt compression, caching responses, batching requests, and streaming for perceived performance. Investigate monitoring and alerting for cost overruns. Look into techniques like prompt caching (Anthropic) and cached contexts.\n \n Research how to choose the right model for each task. Compare capabilities and costs of different models (GPT-4 vs. GPT-3.5, Claude Opus vs. Sonnet). Investigate routing patterns: using cheaper models for simple tasks and expensive models only when needed. Look into cascading (trying cheap model first, falling back to expensive) and classification-based routing.\n \n \n Study caching strategies for LLM applications: response caching (storing results for identical inputs), semantic caching (similar inputs), and prompt caching (provider-level features). Research when caching is appropriate and how to implement cache invalidation. Look into tools like Redis for caching and vector databases for semantic caching.\n \n \n\n \n Debugging AI applications requires different tools than traditional software. Research observability platforms for LLM applications: LangSmith, Weights & Biases, Helicone, and others. Investigate how to trace requests through multi-step workflows, log prompts and responses, track token usage, and identify failure patterns. 
Look into testing strategies for non-deterministic systems and techniques for evaluating prompt quality systematically.\n \n Research how to implement comprehensive logging for LLM applications. Investigate what to log (prompts, responses, tokens, latency, costs), structured logging formats, and tools for log analysis. Look into distributed tracing for multi-step workflows. Study privacy considerations when logging user data and LLM interactions.\n \n \n Study how to test LLM applications systematically. Research evaluation metrics (accuracy, relevance, consistency), creating test datasets, and automated evaluation using LLMs as judges. Investigate regression testing for prompts (ensuring changes don't break existing functionality). Look into tools like promptfoo and Braintrust for LLM testing.\n \n \n \n\n \n \n \n \n \n \n \n \n \n\n \n \n \n Build a command-line tool that uses an LLM to perform a useful task, with proper error handling, cost tracking, and structured output parsing.\n LLM API integration, prompt engineering for reliable outputs, output parsing and validation, error handling for API failures, and basic tool use patterns. 
This project teaches you to build the fundamental building block of LLM applications: a reliable, single-purpose tool that integrates AI capabilities into a developer workflow.\n \n • Successfully integrates with an LLM API (OpenAI, Anthropic, or similar) with proper authentication and error handling\n • Implements a clear, focused use case that demonstrates practical value\n • Uses structured prompting techniques to get reliable, parseable outputs\n • Tracks and displays token usage and estimated costs for each operation\n • Handles API errors gracefully (rate limits, timeouts, invalid responses)\n • Includes basic testing or validation of outputs\n • Provides helpful CLI interface with clear usage instructions\n • Documents prompt engineering decisions and iterations\n \n \n \n • Choose an LLM provider and set up API authentication (environment variables, API keys)\n • Make basic API calls using official SDK or HTTP requests\n • Handle streaming vs. batch responses appropriately for your use case\n • Implement retry logic for transient failures\n • Parse API responses and extract relevant information\n • Consider using async/await for better performance if making multiple calls\n \n \n • Design system prompts that establish clear roles and constraints\n • Use delimiters or formatting instructions to get parseable output (JSON, markdown, etc.)\n • Implement few-shot examples if needed for consistency\n • Test prompts iteratively and document what works and what doesn't\n • Handle edge cases where model output doesn't match expected format\n • Consider using function calling or JSON mode if available for your provider\n \n \n • Use argparse or click for command-line argument parsing\n • Implement clear help text and usage examples\n • Handle user input validation before making expensive API calls\n • Provide progress indicators for long-running operations\n • Format output clearly for terminal display (consider rich or colorama for formatting)\n • Implement proper 
exit codes and error messages\n \n \n • Use tiktoken or similar to count tokens before API calls\n • Calculate and display estimated costs based on model pricing\n • Track cumulative costs across multiple operations in a session\n • Warn users if operations will exceed a cost threshold\n • Log token usage for analysis and optimization\n \n \n \n \n A CLI tool that analyzes staged git changes and generates conventional commit messages. Uses LLM to understand code changes and suggest appropriate commit types (feat, fix, refactor), scope, and descriptions. Implements structured output parsing to format messages correctly and offers multiple suggestions for the user to choose from.\n \n \n A tool that takes code snippets or files as input and provides structured feedback on potential issues, improvements, and best practices. Uses few-shot prompting with examples of good reviews to ensure consistent, helpful output. Formats results as markdown with severity levels and specific suggestions.\n \n \n Analyzes Python functions or classes and generates comprehensive docstrings following a specific format (Google, NumPy, or Sphinx style). Uses structured prompting to ensure consistent formatting and includes parameter descriptions, return values, and usage examples. Validates that generated docstrings are syntactically correct.\n \n \n Takes complex API responses (JSON) and translates them into human-readable summaries or explanations. Useful for debugging or understanding third-party APIs. Uses chain-of-thought prompting to explain the data structure before summarizing, and handles various JSON schemas gracefully.\n \n \n Reads application log files and uses LLM to identify patterns, errors, and potential issues. Implements context window management to handle large log files by chunking. 
Outputs structured analysis with severity levels, affected components, and suggested actions.\n \n \n \n\n \n Build a conversational assistant that maintains context across multiple interactions to help users complete a complex task, implementing proper memory management and conversation history handling.\n Multi-turn conversation management, context window optimization, state management across interactions, conversation history compression, and building natural conversational flows. This project teaches you to handle the unique challenges of stateful AI applications where context accumulates over time.\n \n • Maintains coherent conversation across multiple turns (minimum 10 interactions)\n • Implements effective context management to stay within token limits\n • Manages conversation state and remembers relevant information from earlier in the conversation\n • Uses appropriate memory strategies (sliding window, summarization, or selective retention)\n • Provides clear conversation interface (CLI, simple web UI, or chat interface)\n • Handles context overflow gracefully when conversations get too long\n • Implements conversation reset or new session functionality\n • Tracks and displays cumulative token usage and costs\n • Demonstrates understanding of when to include vs. 
exclude context\n \n \n \n • Store and retrieve conversation history across turns\n • Implement data structures for messages (role, content, timestamp, metadata)\n • Decide on storage mechanism (in-memory, file-based, or database)\n • Handle conversation persistence and loading\n • Implement conversation branching or multiple sessions if needed\n • Consider using frameworks like LangChain's ConversationBufferMemory or building custom\n \n \n • Calculate current context size in tokens before each API call\n • Implement sliding window (keeping only recent N messages)\n • Build summarization system for older messages (using LLM or extractive methods)\n • Prioritize which messages to keep based on relevance or importance\n • Test different context strategies and measure impact on coherence\n • Handle edge cases where single messages exceed context limits\n \n \n • Track task progress and user goals across conversation\n • Extract and store important facts or decisions from conversation\n • Implement \"working memory\" for current task vs. \"long-term memory\" for user preferences\n • Use structured data (dictionaries, objects) to represent state\n • Update state based on user inputs and assistant actions\n • Provide visibility into current state for debugging\n \n \n • Design natural conversation patterns (greeting, clarification, confirmation, completion)\n • Implement turn-taking and context-aware responses\n • Handle ambiguous user inputs with clarifying questions\n • Provide progress indicators and next-step suggestions\n • Design graceful error recovery when assistant doesn't understand\n • Test conversation flows with real users and iterate\n \n \n \n \n Helps users break down a project into tasks, estimate timelines, and identify dependencies. Maintains context about the project goals, constraints, and decisions made in earlier conversation turns. Uses summarization to compress detailed task discussions while retaining key information. 
Outputs a structured project plan at the end.\n \n \n Assists users in modifying recipes based on dietary restrictions, available ingredients, or serving size changes. Remembers user's dietary preferences and kitchen inventory across the conversation. Uses context to make consistent suggestions (e.g., if user said they're vegetarian, doesn't suggest meat substitutions later).\n \n \n Conducts mock interviews for a specific role, remembering the job description and user's background. Asks follow-up questions based on previous answers and provides cumulative feedback. Implements state tracking for interview progress (introduction, technical questions, behavioral questions, wrap-up) and adjusts difficulty based on user performance.\n \n \n Helps developers debug issues by asking clarifying questions and suggesting solutions. Maintains context about the codebase, error messages, and attempted solutions. Uses conversation history to avoid suggesting the same solution twice and to build on previous hypotheses. Implements memory of what worked and what didn't.\n \n \n Works with users to create personalized learning plans for a new skill or technology. Asks about current knowledge, goals, and time constraints. Maintains context about user's learning style preferences and constraints. Builds a progressive plan that references earlier conversation about prerequisites and goals.\n \n \n \n \n\n \n \n Your assistant never directly provides answers or solutions. Instead, it asks probing questions that guide users to discover solutions themselves, like a Socratic teacher. The challenge is making questions helpful and progressive without being frustrating.\n \n A code review tool that doesn't point out bugs directly, but asks \"What happens to this variable when the list is empty?\" or \"Have you considered the case where the user input is null?\"\n A debugging assistant that responds to \"My code isn't working\" with questions like \"What did you expect to happen? 
What actually happened? What's the smallest change that reproduces the issue?\"\n \n \n\n \n Your assistant generates multiple perspectives or interpretations of the same information, potentially contradictory, forcing users to think critically about which interpretation makes sense. The AI plays devil's advocate against its own suggestions.\n \n A code architecture advisor that suggests three different approaches (microservices, monolith, serverless) and argues for each one, then argues against each one, leaving the decision to the user with full context of tradeoffs.\n A documentation generator that creates two versions: one optimized for beginners (verbose, lots of examples) and one for experts (terse, assumes knowledge), showing how the same code can be explained differently based on audience.\n A log analyzer that generates competing hypotheses about what caused an error, ranking them by likelihood but explaining why each could be wrong.\n \n \n\n \n Your assistant adapts its communication style and technical recommendations based on a specified time period (past or future), using historically appropriate terminology and technology constraints. This reframes technical problems through different temporal lenses.\n \n A code reviewer that can critique code as if from 1995 (concerned about memory usage, suggesting procedural approaches) or from 2030 (assuming quantum-safe cryptography, edge computing everywhere).\n A documentation generator that writes docs in the style of different eras: 1980s man pages, 2000s JavaDoc, 2015 markdown READMEs, or 2030 interactive AR documentation.\n An architecture advisor that designs systems with constraints from different eras: 1990s (dial-up internet, desktop apps), 2010s (mobile-first, cloud), 2025 (AI-native), 2035 (ambient computing).\n \n \n \n \n\n \n \n Python is the dominant language for AI engineering in 2025, and you'll need comfort with specific Python patterns that are common in LLM applications. 
Focus on async programming for concurrent API calls, environment management for API keys and configuration, and working with JSON for structured data.\n \n \n \n \n \n \n \n \n\n \n Understanding how LLM APIs work is fundamental to building reliable applications. You need to understand tokens (how text is chunked), model parameters (temperature, top_p, max_tokens), and the request/response structure. These concepts are consistent across providers even though implementation details differ.\n \n \n \n \n \n \n \n \n\n \n Building LLM applications requires the same development discipline as any software project. Version control is essential for tracking prompt changes, debugging tools help understand what's happening, and testing frameworks let you validate behavior systematically.\n \n \n \n \n \n \n \n\n \n LLM applications benefit from good software engineering practices: clear code organization, documentation of prompt decisions, and systematic testing. The non-deterministic nature of LLMs makes these practices even more important than in traditional software.\n \n \n \n \n \n \n \n \n\n \n This module intentionally focuses on building applications with LLM APIs rather than using high-level frameworks like LangChain. This \"build from first principles\" approach helps learners understand what's actually happening before adding framework abstractions. Frameworks can be introduced in later modules once fundamentals are solid.\n The emphasis on cost tracking and token management throughout this module reflects real-world concerns in production AI applications. Even in learning projects, developing awareness of costs helps build good habits and understanding of tradeoffs.\n Debugging and observability are emphasized because \"AI fails silently\" is one of the biggest challenges in moving from demos to production. 
Learners need to develop intuition for when LLMs are failing and how to catch failures early.\n The project briefs are deliberately scoped for 1-week completion by developers with 1-3 years of experience. They focus on core concepts rather than complex features, allowing time for iteration and learning from mistakes.\n \n
", + "generatedAt": "2025-10-24T22:09:52.317Z" + } + }, + { + "id": "83ab8e32-ed67-44a0-9d53-fd0524a7d499", + "arcId": "2db84cf6-a1f3-4794-8aeb-53faefe3ad95", + "order": 3, + "title": "Module 3", + "description": "Help me decide", + "durationWeeks": 1, + "status": "complete", + "moduleData": { + "xmlContent": "\n\n \n \n AI-Generated\n claude-sonnet-4-5-20250929\n \n projects.xml\n skills.xml\n research.xml\n \n \n \n \n Created module focused on evaluation, testing, and production readiness for LLM applications\n Based on course progression (Modules 1-2 covered foundations, later modules cover RAG), Module 3 bridges the gap between building demos and production deployment. Industry research shows evaluation and testing are critical gaps in 2025, with teams struggling to move from \"vibe checks\" to systematic quality assurance. This module addresses the most pressing need for learners transitioning to production-grade AI engineering.\n \n DeepEval and evaluation frameworks overview showing systematic evaluation is essential for production\n Observability and evaluation as critical pillars for LLM systems\n LLM evaluations as cornerstone of moving from prototype to production\n \n \n \n Defined three core learning objectives around evaluation, prompt optimization, and production deployment\n These objectives reflect current industry best practices for 2025. Research shows that systematic evaluation, iterative prompt engineering, and production observability are the three critical skills separating prototype builders from production engineers. 
Objectives are practical and measurable, appropriate for developers with 1-3 years experience.\n \n Seven best practices emphasizing evaluation, observability, and prompt management\n 2025 evaluation guide showing evolution from simple accuracy to comprehensive system evaluation\n \n \n \n Created 7 primary research topics covering evaluation frameworks, prompt engineering, observability, testing strategies, cost optimization, safety, and deployment patterns\n Topics selected based on 2025 industry priorities. Research consistently shows these seven areas as critical for production LLM applications. Each topic includes practical guidance for self-directed learning and can be subdivided for team research. Topics progress from evaluation fundamentals through production concerns.\n \n Comprehensive overview of evaluation tools and practices for 2025\n OpenAI's prompt engineering best practices\n LLMOps trends showing observability and production monitoring as critical\n \n \n \n Designed two project briefs: LLM Evaluation Harness and Prompt Optimization Laboratory\n Projects chosen to provide hands-on experience with the two most critical skills for production readiness: systematic evaluation and prompt engineering. Both projects are scoped for 1-week completion by developers with limited AI experience, build on Modules 1-2 foundations, and produce artifacts useful in real-world work. The Evaluation Harness teaches systematic testing; the Prompt Lab teaches iterative optimization.\n \n Guide emphasizing test case creation and evaluation as foundational\n Prompt engineering guide highlighting iterative refinement as essential\n \n \n \n Created 5 diverse examples for LLM Evaluation Harness project\n Examples span different domains (customer support, code generation, content moderation, financial analysis, education) to show versatility of evaluation techniques. Each example represents a real-world use case that learners might encounter. 
Variety ensures learners can find relevant applications regardless of their domain focus.\n \n Real-world application evaluation across finance, healthcare, and software engineering\n \n \n \n Created 5 diverse examples for Prompt Optimization Laboratory project\n Examples chosen to demonstrate different prompt engineering challenges: structured extraction, creative generation, analytical reasoning, technical documentation, and multi-step workflows. Each represents a common pattern in production LLM applications. Diversity helps learners understand when different optimization techniques apply.\n \n Comprehensive prompt engineering guide showing variety of techniques and use cases\n \n \n \n Created 3 conceptual twists: The Adversarial Auditor, The Drift Detective, and The Cost Accountant\n Twists designed as conceptual reframings rather than feature additions, following project twist guidelines. Each twist introduces a different philosophical lens on evaluation: adversarial testing (finding failures), temporal monitoring (detecting changes), and economic optimization (balancing quality vs cost). These represent real production concerns that push learners beyond basic evaluation.\n \n Drift detection as critical production concern\n Cost tracking emphasized as essential for production LLM apps\n \n \n \n Organized skills into 4 categories: Evaluation & Testing, Prompt Engineering, Production Operations, and Development Tools\n Skill categories align with project requirements and research topics. Most skills marked as \"Recommended\" rather than \"Essential\" to respect learner autonomy and varied backgrounds. Skills selected based on 2025 industry tools and practices. 
Each category includes both conceptual understanding and practical tool usage.\n \n Top evaluation tools for 2025\n LLMOps frameworks and tools\n \n \n \n Added 4 stretch topics: Adversarial Testing, Multi-Model Orchestration, Fine-Tuning Economics, and Regulatory Compliance\n Stretch topics represent advanced concerns that go beyond Module 3 scope but are relevant for ambitious learners or those in specific domains. Topics selected based on emerging 2025 trends and specialized industry needs. Marked as stretch to avoid overwhelming core learning objectives.\n \n Security and adversarial testing in prompt engineering\n Responsible AI evaluation and compliance considerations\n \n \n \n \n \n \n
Projects/Twists - Verify twists meet conceptual reframing criteria
\n
Module/Description - Confirm alignment with overall course narrative
\n
\n
\n
\n\n This module focuses on the critical transition from building AI prototypes to deploying production-ready LLM applications. You'll learn how to systematically evaluate LLM outputs, optimize prompts through data-driven iteration, and prepare applications for real-world deployment. By building evaluation harnesses and prompt optimization tools, you'll develop the engineering discipline needed to ship reliable AI systems that users can trust. This module bridges the gap between \"it works on my machine\" and \"it works in production.\"\n\n \n \n You'll be able to design and implement comprehensive evaluation frameworks for LLM applications, including defining success metrics, creating test datasets, implementing automated evaluation pipelines, and interpreting results to drive improvements. You'll understand the difference between model evaluation and system evaluation, and know when to use quantitative metrics versus qualitative assessment.\n \n \n You'll master the iterative process of prompt engineering through systematic experimentation. You'll be able to version prompts, run A/B tests, analyze performance across different inputs, and optimize for multiple objectives (accuracy, cost, latency). You'll understand prompt engineering as an engineering discipline, not trial-and-error, and be able to document and share your findings with teams.\n \n \n You'll understand the operational concerns of running LLM applications in production, including observability, monitoring, cost management, error handling, and graceful degradation. You'll be able to instrument applications for debugging, set up alerting for quality issues, and make informed decisions about when an application is ready to ship.\n \n \n\n \n \n \n Understanding how to measure LLM application quality is foundational to everything else in this module. 
Research the landscape of evaluation approaches, from traditional NLP metrics (BLEU, ROUGE) to modern LLM-specific metrics (faithfulness, relevance, groundedness). Explore frameworks like DeepEval, RAGAS, and Evidently.\n \n Start by reading comparison articles that survey the evaluation landscape, then dive into documentation for 2-3 specific frameworks. Pay attention to: What metrics are available? How do you create test datasets? How do automated metrics correlate with human judgment? What are the tradeoffs between different approaches?\n \n If multiple learners tackle this topic, consider dividing by: (1) evaluation metrics and theory, (2) specific framework deep-dives, (3) test dataset creation strategies, (4) human evaluation vs automated evaluation.\n \n \n Prompt engineering has evolved from an art to a discipline with established patterns and techniques. Research current best practices for 2025, including zero-shot and few-shot prompting, chain-of-thought reasoning, structured output formatting, role-based prompting, and prompt chaining.\n \n Look for authoritative sources like OpenAI's prompt engineering guide, Anthropic's documentation, and academic papers on prompting techniques. Focus on understanding: When does each technique apply? What are the tradeoffs? How do you structure prompts for consistency? How do you handle edge cases?\n \n Practice is essential here - don't just read about techniques, try them. Document what works and what doesn't. If splitting this topic, consider: (1) foundational techniques (zero-shot, few-shot), (2) advanced techniques (chain-of-thought, self-consistency), (3) structured output and formatting, (4) prompt chaining and workflows.\n \n \n Production LLM applications fail in unique ways - they don't crash with stack traces, they just produce bad outputs. 
Research observability tools and techniques specific to LLM applications, including tracing, logging, monitoring, and debugging approaches.\n \n Explore tools like LangSmith, Weights & Biases, Arize Phoenix, and LangFuse. Understand concepts like: distributed tracing for multi-step LLM workflows, prompt and response logging, latency monitoring, cost tracking, quality drift detection, and user feedback loops.\n \n Key questions to answer: How do you instrument an LLM application? What should you log? How do you debug when outputs are wrong but the code isn't broken? How do you detect when quality degrades over time? Consider dividing by: (1) tracing and logging strategies, (2) specific observability tools, (3) debugging techniques, (4) production monitoring.\n \n \n High-quality evaluation requires high-quality test data. Research strategies for creating, managing, and evolving test datasets for LLM applications. This includes manual curation, synthetic generation, sampling from production data, and edge case identification.\n \n Look into: How do you create representative test cases? How many test cases do you need? How do you handle the cold start problem (no data yet)? How do you use LLMs to generate test data? How do you version and manage test datasets over time? What's the role of adversarial examples?\n \n This topic has both theoretical and practical components. If splitting, consider: (1) test case design principles, (2) synthetic data generation techniques, (3) production data sampling and privacy, (4) dataset management and versioning tools.\n \n \n LLM API costs can escalate quickly in production. Research strategies for optimizing costs while maintaining quality, including prompt compression, caching, model selection, batch processing, and fallback strategies.\n \n Understand: How are LLM APIs priced? What factors drive cost? How do you measure cost per request or per user? What are strategies for reducing costs without sacrificing quality? 
When should you use smaller vs larger models? How do you implement caching effectively?\n \n Look for real-world case studies and cost analysis. Consider: (1) pricing models and cost drivers, (2) prompt optimization for cost, (3) caching and batching strategies, (4) model selection and routing.\n \n \n Production LLM applications must handle safety concerns, including harmful content generation, hallucinations (factually incorrect outputs), prompt injection attacks, and data leakage. Research detection and mitigation strategies.\n \n Explore: How do you detect hallucinations? What are common prompt injection techniques and defenses? How do you implement content filtering? What are the tradeoffs between safety and capability? What tools exist for safety testing?\n \n This is critical for any user-facing application. Research both technical solutions (guardrails, content filters) and process solutions (human review, staged rollouts). If splitting: (1) hallucination detection techniques, (2) adversarial prompt defenses, (3) content filtering and moderation, (4) safety testing frameworks.\n \n \n Moving from development to production requires understanding deployment patterns, infrastructure choices, scaling strategies, and operational concerns. Research common architectures and deployment patterns for LLM applications in 2025.\n \n Consider: API-based vs self-hosted models, serverless vs container-based deployment, synchronous vs asynchronous processing, rate limiting and throttling, error handling and retries, fallback strategies, and gradual rollouts.\n \n Look for architecture diagrams and real-world examples. Understand the tradeoffs between different approaches. 
If splitting: (1) infrastructure and hosting options, (2) scaling and performance patterns, (3) error handling and reliability, (4) deployment strategies (blue-green, canary, etc.).\n \n \n \n \n \n \n \n \n \n\n \n \n \n Build a comprehensive evaluation system that can test an LLM application across multiple dimensions, track results over time, and provide actionable insights for improvement.\n Systematic evaluation, automated testing, metrics implementation, test dataset management, and results analysis. The harness should be reusable across different LLM applications and support both automated metrics and human evaluation workflows.\n \n • Define at least 3 evaluation metrics appropriate for your application (e.g., accuracy, relevance, safety, cost)\n • Create or source a test dataset with minimum 20 diverse test cases covering happy paths and edge cases\n • Implement automated evaluation pipeline that runs all tests and computes metrics\n • Generate clear reports showing performance across metrics, with examples of failures\n • Support versioning so you can compare results across prompt changes or model updates\n • Include at least one \"LLM-as-judge\" metric where an LLM evaluates outputs\n • Document your evaluation methodology and how to interpret results\n • Make the harness configurable for different applications (not hardcoded to one use case)\n \n \n \n • Understand difference between component metrics (e.g., relevance) and outcome metrics (e.g., task success)\n • Know when to use quantitative metrics vs qualitative assessment\n • Implement both deterministic metrics (exact match, keyword presence) and LLM-based metrics\n • Handle cases where ground truth isn't available or is subjective\n • Balance multiple metrics that may conflict (accuracy vs creativity, safety vs capability)\n \n \n • Identify representative inputs that cover your application's use cases\n • Create edge cases and adversarial examples to stress-test the system\n • Generate synthetic 
test data when real data isn't available\n • Sample and anonymize production data ethically\n • Version and manage test datasets as they evolve\n • Document expected outputs or evaluation criteria for each test case\n \n \n • Structure code for testability and reproducibility\n • Implement test runners that execute evaluations consistently\n • Handle async operations and API rate limits gracefully\n • Generate structured results that can be analyzed programmatically\n • Set up CI/CD integration for continuous evaluation\n • Implement caching to avoid re-running expensive evaluations\n \n \n • Aggregate metrics across test cases to identify patterns\n • Visualize results to make insights accessible\n • Identify which test cases are failing and why\n • Compare results across versions to track improvements or regressions\n • Generate actionable recommendations from evaluation results\n • Communicate findings to non-technical stakeholders\n \n \n \n \n Build an evaluation harness for a customer support chatbot that measures response quality, helpfulness, tone appropriateness, and factual accuracy. Include test cases covering common questions, edge cases (angry customers, ambiguous requests), and adversarial inputs (attempts to get off-topic responses). Implement metrics like relevance scoring, sentiment analysis, and hallucination detection.\n \n \n Create an evaluation system for an AI coding assistant that generates Python functions from descriptions. Test cases include simple functions, edge cases (error handling, complex logic), and security-sensitive code. Metrics include code correctness (does it run?), test coverage, code quality (linting), security vulnerabilities, and whether it matches the specification.\n \n \n Build a harness that evaluates a content moderation system for detecting harmful content. Test dataset includes benign content, clearly harmful content, and edge cases (sarcasm, context-dependent harm). 
Metrics include precision and recall for different harm categories, false positive rate (blocking safe content), and consistency across similar inputs.\n \n \n Create an evaluation system for an LLM that summarizes financial earnings reports. Test cases include various report types and lengths. Metrics include factual accuracy (checking numbers against source), completeness (covering key points), conciseness, and readability. Implement both automated metrics and a workflow for domain expert review.\n \n \n Build a harness for evaluating AI-generated educational explanations. Test cases cover different topics and difficulty levels. Metrics include accuracy, clarity, appropriate reading level, engagement, and pedagogical quality. Include both automated metrics (readability scores) and rubrics for human evaluation by educators.\n \n \n \n\n \n Create a systematic prompt engineering environment that enables rapid experimentation, A/B testing, and data-driven optimization of prompts for a specific LLM application.\n Prompt versioning, systematic experimentation, performance tracking across multiple dimensions, and iterative optimization. 
The laboratory should make prompt engineering reproducible and collaborative rather than ad-hoc and individual.\n \n • Support versioning of prompts with metadata (author, date, rationale for changes)\n • Enable side-by-side comparison of different prompt versions on the same inputs\n • Track performance metrics for each prompt version (accuracy, cost, latency, user satisfaction)\n • Implement A/B testing capability to evaluate prompts on held-out test sets\n • Provide visualization of how prompts perform across different input categories\n • Support templating so prompts can be parameterized and reused\n • Include prompt engineering best practices as linting or suggestions\n • Generate reports comparing prompt versions with recommendations for which to deploy\n • Make it easy to share prompts and results with team members\n \n \n \n • Design experiments with clear hypotheses about what will improve performance\n • Change one variable at a time to isolate effects\n • Test prompts on diverse inputs, not just examples that work\n • Document what you tried and what you learned\n • Recognize when you've reached diminishing returns on optimization\n • Balance competing objectives (quality vs cost, accuracy vs creativity)\n \n \n • Use version control (Git) for prompt templates\n • Write clear commit messages explaining why prompts changed\n • Tag versions that are deployed to production\n • Maintain changelog of prompt evolution\n • Handle prompt variations for different contexts or user segments\n • Implement rollback strategy when new prompts underperform\n \n \n • Define success metrics before starting optimization\n • Collect baseline measurements before making changes\n • Use statistical significance testing to avoid over-interpreting noise\n • Track multiple metrics (quality, cost, latency) not just one\n • Identify which input categories benefit from prompt changes\n • Recognize when prompt changes help some cases but hurt others\n \n \n • Apply techniques like 
few-shot learning, chain-of-thought, role-based prompting\n • Structure prompts for consistency and clarity\n • Use delimiters and formatting to improve parsing\n • Implement prompt chaining for complex multi-step tasks\n • Handle edge cases and error conditions in prompts\n • Optimize prompt length to balance context and cost\n \n \n \n \n Build a prompt lab for optimizing an AI email assistant that drafts responses to customer inquiries. Test different prompt structures (role-based, few-shot examples, explicit instructions) and measure quality (helpfulness, tone, accuracy), cost (tokens used), and latency. Track performance across different inquiry types (complaints, questions, requests). Implement A/B testing to validate improvements before deployment.\n \n \n Create a laboratory for optimizing prompts that generate e-commerce product descriptions. Experiment with different creative levels, length constraints, and style guides. Measure engagement metrics (click-through rate), SEO quality, and brand consistency. Support templating so prompts can be parameterized by product category. Track which prompt variations work best for different product types.\n \n \n Build a system for optimizing prompts that extract structured data from unstructured text (e.g., extracting dates, names, and amounts from invoices). Focus on improving extraction accuracy and handling edge cases. Test different output formatting approaches (JSON, CSV, key-value pairs). Measure precision and recall for each field. Track cost per extraction and optimize for both quality and efficiency.\n \n \n Create a prompt optimization environment for an AI that summarizes technical documentation. Experiment with different summarization strategies (extractive vs abstractive, different length targets, different reading levels). Measure accuracy (coverage of key points), readability, and usefulness to different audiences. 
Support creating audience-specific prompt variations (for developers vs managers).\n \n \n Build a laboratory for optimizing prompts that convert natural language questions into SQL queries. Test different approaches for providing schema context, handling ambiguity, and generating correct syntax. Measure query correctness, efficiency, and robustness to variations in phrasing. Track which types of questions benefit from different prompt strategies.\n \n \n \n \n\n \n \n Reframe your evaluation system as an adversarial red team that actively tries to break your LLM application and expose failure modes you haven't considered.\n \n Generate adversarial test cases that attempt prompt injection, jailbreaking, or eliciting harmful outputs. Create a \"failure taxonomy\" that categorizes how the system breaks.\n Build an automated adversary that uses one LLM to generate attacks against another LLM, creating an evolutionary arms race that surfaces edge cases.\n Implement a \"chaos testing\" mode that introduces realistic production conditions (degraded API performance, partial responses, rate limits) to see how the system handles adversity.\n \n \n\n \n Transform your evaluation harness into a continuous monitoring system that detects when LLM behavior changes over time, even when the code hasn't changed.\n \n Track a \"fingerprint\" of your LLM's behavior over time and alert when outputs start diverging from baseline, potentially indicating model updates by the provider or data distribution shifts.\n Build a system that compares today's outputs to last week's outputs on the same inputs, highlighting where behavior has changed and whether quality improved or degraded.\n Create a \"time-travel debugger\" that lets you replay old requests and compare how the system would respond now vs how it responded in the past.\n \n \n\n \n Reframe prompt optimization as an economic optimization problem where you must maximize value per dollar spent, making explicit tradeoffs between 
quality and cost.\n \n Implement a \"quality-cost frontier\" visualization that shows the Pareto optimal prompts, helping you choose the right point on the quality-cost tradeoff curve for your use case.\n Build a dynamic routing system that uses cheaper models for easy requests and expensive models only for hard requests, optimizing total cost while maintaining quality thresholds.\n Create a \"cost budget\" system where prompts must achieve quality targets within a fixed cost constraint, forcing creative optimization like prompt compression or caching strategies.\n \n \n \n \n\n \n \n Skills for systematically assessing LLM application quality through metrics, test datasets, and automated evaluation pipelines. These skills separate production-grade applications from demos.\n Understanding and using frameworks like DeepEval, RAGAS, Evidently, or MLflow for LLM evaluation\n Implementing evaluation metrics including accuracy, relevance, faithfulness, groundedness, and custom metrics\n Using LLMs to evaluate other LLM outputs with appropriate prompting and calibration\n Creating, versioning, and maintaining test datasets for LLM applications\n Applying statistical tests to determine if performance differences are meaningful\n Designing and implementing human review processes for qualitative assessment\n \n\n \n Systematic approaches to designing, optimizing, and managing prompts as engineering artifacts rather than ad-hoc experimentation.\n Using version control to track prompt changes with clear commit messages and history\n Providing examples in prompts to guide model behavior\n Structuring prompts to elicit step-by-step reasoning\n Designing prompts to produce consistent, parseable outputs (JSON, CSV, etc.)\n Creating reusable prompt templates with variables\n Optimizing prompts to reduce token usage while maintaining quality\n Breaking complex tasks into sequences of simpler prompts\n \n\n \n Skills for running LLM applications reliably in production, including 
monitoring, debugging, cost management, and incident response.\n Using tools like LangSmith, Weights & Biases, or Arize Phoenix for tracing and monitoring\n Implementing tracing to debug multi-step LLM applications\n Monitoring token usage and API costs, implementing cost reduction strategies\n Gracefully handling API failures, rate limits, and timeouts\n Implementing caching to reduce costs and latency for repeated requests\n Implementing client-side rate limiting to avoid hitting API limits\n Setting up alerts for quality degradation or operational issues\n Implementing canary deployments or A/B tests for new prompts or models\n \n\n \n Practical tools and libraries for building evaluation systems and prompt optimization laboratories.\n Using pytest or unittest for structuring evaluation tests\n Analyzing evaluation results and generating reports\n Creating charts and dashboards to communicate results\n Using OpenAI, Anthropic, or other LLM provider SDKs\n Handling concurrent API calls efficiently\n Managing API keys, configurations, and environment variables securely\n Running evaluations automatically in continuous integration pipelines\n \n \n\n \n This module intentionally focuses on evaluation and optimization rather than introducing new application types. The goal is to deepen engineering discipline around LLM applications before moving to more complex architectures in later modules.\n While the module includes research on safety and adversarial testing, these topics are introduced as stretch content. The core focus remains on systematic evaluation and prompt optimization, which are foundational skills for all LLM work.\n Projects are designed to produce reusable artifacts (evaluation harnesses, prompt labs) that learners can apply to their own work beyond the course. 
This aligns with the apprenticeship model of building practical skills.\n The module emphasizes data-driven decision making and moving away from \"vibe checks\" - a key theme in 2025 industry discourse about maturing LLM engineering practices.\n \n
", + "generatedAt": "2025-10-24T22:13:16.583Z" + } + } + ] + }, + { + "id": "4fa8f2b2-e3f6-437b-87fc-027a9449e34a", + "order": 2, + "title": "Arc 2", + "description": "Describe the broad thematic focus of this arc", + "theme": "Help me decide", + "durationWeeks": 2, + "modules": [ + { + "id": "510a858f-2d6b-415d-96a7-8f4e16753631", + "arcId": "4fa8f2b2-e3f6-437b-87fc-027a9449e34a", + "order": 1, + "title": "Module 1", + "description": "Help me decide", + "durationWeeks": 1, + "status": "complete", + "moduleData": { + "xmlContent": "\n\n \n \n AI-Generated\n claude-sonnet-4-5-20250929\n \n projects.xml\n skills.xml\n research.xml\n \n \n \n \n Created module description focusing on foundational AI engineering skills\n Module 1 needs to establish core competencies in prompt engineering, LLM interaction, and basic application development. Research shows these are essential foundations for AI engineering work in 2025, with prompt engineering being a critical skill for working with modern LLMs.\n \n The Prompt Report presenting taxonomy of 58 LLM prompting techniques\n Comprehensive prompt engineering guide with 3M+ learners\n AI Engineering Best Practices emphasizing prompt engineering before fine-tuning\n \n \n \n Defined three core learning objectives aligned with 2025 AI engineering fundamentals\n Learning objectives focus on practical skills identified in industry research: prompt engineering techniques, API integration, and understanding LLM behavior. 
These align with current industry standards where AI engineers must understand model capabilities and limitations without requiring deep ML theory knowledge initially.\n \n Microsoft Azure AI Engineer certification emphasizing API usage and solution building\n 2025 AI Skills Roadmap highlighting prompt engineering and API usage as core skills\n \n \n \n Created 7 primary research topics covering foundational AI engineering concepts\n Topics selected based on current industry practices and academic research on AI engineering education. Includes prompt engineering techniques, LLM APIs, model selection, cost management, and ethical considerations - all identified as critical for 2025 AI engineers.\n \n Systematic Survey of Prompt Engineering covering 29 distinct techniques\n Anthropic API documentation for current best practices\n Microsoft prompt engineering techniques including few-shot learning\n \n \n \n Designed three project briefs progressing from basic to intermediate complexity\n Projects follow bottom-up approach recommended in recent AI engineering literature, starting with simple prompt-based applications before adding complexity. Each project teaches specific skills while building toward real-world applications appropriate for beginners with 1-3 years experience.\n \n Building LLM Apps guide recommending bottom-up approach starting lean\n 12 Best LLM Projects identifying appropriate beginner-level applications\n \n \n \n Created three conceptual twists that reframe problem-solving approaches\n Twists designed to challenge learners to think differently about AI applications, moving beyond feature additions to philosophical constraints. This encourages creative thinking about how AI systems can serve different purposes and perspectives.\n \n \n \n Organized skills into Python, API Integration, and Professional Practice categories\n Skills reflect current industry requirements for AI engineers, emphasizing practical implementation over theoretical knowledge. 
Python remains the primary language for AI work, with API integration and professional practices being critical for production systems.\n \n Python identified as lingua franca of AI with 2025 skills emphasis\n Microsoft AI Engineer training emphasizing API calls and application development\n \n \n \n \n \n \n
Projects/Briefs - Examples should be validated for appropriateness to cohort
\n
Projects/Twists - Conceptual twists may need refinement based on facilitator feedback
\n
\n
\n
\n\n This foundational module introduces you to AI engineering by exploring how to effectively interact with Large Language Models (LLMs) through prompt engineering and API integration. You'll learn to craft effective prompts, understand model behavior, and build your first LLM-powered applications. By the end of this module, you'll have hands-on experience working with LLM APIs, understanding their capabilities and limitations, and creating simple but functional AI applications that solve real problems.\n\n \n \n You'll understand core prompt engineering techniques including zero-shot, few-shot, and chain-of-thought prompting. You'll be able to design effective prompts that elicit desired behaviors from LLMs, handle edge cases, and iterate on prompt designs based on model outputs. You'll recognize when prompts need refinement versus when model limitations require different approaches.\n \n \n You'll gain practical experience integrating LLM APIs (such as OpenAI or Anthropic) into applications. You'll understand how to make API calls, handle responses, manage errors, and work within rate limits and cost constraints. You'll be able to read API documentation and adapt examples to your specific use cases.\n \n \n You'll develop intuition for how LLMs behave, including their strengths, limitations, and failure modes. You'll understand concepts like temperature, tokens, and context windows. You'll be able to evaluate model outputs critically, recognize hallucinations and biases, and make informed decisions about when and how to use LLMs in applications.\n \n \n\n \n \n \n Research the fundamental techniques used to elicit desired behaviors from LLMs. Start with the DAIR.AI Prompt Engineering Guide and The Prompt Report (arXiv:2406.06608) which catalogs 58 different prompting techniques. 
Focus initially on understanding zero-shot prompting (no examples), few-shot prompting (providing examples), and chain-of-thought prompting (reasoning step-by-step).\n \n Explore how providing examples in prompts affects model behavior. Research when to use few-shot learning versus zero-shot approaches. Look at Microsoft's Azure OpenAI prompt engineering documentation for practical examples. If multiple people tackle this, divide by: (1) zero-shot techniques and use cases, (2) few-shot example selection and formatting, (3) comparing effectiveness across different model sizes.\n \n \n Investigate techniques that encourage models to show their reasoning process. Research chain-of-thought prompting, tree-of-thought, and least-to-most prompting. Examine when explicit reasoning improves outputs versus when it adds unnecessary complexity. Divide by: (1) basic chain-of-thought patterns, (2) advanced reasoning techniques, (3) debugging reasoning failures.\n \n \n Study how to iteratively improve prompts through testing and refinement. Research prompt templating, variable injection, and A/B testing approaches. Look at how companies structure reusable prompt libraries. Divide by: (1) systematic testing approaches, (2) prompt versioning and management, (3) measuring prompt effectiveness.\n \n \n \n Understand how to interact with LLMs programmatically through APIs. Start with official documentation from OpenAI and Anthropic. Focus on understanding the Messages API pattern, authentication, request/response formats, and error handling. Research both REST API calls and official SDK usage in Python.\n \n Learn the basics of making API calls to LLM services. Research authentication methods, request structure (including messages, system prompts, and parameters), and response parsing. Look at quickstart guides and basic tutorials. 
Divide by: (1) API authentication and setup, (2) request structure and parameters, (3) response handling and error management.\n \n \n Explore official SDKs from OpenAI and Anthropic. Research how SDKs simplify API interaction, handle retries, and manage streaming responses. Look at LangChain as a higher-level abstraction. Divide by: (1) comparing raw API vs SDK approaches, (2) SDK-specific features and utilities, (3) when to use frameworks like LangChain.\n \n \n \n Research how to choose appropriate models for different tasks. Understand the trade-offs between model size, cost, speed, and capability. Compare offerings from different providers (GPT-4, Claude, Gemini) and understand when to use smaller vs larger models.\n \n Compare capabilities across different model families and sizes. Research benchmarks, but also understand real-world performance differences. Look at model cards and official documentation. Divide by: (1) comparing flagship models (GPT-4, Claude Sonnet, etc.), (2) understanding smaller/faster model options, (3) specialized models and use cases.\n \n \n Understand how context windows work and how to manage token usage. Research tokenization, how different content types consume tokens, and strategies for working within context limits. Divide by: (1) token counting and estimation, (2) context window strategies, (3) managing long conversations or documents.\n \n \n \n Learn to build cost-effective AI applications by understanding pricing models, implementing caching strategies, and handling rate limits. Research how API costs scale with usage and techniques to optimize spending while maintaining quality.\n \n Understand how LLM APIs charge for usage (typically per token). Research cost differences between models and strategies to reduce costs (prompt optimization, caching, model selection). Look at real-world cost examples. 
Divide by: (1) understanding pricing structures, (2) cost estimation and monitoring, (3) optimization techniques.\n \n \n Research how API providers implement rate limits and quotas. Learn strategies for handling rate limit errors, implementing retry logic with exponential backoff, and designing systems that respect limits. Divide by: (1) understanding rate limit types, (2) implementing retry strategies, (3) quota management for production systems.\n \n \n \n Study how to design reusable, maintainable prompt templates. Research variable injection, conditional logic in prompts, and separating prompt content from application code. Look at how frameworks like LangChain handle prompt templates.\n \n \n Learn techniques for extracting structured data from LLM responses. Research JSON mode, function calling, and regex-based parsing. Understand how to validate outputs and handle cases where the model doesn't follow instructions.\n \n \n Understand the ethical implications of using LLMs, including bias, hallucinations, privacy concerns, and responsible AI practices. Research how to detect and mitigate common issues. Look at frameworks like IEEE standards for AI ethics and Anthropic's responsible AI guidelines.\n \n \n \n \n \n \n \n \n \n \n\n \n \n \n Build a command-line interface (CLI) application that acts as a personal assistant, answering questions and helping with tasks using an LLM API.\n This project focuses on making your first API calls to an LLM service, handling user input, displaying responses, and managing conversation history. 
You'll practice prompt engineering to shape the assistant's personality and capabilities.\n \n - Successfully authenticates with an LLM API (OpenAI or Anthropic)\n - Accepts user input from the command line and sends it to the LLM\n - Displays responses in a readable format\n - Maintains conversation context across multiple turns\n - Implements basic error handling for API failures\n - Uses environment variables for API keys (never hardcoded)\n - Includes a system prompt that defines the assistant's behavior\n - Provides a way to clear conversation history or start fresh\n \n \n \n - Store API keys securely using environment variables or .env files\n - Use the python-dotenv library to load environment variables\n - Never commit API keys to version control (add .env to .gitignore)\n - Understand the difference between API keys, tokens, and authentication headers\n \n \n - Use the official SDK (openai or anthropic library) to make API calls\n - Understand the Messages API pattern with roles (system, user, assistant)\n - Structure requests with model selection, messages array, and parameters\n - Parse API responses to extract the assistant's message content\n - Handle streaming vs non-streaming responses appropriately\n \n \n - Store message history as a list of message objects\n - Append user and assistant messages to maintain context\n - Understand how context affects model responses\n - Implement a strategy to prevent context from exceeding limits (truncation or summarization)\n - Consider when to clear or reset conversation history\n \n \n - Create an intuitive command-line interface with clear prompts\n - Handle user input with appropriate validation\n - Display multi-line responses in a readable format\n - Provide feedback during API calls (loading indicators)\n - Implement commands like /clear, /exit, or /help for control\n \n \n - Catch and handle API errors (rate limits, authentication failures, network issues)\n - Provide user-friendly error messages\n - 
Implement retry logic with exponential backoff for transient failures\n - Handle edge cases like empty inputs or very long messages\n \n \n - Write effective system prompts that define assistant behavior\n - Experiment with different personas and instruction styles\n - Understand how system prompts influence model responses\n - Test and iterate on system prompts based on output quality\n \n \n \n \n A CLI assistant that helps with studying by explaining concepts, creating practice questions, and providing feedback on answers. The system prompt positions it as a patient tutor who asks clarifying questions and breaks down complex topics.\n \n \n A command-line tool where you can paste code snippets and get feedback on style, potential bugs, and improvements. The system prompt makes it act like a senior developer who provides constructive criticism with explanations.\n \n \n An assistant that helps improve writing by suggesting edits, explaining grammar rules, and offering alternative phrasings. It maintains context to understand the document you're working on and provides consistent feedback.\n \n \n A CLI tool that helps organize your day by discussing tasks, priorities, and time management. It remembers what you've told it about your schedule and can help adjust plans as the day evolves.\n \n \n An assistant that helps practice a foreign language through conversation, provides corrections, and explains grammar points. The system prompt can switch between the target language and English for explanations.\n \n \n \n\n \n Create a web-based application that allows you to test different prompts side-by-side, compare outputs, and iterate on prompt designs with a visual interface.\n This project emphasizes systematic prompt engineering, A/B testing, and understanding how different prompt variations affect model outputs. 
You'll build tooling to support the prompt development workflow.\n \n - Provides a web interface for entering and editing prompts\n - Allows testing the same input with different prompt templates\n - Displays multiple model responses side-by-side for comparison\n - Saves prompt versions and test results for future reference\n - Supports variable injection into prompt templates\n - Shows token counts and estimated costs for each request\n - Allows adjustment of model parameters (temperature, max_tokens)\n - Exports successful prompts in a reusable format\n \n \n \n - Build a simple web interface using Flask, FastAPI, or Streamlit\n - Create forms for prompt input and parameter configuration\n - Display results in an organized, readable layout\n - Handle asynchronous API calls from the frontend\n - Implement basic styling for usability (CSS or component libraries)\n \n \n - Design prompt templates with variable placeholders\n - Implement variable substitution using string formatting or template engines\n - Support multiple template formats (f-strings, Jinja2, etc.)\n - Allow saving and loading prompt templates\n - Handle edge cases in template rendering\n \n \n - Make multiple API calls with different prompts or parameters\n - Display results in a comparison view (side-by-side or tabbed)\n - Implement basic metrics for comparing outputs (length, sentiment, etc.)\n - Allow users to rate or annotate results\n - Track which prompt variations perform best\n \n \n - Store prompt templates and test results (JSON files, SQLite, or similar)\n - Implement versioning for prompt iterations\n - Allow users to review historical tests and results\n - Export data in useful formats (JSON, CSV, Markdown)\n - Handle concurrent access if multiple users test simultaneously\n \n \n - Calculate token counts for prompts and responses\n - Estimate costs based on model pricing\n - Display running totals for testing sessions\n - Warn users about expensive operations\n - Implement basic 
budgeting or limits\n \n \n \n \n A tool for testing different ways to phrase customer support responses. Compare formal vs casual tones, different levels of detail, and various approaches to handling complaints. Save the best-performing templates for use in production.\n \n \n Test different prompts for generating marketing copy, blog posts, or social media content. Compare outputs with different creativity settings and prompt structures. Track which combinations produce the most engaging content.\n \n \n Experiment with prompts that generate code documentation, API references, or README files. Test different levels of technical detail and documentation styles to find what works best for your audience.\n \n \n Compare prompts for explaining complex topics at different education levels. Test how different instructional approaches (analogies, examples, step-by-step) affect clarity and understanding.\n \n \n \n\n \n Build an application that takes long-form content (articles, documents, transcripts) and generates customizable summaries using LLMs with different summarization strategies.\n This project explores working with longer inputs, managing context windows, implementing different summarization approaches, and extracting structured information from unstructured text.\n \n - Accepts various input formats (text files, URLs, pasted content)\n - Offers multiple summarization styles (brief, detailed, bullet points)\n - Handles content that exceeds the model's context window\n - Extracts key points, action items, or themes from content\n - Allows customization of summary length and focus\n - Provides confidence indicators or source citations\n - Implements caching to avoid re-processing the same content\n - Exports summaries in multiple formats (text, Markdown, JSON)\n \n \n \n - Read and parse different input formats (txt, PDF, web pages)\n - Split long content into manageable chunks\n - Implement strategies for handling content exceeding context limits\n - Preserve 
important context when chunking (don't split mid-sentence)\n - Reassemble chunked summaries coherently\n \n \n - Design prompts that produce different summary styles\n - Use few-shot examples to demonstrate desired output format\n - Instruct the model to focus on specific aspects (technical details, main arguments, etc.)\n - Handle edge cases like very short content or content with no clear summary\n - Iterate on prompts to improve summary quality\n \n \n - Use prompts to generate structured data (JSON, bullet lists)\n - Parse and validate model outputs\n - Extract specific information types (key points, action items, quotes)\n - Handle cases where the model doesn't follow output format instructions\n - Implement fallback strategies for parsing failures\n \n \n - Fetch content from URLs using requests or similar libraries\n - Extract main content from web pages (remove navigation, ads)\n - Parse PDF files or other document formats\n - Handle encoding issues and special characters\n - Respect robots.txt and implement rate limiting for web scraping\n \n \n - Implement caching to avoid re-processing identical content\n - Use file hashing to detect duplicate inputs\n - Store processed summaries for quick retrieval\n - Balance cache size and storage costs\n - Implement cache invalidation strategies\n \n \n \n \n Process meeting transcripts to extract key decisions, action items, and discussion points. Generate different views: executive summary, detailed notes, and task list. Track recurring themes across multiple meetings.\n \n \n Summarize academic papers into accessible formats for different audiences. Extract methodology, findings, and implications. Generate both technical summaries for peers and plain-language versions for general readers.\n \n \n Summarize news articles from multiple sources on the same topic. Compare different perspectives and identify consensus vs disagreement. 
Generate a balanced overview that captures the full story.\n \n \n Process video transcripts (from YouTube, podcasts, lectures) to create study guides, show notes, or quick reference summaries. Extract timestamps for key moments and generate chapter markers.\n \n \n Summarize legal documents, contracts, or terms of service into plain language. Highlight important clauses, obligations, and potential concerns. Make legal text accessible to non-lawyers.\n \n \n \n \n\n \n \n Your AI application must actively question and critique its own outputs, presenting multiple perspectives or highlighting potential flaws before offering a final response.\n \n A code assistant that not only suggests solutions but also points out potential bugs, edge cases, or better alternatives in its own suggestions.\n A content summarizer that identifies potential biases in the source material and its own summary, flagging areas where interpretation might vary.\n A personal assistant that proactively mentions limitations, uncertainties, or situations where its advice might not apply.\n \n \n\n \n Design your application to generate multiple distinct viewpoints or framings of the same information, forcing users to consider different angles before accepting any single interpretation.\n \n A news summarizer that presents the same story from different political or cultural perspectives, highlighting what each viewpoint emphasizes or omits.\n A writing coach that suggests revisions in multiple voices or tones (formal, casual, persuasive, empathetic) so you can choose the right approach.\n A study assistant that explains concepts using different analogies or frameworks, helping you find the explanation that clicks for your learning style.\n \n \n\n \n Your application must explicitly indicate confidence levels, knowledge gaps, or areas of uncertainty in every response, making the AI's limitations transparent rather than hidden.\n \n A CLI assistant that rates each part of its response by confidence 
level and explicitly states \"I'm less certain about this part\" or \"This is based on general knowledge rather than specific expertise.\"\n A summarizer that highlights which parts of the summary are direct quotes vs interpretations, and flags areas where the source material was ambiguous or contradictory.\n A content generator that suggests multiple variations and explains the trade-offs or assumptions behind each option.\n \n \n \n \n\n \n \n Python is the primary language for AI engineering in 2025. These skills focus on the Python-specific knowledge needed to build LLM applications, from basic syntax to working with APIs and handling data.\n \n Creating and managing virtual environments (venv, conda), installing packages with pip, managing dependencies with requirements.txt, understanding Python versions and compatibility.\n \n \n Using os.environ and python-dotenv to manage configuration, storing secrets securely, understanding the difference between development and production environments.\n \n \n Using the requests library for API calls, handling responses and errors, working with JSON data, understanding REST API patterns, implementing retry logic.\n \n \n Reading and writing files (text, JSON, CSV), parsing different file formats, handling file paths across operating systems, working with large files efficiently.\n \n \n Using f-strings for string interpolation, working with multi-line strings, text cleaning and preprocessing, regular expressions for pattern matching.\n \n \n Using try/except blocks effectively, creating custom exceptions, understanding exception hierarchies, logging errors appropriately.\n \n \n Manipulating Python data structures, list comprehensions, dictionary operations, understanding when to use different data structures.\n \n \n Understanding asynchronous programming, using asyncio for concurrent API calls, handling async/await syntax, knowing when async provides benefits.\n \n \n\n \n Working with LLM APIs is central to AI 
engineering. These skills cover authentication, making requests, handling responses, and managing the complexities of production API usage.\n \n Understanding API keys vs OAuth tokens, setting authentication headers, managing credentials securely, rotating keys, understanding rate limits and quotas.\n \n \n Using official SDKs (openai, anthropic), understanding SDK abstractions vs raw API calls, reading SDK documentation, handling SDK-specific features and limitations.\n \n \n Structuring API requests with proper parameters, parsing JSON responses, handling nested data structures, extracting relevant information from responses.\n \n \n Handling HTTP error codes (4xx, 5xx), implementing retry logic with exponential backoff, dealing with rate limiting, graceful degradation when APIs fail.\n \n \n Working with server-sent events, handling streaming API responses, updating UI during streaming, managing partial responses.\n \n \n Understanding pricing models (per token, per request), estimating costs before requests, implementing usage tracking, setting budget limits.\n \n \n\n \n Building production-quality AI applications requires following software engineering best practices. 
These skills ensure your code is maintainable, secure, and professional.\n \n Committing code regularly, writing meaningful commit messages, understanding .gitignore for secrets, branching and merging basics, using GitHub or similar platforms.\n \n \n Writing clear docstrings, creating README files, documenting API usage examples, explaining complex logic with comments, maintaining up-to-date documentation.\n \n \n Never hardcoding API keys, using environment variables, understanding .env and .gitignore, recognizing common security pitfalls, input validation and sanitization.\n \n \n Using print statements and logging effectively, understanding stack traces, using debuggers (pdb, IDE debuggers), systematic problem-solving approaches.\n \n \n Structuring projects with clear file organization, separating concerns (API logic, UI, utilities), following Python naming conventions, keeping functions focused and modular.\n \n \n Writing simple unit tests, testing API integrations, mocking API responses for testing, understanding when and what to test.\n \n \n Using Python's logging module, setting appropriate log levels, logging API usage and errors, understanding observability basics.\n \n \n\n \n If building web applications, these skills help create user interfaces and handle web-specific concerns. 
Not essential for all projects, but valuable for creating accessible tools.\n \n Creating simple web applications, defining routes and endpoints, handling form submissions, serving HTML templates, understanding request/response cycles.\n \n \n Using Streamlit to quickly build data apps, creating interactive widgets, displaying results, understanding Streamlit's execution model.\n \n \n Basic HTML structure, styling with CSS, creating forms, responsive design basics, using CSS frameworks (Bootstrap, Tailwind).\n \n \n Handling user interactions, making async requests from the browser, manipulating the DOM, understanding modern JavaScript (ES6+).\n \n \n \n\n \n This module intentionally focuses on prompt engineering and API integration rather than model training or fine-tuning. The goal is to build practical skills quickly while establishing foundations for more advanced topics in later modules.\n Learners are encouraged to experiment with different LLM providers (OpenAI, Anthropic, others) to understand how models differ in capabilities, pricing, and behavior. Most concepts transfer across providers.\n Cost management is emphasized throughout because running up unexpected API bills is a common beginner mistake. Encourage learners to set usage alerts and start with smaller, cheaper models for experimentation.\n The \"bottom-up approach\" recommended in research suggests starting with simple, single-prompt applications before building complex multi-step systems. This module follows that philosophy, saving chains and agents for later modules.\n Ethical considerations and model limitations should be discussed throughout, not just as a separate topic. Every project should include reflection on potential biases, hallucinations, and responsible use.\n \n
", + "generatedAt": "2025-10-24T22:16:17.926Z" + } + }, + { + "id": "0ee18d1f-06ea-441f-bffb-5cf0901174e5", + "arcId": "4fa8f2b2-e3f6-437b-87fc-027a9449e34a", + "order": 2, + "title": "Module 2", + "description": "Help me decide", + "durationWeeks": 1, + "status": "complete", + "moduleData": { + "xmlContent": "\n\n \n \n AI-Generated\n claude-sonnet-4-5-20250929\n \n projects.xml\n skills.xml\n research.xml\n \n \n \n \n Created comprehensive module description for advanced prompt engineering and LLM application development\n Based on course context showing Module 2 follows foundational Module 1, this module focuses on advanced prompt engineering techniques, structured outputs, and building production-ready LLM applications. Research shows prompt engineering and structured output generation are critical skills for 2025 AI engineering, with emphasis on iterative optimization and reliability.\n \n The Prompt Report: A Systematic Survey of Prompt Engineering Techniques - comprehensive taxonomy of 58 LLM prompting techniques\n Anthropic's context engineering best practices emphasizing high-signal tokens and minimal viable context\n OpenAI Structured Outputs achieving 100% reliability in schema adherence\n \n \n \n Defined 5 learning objectives covering prompt optimization, structured outputs, iterative refinement, cost management, and production deployment\n Research indicates that modern AI engineering requires mastery of prompt optimization techniques, structured output generation for reliability, iterative refinement processes, cost/performance tradeoffs, and production deployment patterns. 
These objectives align with 2025 industry best practices emphasizing reliability over one-shot prompting.\n \n Prompt optimization as architecting reliable outputs, not just prompt tuning\n Automatic Prompt Optimization survey showing importance of systematic improvement\n PromptWizard demonstrating feedback-driven iterative refinement achieving superior results\n \n \n \n Created 8 primary research topics covering prompt engineering techniques, structured outputs, optimization methods, temperature/sampling, context window management, cost optimization, evaluation frameworks, and production patterns\n These topics reflect current industry priorities in 2025: systematic prompt optimization over ad-hoc prompting, structured outputs for reliability, understanding model parameters, managing context efficiently, cost-performance tradeoffs, rigorous evaluation, and production deployment. Research shows these are foundational for production AI systems.\n \n 2025 prompt engineering covering temperature, context windows, iterative refinement, and prompt chaining\n Microsoft Azure prompt engineering techniques including few-shot learning and primary content handling\n Academic research showing continuous optimization compounds to 156% improvement over 12 months\n \n \n \n Designed 3 project briefs: Prompt Laboratory, Structured Data Extractor, and Decision Support Agent\n Projects progress from systematic prompt experimentation to structured output generation to complex decision-making applications. This sequence builds skills incrementally while addressing real production needs. 
Research shows structured outputs and systematic optimization are critical for 2025 production systems.\n \n OpenAI's GPT-5 Prompt Optimizer demonstrating systematic prompt improvement workflows\n LangChain structured output patterns using Pydantic and JSON Schema\n SchemaBench research showing LLMs still struggle with valid JSON generation, requiring systematic approaches\n \n \n \n Created 3 conceptual twist ideas: The Contrarian Advisor, The Unreliable Oracle, and The Meta-Prompter\n These twists reframe the problem space philosophically rather than adding technical features. They encourage learners to think about AI behavior, reliability, and self-improvement in novel ways. The Meta-Prompter twist is particularly relevant given 2025 research on using LLMs to optimize their own prompts.\n \n GPT-5 prompting guide showing success using models as meta-prompters for themselves\n Meta-prompting as powerful technique for improving prompt effectiveness\n \n \n \n Defined skill categories for Python fundamentals, LLM APIs, development tools, and AI engineering practices\n Skills selected based on practical requirements for building production LLM applications in 2025. Emphasis on API interaction patterns, JSON handling for structured outputs, version control for prompt management, environment management, and testing/evaluation frameworks. Research shows these are essential for production AI engineering.\n \n LLMOps 2025 emphasizing prompt versioning, evaluation datasets, and production deployment\n 2025 LLM application development emphasizing structured outputs and CI/CD integration\n \n \n \n \n \n \n
Projects/Twists - conceptual twists may need refinement based on facilitator experience
\n
\n
\n
\n\n This module explores advanced prompt engineering techniques and building reliable LLM-powered applications. You'll learn to systematically optimize prompts through iterative refinement, generate structured outputs that integrate seamlessly with code, and deploy production-ready LLM applications. We'll build a prompt experimentation laboratory, a structured data extraction system, and an intelligent decision support agent—all while mastering the engineering practices that separate prototype demos from production systems.\n\n \n \n You'll develop a disciplined approach to prompt engineering, moving beyond trial-and-error to systematic experimentation. You'll understand how to decompose prompts into components (instructions, context, examples, constraints), measure their effectiveness quantitatively, and apply iterative refinement techniques. You'll learn to use few-shot learning strategically, apply chain-of-thought reasoning where appropriate, and optimize prompts based on empirical evidence rather than intuition.\n \n \n You'll master techniques for generating reliable, parseable outputs from LLMs using JSON schemas, Pydantic models, and function calling. You'll understand the difference between prompting for structure vs. enforcing structure through API features, handle validation and error cases gracefully, and design schemas that balance flexibility with reliability. You'll learn when to use strict mode, JSON mode, or tool calling based on your application requirements.\n \n \n You'll build evaluation frameworks to measure prompt performance objectively, create test datasets that capture edge cases, and implement feedback loops for continuous improvement. You'll learn to identify failure modes, diagnose why prompts fail, and systematically address issues. 
You'll understand that prompt engineering is an ongoing process, not a one-time task, and develop workflows that enable continuous optimization.\n \n \n You'll develop practical skills in balancing model capability, latency, and cost for production applications. You'll learn to choose appropriate models for different tasks, optimize token usage through context management, use caching strategically, and implement rate limiting. You'll understand the cost implications of different prompting strategies and make informed tradeoffs between quality and expense.\n \n \n You'll learn patterns for deploying LLM applications reliably, including error handling for non-deterministic outputs, implementing retry logic with exponential backoff, managing API keys and environment configuration, logging for debugging and improvement, and monitoring for performance degradation. You'll understand how to move from Jupyter notebooks to production-ready code.\n \n \n\n \n \n \n Research the taxonomy of modern prompt engineering techniques for 2025. Start with \"The Prompt Report\" (arXiv 2406.06608) which catalogs 58 prompting techniques. Focus on understanding when to use zero-shot vs. few-shot prompting, how chain-of-thought reasoning improves complex tasks, the role of delimiters in structuring prompts, and prompt chaining for multi-step workflows. Pay attention to the difference between techniques that work well in demos vs. production.\n \n Investigate how to select effective examples for few-shot prompting. Research shows that example quality matters more than quantity—explore how to curate diverse, canonical examples that represent desired behavior. Look into the \"Lost in the Middle\" problem where examples in the middle of long prompts get ignored. Divide research: one person focuses on example selection strategies, another on example placement and formatting.\n \n \n Explore when and how to use chain-of-thought (CoT) prompting. 
Research the difference between explicit CoT (\"think step by step\") and implicit reasoning. Investigate newer reasoning models (o1, o3, DeepSeek R1) that have reasoning \"baked in\" and how prompting differs for them. One person can focus on CoT for mathematical/logical tasks, another on reasoning for ambiguous real-world scenarios.\n \n \n Study Anthropic's concept of \"context engineering\" as the evolution of prompt engineering. Research how to maximize the utility of tokens within attention budgets, techniques for context compaction, and strategies for managing long-context scenarios. Explore the principle of \"smallest set of high-signal tokens\" and how it applies to production systems.\n \n \n\n \n Investigate methods for generating reliable structured outputs from LLMs. Start with OpenAI's Structured Outputs documentation and LangChain's with_structured_output() patterns. Research the differences between JSON mode, function calling, and strict mode. Explore JSON Schema as a specification language and how different providers (OpenAI, Anthropic, open-source models) handle structured generation. Look into reliability statistics—OpenAI claims 100% schema adherence with strict mode vs. 35% with prompting alone.\n \n Deep dive into JSON Schema as a specification language and Pydantic as a Python validation framework. Research how to design schemas that are neither too rigid nor too loose, handle optional vs. required fields, specify constraints (min/max, enums, patterns), and nest complex objects. Look into validation error handling and schema evolution over time.\n \n \n Explore function calling APIs across different providers. Research how to define function signatures, when the model should call vs. respond directly, handling multi-turn tool use conversations, and parallel function calling. 
Investigate the relationship between function calling and structured outputs—when to use each approach.\n \n \n Research advanced techniques like grammar-based decoding (Outlines, Jsonformer) that constrain token generation at runtime. Understand how these approaches differ from prompt-based methods and when they're necessary. Explore the tradeoffs between flexibility and reliability.\n \n \n\n \n Study systematic approaches to prompt optimization beyond manual iteration. Research automatic prompt optimization (APO) techniques, meta-prompting where LLMs improve their own prompts, and tools like OpenAI's Prompt Optimizer. Investigate \"local prompt optimization\" which focuses edits on specific sections rather than rewriting entire prompts. Explore how to measure prompt performance quantitatively and establish optimization metrics.\n \n Research how to evaluate prompt performance objectively. Explore metrics like accuracy, task completion rate, schema adherence, hallucination detection, and cost per successful output. Investigate evaluation frameworks and how to build golden datasets. Look into failure mode analysis and systematic debugging approaches.\n \n \n Study workflows for continuous prompt improvement. Research A/B testing for prompts, version control strategies, feedback collection mechanisms, and how to incorporate user corrections. Explore the concept that prompt optimization should compound over time—research shows 156% improvement over 12 months with systematic processes.\n \n \n\n \n Research how model parameters affect output behavior. Focus on temperature (creativity vs. determinism), top-p/nucleus sampling, frequency/presence penalties, and max tokens. Understand the guidance that \"temperature 0 is best for factual use cases\" and when higher temperatures are appropriate. 
Explore how these parameters interact with structured output requirements and cost implications.\n \n\n \n Investigate practical strategies for working within context window limits. Research token counting, context window sizes across different models (4K to 2M+ tokens), strategies for summarization when context exceeds limits, and sliding window approaches. Explore caching mechanisms for repeated context and the cost implications of large context windows. Study the \"Lost in the Middle\" research showing that LLMs struggle with information buried in long contexts.\n \n\n \n Research the cost-performance tradeoffs across different models. Investigate pricing structures (per-token costs, caching discounts, batch processing), strategies for using smaller models where appropriate, and techniques for reducing token usage. Explore when to use flagship models (GPT-5, Claude Sonnet) vs. smaller models (GPT-4o-mini, Haiku), and how to route requests intelligently based on task complexity.\n \n\n \n Study patterns for handling the non-deterministic nature of LLMs in production. Research retry strategies with exponential backoff, fallback mechanisms when structured outputs fail validation, timeout handling, rate limit management, and circuit breaker patterns. Explore logging and monitoring approaches specific to LLM applications. Investigate how to handle partial failures and maintain system reliability despite occasional LLM errors.\n \n\n \n Research best practices for deploying LLM applications to production. Investigate environment management (API keys, configuration), secrets management, deployment architectures (serverless vs. containers), monitoring and observability, cost tracking, and A/B testing in production. 
Explore LLMOps practices emerging in 2025, including prompt versioning, evaluation datasets, and continuous improvement pipelines.\n \n \n\n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n Build an experimentation framework for systematically testing and optimizing prompts with quantitative evaluation\n Prompt decomposition, A/B testing, evaluation metrics, version control, iterative refinement workflows\n \n - Define at least one concrete task (e.g., summarization, classification, extraction, generation)\n - Create a test dataset with at least 20 examples covering normal cases and edge cases\n - Implement prompt versioning to track changes over time\n - Build an evaluation system that scores prompts quantitatively (accuracy, quality, cost, latency)\n - Test at least 5 different prompt variations systematically\n - Visualize results to compare prompt performance\n - Document what worked, what didn't, and why\n - Calculate cost per successful output for each prompt variant\n \n \n \n - Break prompts into components: system message, instructions, context, examples, constraints, output format\n - Understand which components affect which aspects of output quality\n - Experiment with component ordering and formatting\n - Use delimiters to structure prompt sections clearly\n \n \n - Define success metrics appropriate to your task (accuracy, F1, BLEU, human ratings, schema compliance)\n - Build evaluation functions that score outputs automatically where possible\n - Create golden datasets with expected outputs\n - Calculate aggregate statistics (mean, median, std dev) across test cases\n - Identify failure modes and categorize errors\n \n \n - Change one variable at a time to isolate effects\n - Use version control (git) to track prompt changes\n - Document hypotheses before testing (what you expect to improve and why)\n - Run multiple trials to account for non-determinism\n - Use temperature=0 for reproducibility during testing\n - Build comparison views to see prompt 
variants side-by-side\n \n \n - Count tokens in prompts and completions\n - Calculate cost per request based on model pricing\n - Track total cost across evaluation runs\n - Identify opportunities to reduce token usage without sacrificing quality\n - Compare cost-effectiveness across prompt variants\n \n \n - Select diverse, high-quality examples that represent desired behavior\n - Experiment with 0-shot, 1-shot, 3-shot, 5-shot variants\n - Format examples consistently (input/output pairs)\n - Test example placement (beginning vs. end of prompt)\n - Measure the marginal benefit of additional examples vs. cost\n \n \n \n \n Build a system that classifies customer support emails by urgency (low, medium, high, critical) and category (billing, technical, account, feature request). Create a test set of 30 emails, experiment with different instruction phrasings, test few-shot examples, and measure classification accuracy. Track which prompt variants reduce false positives for \"critical\" urgency.\n \n \n Create a prompt that generates structured meeting summaries with sections: decisions made, action items (with owners), open questions, and next steps. Test on transcripts of varying lengths and meeting types. Experiment with chain-of-thought reasoning (\"First identify all decisions, then extract action items...\") vs. direct instruction. Measure completeness (did it catch all action items?) and accuracy.\n \n \n Build prompts that review code snippets for common issues: security vulnerabilities, performance problems, style violations, and logical errors. Test on code samples with known issues. Experiment with providing context about the codebase, using examples of good vs. bad code, and different output formats. Measure false positive rate and issue detection rate.\n \n \n Create prompts that generate compelling product descriptions from structured product data (specs, features, price). Test consistency in tone, accuracy in representing features, and persuasiveness. 
Experiment with persona instructions (\"Write as an enthusiastic tech reviewer\" vs. \"Write as a pragmatic buyer's guide\"). Measure which variants lead to descriptions that match brand voice.\n \n \n \n\n \n Build a system that extracts structured information from unstructured text with reliable schema adherence and validation\n JSON Schema, Pydantic models, function calling, validation, error handling, schema design\n \n - Define a clear extraction task with structured output requirements\n - Design a JSON Schema or Pydantic model that captures the data structure\n - Implement extraction using structured output features (function calling, JSON mode, or strict mode)\n - Handle validation errors gracefully with retry logic\n - Test on at least 20 diverse input examples including edge cases\n - Achieve >90% schema compliance on test cases\n - Measure and report extraction accuracy, cost per extraction, and failure modes\n - Provide clear error messages when extraction fails\n \n \n \n - Design schemas that balance specificity with flexibility\n - Use appropriate JSON Schema constraints (required fields, types, enums, patterns, min/max)\n - Nest objects and arrays appropriately for complex data\n - Choose between strict schemas (fail on invalid) vs. 
lenient schemas (extract what you can)\n - Document schema fields with clear descriptions that guide the LLM\n \n \n - Define Pydantic classes with type hints and Field descriptions\n - Use validators for custom validation logic\n - Handle optional fields with defaults\n - Serialize/deserialize Pydantic objects to JSON\n - Leverage Pydantic's validation error messages\n \n \n - Use with_structured_output() in LangChain or equivalent in other frameworks\n - Understand differences between JSON mode (prompt-based), function calling, and strict mode\n - Choose the appropriate method based on model capabilities and reliability requirements\n - Handle the case where models refuse to generate structured output\n - Parse and validate structured outputs programmatically\n \n \n - Catch validation errors and extract meaningful error messages\n - Implement retry logic with modified prompts when validation fails\n - Log failures for debugging and improvement\n - Provide fallback behavior when extraction fails repeatedly\n - Distinguish between schema violations vs. missing information in input\n \n \n - Write clear extraction instructions that reference schema fields\n - Provide examples of input text and expected output structure\n - Handle cases where information is missing or ambiguous\n - Instruct the model on how to handle optional fields\n - Test prompt variations that improve extraction accuracy\n \n \n \n \n Extract structured data from resumes: contact info, education (degree, institution, year), work experience (company, title, dates, description), and skills. Handle varying resume formats (chronological, functional, hybrid). Design a schema with nested objects for education and experience arrays. Test on resumes with missing information, ambiguous dates, and multiple formats. 
Measure extraction accuracy per field.\n \n \n Parse invoices to extract: vendor name, invoice number, date, line items (description, quantity, unit price, total), subtotal, tax, and total amount. Handle invoices in different formats and layouts. Design a schema that captures line items as an array of objects. Test on invoices with varying numbers of line items, different tax structures, and OCR errors. Calculate extraction accuracy and cost per invoice.\n \n \n Extract structured information from news articles: headline, publication date, author, main entities mentioned (people, organizations, locations), key topics/themes, sentiment (positive/negative/neutral), and a one-sentence summary. Design a schema with arrays for entities and topics. Test on articles from different sources and topics. Measure entity extraction recall and sentiment accuracy.\n \n \n Extract structured insights from customer feedback: sentiment score (1-5), main issue category (from predefined list), specific product/feature mentioned, priority level (low/medium/high), and suggested action. Design a schema with enums for categories and priority. Test on diverse feedback (short vs. long, positive vs. negative, clear vs. ambiguous). 
Measure classification accuracy and agreement with human labels.\n \n \n \n\n \n Build an intelligent agent that helps users make informed decisions by gathering information, analyzing options, and providing structured recommendations\n Multi-turn conversations, context management, tool use, reasoning transparency, cost optimization\n \n - Define a clear decision domain (e.g., choosing technology, planning projects, evaluating options)\n - Implement multi-turn conversation with context persistence\n - Use structured outputs for recommendations (options, pros/cons, confidence scores)\n - Provide reasoning transparency (show how the agent arrived at recommendations)\n - Implement at least one tool/function the agent can call (e.g., search, calculation, data lookup)\n - Handle ambiguous user requests by asking clarifying questions\n - Test with at least 5 complete decision scenarios\n - Optimize for cost by managing context efficiently\n - Measure user satisfaction, recommendation quality, and decision time\n \n \n \n - Maintain conversation history across multiple turns\n - Manage context window limits by summarizing or pruning old messages\n - Track conversation state (what information has been gathered, what's still needed)\n - Handle topic switches and context resets\n - Implement conversation memory (short-term and long-term)\n \n \n - Define function signatures with clear descriptions and parameters\n - Implement functions that the agent can call (search, calculate, lookup data)\n - Parse function call responses and integrate results into conversation\n - Handle function call failures gracefully\n - Decide when to call functions vs. 
rely on model knowledge\n \n \n - Prompt the agent to explain its reasoning (chain-of-thought)\n - Structure recommendations with clear pros/cons/tradeoffs\n - Provide confidence scores or uncertainty indicators\n - Show which information sources influenced recommendations\n - Make reasoning transparent enough for users to validate or challenge\n \n \n - Detect when user requests are ambiguous or underspecified\n - Ask targeted clarifying questions to gather missing information\n - Provide options when multiple interpretations are possible\n - Confirm understanding before providing final recommendations\n - Handle changing user preferences during the conversation\n \n \n - Output recommendations in a consistent structured format\n - Include: options ranked by suitability, key criteria evaluated, pros/cons for each option\n - Provide actionable next steps\n - Cite sources or reasoning for each recommendation\n - Allow users to adjust criteria weights and see updated recommendations\n \n \n \n \n Build an agent that helps developers choose technology stacks for new projects. Ask clarifying questions about project requirements (scale, team size, timeline, constraints). Provide structured recommendations for frameworks, databases, hosting, etc. with pros/cons for each option. Implement a tool to look up current GitHub stars, recent releases, or documentation quality. Test with scenarios: \"I'm building a real-time chat app\", \"I need a static site for a small business\", \"I'm prototyping an ML application\".\n \n \n Create an agent that helps users plan career transitions. Gather information about current skills, interests, constraints, and goals through conversation. Recommend career paths with required skills, typical timelines, salary ranges, and job market outlook. Provide structured learning plans with specific resources. Implement tools to look up job postings or skill requirements. 
Test with scenarios: \"I'm a teacher wanting to move into tech\", \"I'm a developer interested in management\", \"I want to specialize in AI\".\n \n \n Build an agent that helps teams prioritize projects or features. Gather information about each option: effort estimate, business value, dependencies, risks. Ask clarifying questions about team capacity and strategic goals. Provide structured recommendations with priority scores and reasoning. Implement a tool to calculate weighted scores based on user-defined criteria. Test with scenarios: different team sizes, conflicting priorities, resource constraints.\n \n \n Create an agent that helps plan trips by analyzing preferences, constraints, and options. Ask about budget, interests, travel dates, and must-see destinations. Recommend structured itineraries with daily schedules, estimated costs, and logistics. Implement tools to look up travel times, opening hours, or weather. Provide alternatives when constraints conflict (e.g., \"You can't visit all 5 cities in 3 days—here are 3 optimized options\"). 
Test with various trip types: family vacation, business trip, adventure travel.\n \n \n \n \n\n \n \n Your agent must always present the strongest possible case AGAINST the user's initial preference before offering balanced recommendations\n \n If user says \"I want to use React\", agent must first articulate compelling reasons to consider Vue, Svelte, or vanilla JS, then provide balanced analysis\n For career advice, if user leans toward management, agent must first explore individual contributor paths deeply before comparing options\n In project prioritization, if team wants to build feature X, agent must thoroughly challenge that assumption with alternative approaches\n \n \n\n \n Your system deliberately introduces controlled uncertainty—it must generate multiple plausible but different recommendations and explain why it cannot be certain which is best\n \n For tech stack selection, generate 3 equally defensible but architecturally different solutions, explaining the assumptions under which each is optimal\n In data extraction, when information is ambiguous, return multiple possible interpretations with confidence scores instead of picking one\n For prompt optimization, show how small prompt changes lead to different but valid outputs, exploring the sensitivity of the system\n \n \n\n \n Your system uses an LLM to continuously improve its own prompts based on performance feedback, creating a self-optimizing loop\n \n After each extraction failure, use an LLM to analyze the failure and suggest prompt modifications, then test the modified prompt\n For decision support, use an LLM to review conversation transcripts and suggest improvements to clarifying questions or recommendation formats\n In the prompt laboratory, use an LLM to generate new prompt variations based on what worked/didn't work in previous experiments\n \n \n \n \n\n \n \n Core Python skills needed to build LLM applications, focusing on practical patterns for API interaction, data handling, and 
error management.\n \n - Parse JSON strings to Python dictionaries with json.loads()\n - Serialize Python objects to JSON with json.dumps()\n - Handle JSON parsing errors gracefully\n - Pretty-print JSON for debugging\n - Validate JSON structure programmatically\n \n \n - Use python-dotenv to load API keys from .env files\n - Access environment variables with os.environ\n - Keep secrets out of version control\n - Manage different configurations for development vs. production\n \n \n - Use try/except blocks to catch API errors\n - Distinguish between different error types (rate limit, invalid request, network error)\n - Implement retry logic with exponential backoff\n - Log errors with context for debugging\n \n \n - Read and write text files for prompts and test data\n - Use pathlib for cross-platform file paths\n - Handle file encoding issues (UTF-8)\n - Process files in batches for large datasets\n \n \n - Use list comprehensions for transforming data\n - Filter and map over collections\n - Work with nested data structures (lists of dicts)\n - Use enumerate() and zip() effectively\n \n \n\n \n Practical skills for interacting with LLM APIs, using modern libraries, and handling common patterns in LLM application development.\n \n - Initialize client with API key\n - Make chat completion requests with messages array\n - Use system, user, and assistant message roles\n - Set parameters: temperature, max_tokens, top_p\n - Handle streaming responses\n - Use structured outputs with response_format\n \n \n - Initialize chat models with init_chat_model()\n - Use with_structured_output() for reliable parsing\n - Chain operations with LCEL (LangChain Expression Language)\n - Handle prompt templates\n - Use output parsers for structured data\n \n \n - Understand API differences across providers\n - Use Claude's tool use API\n - Work with Gemini's multimodal inputs\n - Compare pricing and capabilities across providers\n \n \n - Use tiktoken to count tokens in strings\n - 
Estimate costs before making API calls\n - Track token usage across requests\n - Optimize prompts to reduce token count\n \n \n - Implement exponential backoff for retries\n - Handle rate limit errors (429 status codes)\n - Use tenacity library for retry logic\n - Implement request queuing for batch processing\n \n \n\n \n Essential tools for managing code, dependencies, and development workflows in AI engineering projects.\n \n - Track prompt changes in version control\n - Use meaningful commit messages for prompt iterations\n - Create branches for experimental prompts\n - Review prompt diffs to understand what changed\n - Tag successful prompt versions\n \n \n - Create isolated Python environments with venv\n - Manage dependencies with requirements.txt\n - Understand why dependency isolation matters\n - Activate/deactivate environments\n \n \n - Use notebooks for interactive prompt development\n - Display rich outputs (dataframes, visualizations)\n - Export notebooks to Python scripts\n - Understand when to move from notebooks to scripts\n \n \n - Use Python's logging module\n - Log prompts, responses, and errors with context\n - Set appropriate log levels (DEBUG, INFO, ERROR)\n - Review logs to diagnose issues\n - Implement structured logging for production\n \n \n\n \n Practices specific to building reliable, maintainable, and cost-effective AI applications in production environments.\n \n - Store prompts in separate files or databases\n - Version prompts alongside code\n - Document prompt changes and performance impacts\n - Implement A/B testing for prompt variants\n - Roll back to previous prompts when needed\n \n \n - Collect diverse test cases including edge cases\n - Create golden outputs for comparison\n - Balance dataset across different input types\n - Update datasets as you discover new failure modes\n - Version datasets alongside prompts\n \n \n - Define metrics appropriate to your task\n - Implement automated evaluation where possible\n - Use human 
evaluation for subjective quality\n - Track metrics over time to detect degradation\n - Compare performance across prompt/model variants\n \n \n - Track API costs per request and in aggregate\n - Set budget alerts\n - Optimize token usage to reduce costs\n - Compare cost-effectiveness across models\n - Implement caching to reduce redundant requests\n \n \n - Validate LLM outputs programmatically\n - Implement fallback behaviors when validation fails\n - Retry with modified prompts on failure\n - Set maximum retry limits to prevent infinite loops\n - Log failures for analysis and improvement\n \n \n \n\n \n This module assumes learners have completed Module 1 covering basic LLM interaction, prompt fundamentals, and simple application development. Module 2 builds on that foundation with systematic optimization, production patterns, and reliability engineering.\n The emphasis on \"systematic\" and \"iterative\" reflects 2025 industry best practices. Research shows that continuous optimization compounds significantly over time, with academic studies documenting 156% improvement over 12 months compared to static prompts. Encourage learners to think of prompt engineering as an ongoing discipline, not a one-time task.\n Structured outputs are increasingly critical for production systems. OpenAI's strict mode achieves 100% schema adherence vs. 35% with prompting alone. Help learners understand when to use different approaches: prompt-based for flexibility, function calling for tool integration, strict mode for reliability.\n Cost awareness is essential. LLM applications can become expensive quickly in production. Encourage learners to track costs throughout development, experiment with smaller models where appropriate, and optimize token usage. 
The best prompt is often the one that achieves acceptable quality at the lowest cost.\n The projects progress in complexity: Prompt Laboratory focuses on systematic experimentation, Structured Data Extractor adds reliability requirements, and Decision Support Agent combines multiple skills in a complex application. Learners can work on projects in parallel based on interest, but encourage completing Prompt Laboratory first as it builds evaluation skills needed for the others.\n The \"twists\" are intentionally conceptual rather than technical. They're designed to make learners think differently about AI behavior and reliability. The Contrarian Advisor encourages critical thinking, The Unreliable Oracle explores uncertainty and confidence, and The Meta-Prompter introduces self-improvement loops. These are stretch goals for learners who finish early.\n \n
", + "generatedAt": "2025-10-24T22:20:02.205Z" + } + } + ] + } + ], + "createdAt": "2025-10-14T15:59:49.138Z", + "updatedAt": "2025-10-14T15:59:49.138Z", + "courseNarrative": "The AI Engineering Apprenticeship is designed to transform experienced developers into skilled AI engineers capable of building production-grade AI applications and integrating AI-enhanced workflows into their daily practice. Over 5 weeks, learners progress through 2 carefully sequenced arcs that build from foundational concepts to advanced implementation and real-world application. The journey begins with establishing core AI engineering skills—prompt engineering, working with LLMs, and understanding AI-assisted development workflows. Learners then advance through building intelligent applications with RAG systems and vector databases.\n\nThis curriculum reflects the current state of the industry in 2025, where AI engineering has emerged as a critical discipline combining software engineering expertise with AI-specific skills. The course emphasizes practical, hands-on learning appropriate for a facilitated structure, with each weekly session building on previous knowledge while allowing learners to immediately apply concepts in their professional work. By focusing on both the engineering fundamentals and the unique challenges of AI systems—such as prompt optimization, handling non-deterministic outputs, cost management, and ethical considerations—learners develop the comprehensive skill set needed to succeed in this rapidly evolving field.", + "progressionNarrative": "The 2 arcs of this apprenticeship are designed to build progressively more sophisticated AI engineering capabilities. Arc 1 establishes the foundation by teaching learners to work effectively with LLMs through prompt engineering and basic application development. Arc 2 extends these capabilities by introducing knowledge systems through RAG, enabling learners to build applications grounded in proprietary data." 
+} \ No newline at end of file diff --git a/docs/archives/outputs/themis/v0_0_1-2025-10-15.xml b/docs/archives/outputs/themis/v0_0_1-2025-10-15.xml deleted file mode 100644 index abf9cd2..0000000 --- a/docs/archives/outputs/themis/v0_0_1-2025-10-15.xml +++ /dev/null @@ -1,398 +0,0 @@ - - - - - 5 - 18 - - - - - - - - - - - - 20 - 4 - - - - The AI Engineering Apprenticeship takes experienced developers on a transformative 42-week journey from AI-curious practitioners to confident AI engineers capable of building production-ready intelligent systems. The course recognizes that modern AI engineering requires both breadth and depth: understanding how to effectively leverage large language models and AI-enhanced workflows while also grasping the underlying principles that make these systems work. Throughout the year, learners will progress from foundational prompt engineering and model integration to sophisticated agentic systems, knowledge engineering, and ultimately production deployment and optimization. - The course is structured around five thematic arcs that build technical sophistication while maintaining practical applicability. Early arcs focus on establishing core competencies in AI interaction patterns and development workflows, middle arcs explore advanced architectural patterns for agentic and knowledge-based systems, and later arcs synthesize these skills into production-ready applications through intensive project work. Each arc is designed to be thematically coherent while building on capabilities developed in previous phases, ensuring learners can immediately apply their growing expertise to real-world challenges in their current roles. - - - - The five arcs of the AI Engineering Apprenticeship create a carefully sequenced journey that builds both technical capability and practical judgment. 
The course begins with Foundations that establish core AI engineering skills applicable across all domains, then explores two sophisticated architectural patterns in parallel—Agentic Systems for autonomous task completion and Knowledge Systems for grounding AI in factual information. These middle arcs are thematically independent, allowing learners to develop deep expertise in distinct areas of AI engineering. The Production AI Systems arc then synthesizes these capabilities by addressing the operational concerns that separate prototypes from production systems: evaluation, observability, and optimization. - The Capstone arc serves as both culmination and demonstration of mastery, requiring learners to integrate techniques from all previous arcs into a cohesive, production-ready application. This final phase transforms accumulated knowledge into practical competence through the challenges of real-world system building. Throughout the course, the facilitated structure enables peer learning, with experienced developers sharing insights from their domains while learning from each other's approaches to common challenges. By course end, learners will have transformed from AI-curious developers into confident AI engineers capable of architecting, building, and operating sophisticated AI systems in their professional contexts. - - - - - Establishing core competencies in AI model interaction, prompt engineering fundamentals, and integration patterns that form the foundation for all subsequent AI engineering work. Includes incorporating AI-assisted workflows in the development process. - - - - - - - - - - - - This arc establishes the foundational mindset and technical skills required for AI engineering, focusing on how developers interact with, integrate, and leverage AI models in their daily work. Learners will develop fluency in prompt engineering, understand model capabilities and limitations, and begin incorporating AI into their development workflows. 
The emphasis is on practical integration patterns and building intuition about when and how to apply AI solutions. - - - - The arc progresses from understanding AI model fundamentals and basic interaction patterns to sophisticated prompt engineering techniques, then to practical integration architectures, and finally to AI-enhanced development workflows. Each module builds technical depth while expanding the contexts in which learners can confidently apply AI, moving from controlled experimentation to production integration patterns. By the end, learners will have transformed their daily development practice to leverage AI as a core tool. - - - - - Introduction to large language models, their capabilities, limitations, and basic interaction patterns. Learners will develop mental models for how AI systems process information and generate responses, establishing the foundation for all subsequent AI engineering work. Hands-on experimentation with multiple model providers and APIs builds practical familiarity. - - - - - - - - - - - Explain the architecture and capabilities of modern large language models including transformers and attention mechanisms - Interact effectively with AI models through multiple interfaces (chat, API, SDK) and evaluate response quality - Identify appropriate use cases for AI models based on their strengths and limitations - Implement basic API calls to multiple AI providers (OpenAI, Anthropic, etc.) 
with proper authentication and error handling - Analyze model outputs for quality, bias, and reliability in different contexts - - - Large Language Model architecture and capabilities - Model providers and API ecosystems (OpenAI, Anthropic, Google, open-source) - Tokenization, context windows, and generation parameters - Model limitations, hallucinations, and failure modes - Basic API integration patterns and SDKs - Cost considerations and rate limiting - - - - - - Deep dive into prompt engineering techniques that reliably elicit desired behaviors from AI models. Learners will master structured prompting, few-shot learning, chain-of-thought reasoning, and prompt optimization strategies. Emphasis on systematic experimentation and evaluation methodologies for prompt quality. - - - - - - - - - - - Design effective prompts using structured techniques including role-based, few-shot, and chain-of-thought prompting - Implement systematic prompt evaluation and iteration workflows to improve output quality - Apply advanced prompting patterns including self-consistency, tree-of-thought, and ReAct frameworks - Create reusable prompt templates with variable injection for common use cases - Evaluate and compare prompt performance across different models and contexts - - - Prompt anatomy and structure (system, user, assistant roles) - Few-shot learning and in-context learning techniques - Chain-of-thought and reasoning prompts - Prompt optimization and A/B testing methodologies - Advanced patterns: ReAct, self-consistency, constitutional AI - Prompt injection vulnerabilities and defensive prompting - - - - - - Practical patterns for integrating AI capabilities into existing applications and systems. Learners will explore architectural decisions around synchronous vs asynchronous processing, streaming responses, caching strategies, and fallback mechanisms. Focus on building robust, production-ready integrations that handle edge cases gracefully. 
- - - - - - - - - - - Design integration architectures that incorporate AI capabilities into existing application stacks - Implement streaming response patterns for improved user experience with long-running AI operations - Build robust error handling, retry logic, and fallback mechanisms for AI service failures - Apply caching strategies to optimize cost and latency for repeated or similar queries - Integrate AI capabilities with databases, APIs, and other backend services in a cohesive architecture - - - Synchronous vs asynchronous AI processing patterns - Streaming responses and server-sent events - Caching strategies (semantic caching, result caching) - Error handling, timeouts, and circuit breakers - Queue-based architectures for AI workloads - API gateway patterns and rate limit management - - - - - - Transforming the development process itself by incorporating AI assistants for code generation, review, testing, and documentation. Learners will establish effective workflows for pair programming with AI, automated code review, test generation, and documentation creation. Emphasis on maintaining code quality and developer agency while maximizing AI assistance. - - - - - - - - - - - Establish effective pair programming workflows with AI coding assistants (GitHub Copilot, Cursor, etc.) 
- Generate comprehensive test suites using AI assistance while maintaining test quality and coverage - Leverage AI for code review, refactoring suggestions, and technical debt identification - Automate documentation generation while ensuring accuracy and maintainability - Evaluate when AI assistance improves vs hinders development velocity and code quality - - - AI-powered IDEs and coding assistants - Prompt engineering for code generation - AI-assisted test generation and test-driven development - Automated code review and refactoring suggestions - Documentation generation and maintenance - Best practices for maintaining code quality with AI assistance - - - - - - - - - - Building autonomous and semi-autonomous AI agents that can plan, execute multi-step tasks, use tools, and interact with external systems. Learners will progress from simple tool-using agents to sophisticated multi-agent systems with memory and planning capabilities. - - - - - - - - - - - - This arc explores the frontier of AI engineering: building agents that can autonomously accomplish complex tasks through planning, tool use, and iterative problem-solving. Learners will understand the architectural patterns that enable agents to break down problems, select appropriate tools, and execute multi-step workflows with minimal human intervention. The focus shifts from direct model interaction to designing systems where AI agents act as autonomous problem-solvers. - - - - The arc begins with foundational concepts of tool use and function calling, enabling models to interact with external systems. It then progresses to planning and reasoning frameworks that allow agents to decompose complex tasks, followed by memory systems that enable stateful, context-aware interactions. Finally, learners explore multi-agent architectures where specialized agents collaborate to solve complex problems, building toward sophisticated agentic systems. 
- - - - - Enabling AI models to interact with external systems through structured function calling and tool use patterns. Learners will implement function schemas, handle tool selection and execution, and build reliable tool-calling loops. Focus on creating robust interfaces between AI reasoning and external actions. - - - - - - - - - - - Implement function calling interfaces using OpenAI and Anthropic tool use APIs - Design effective function schemas with clear descriptions, parameters, and validation rules - Build tool execution loops that handle tool selection, parameter extraction, and result integration - Create custom tools for database queries, API calls, file operations, and calculations - Handle errors and edge cases in tool execution with appropriate fallback strategies - - - Function calling APIs and tool use patterns - Function schema design and parameter specification - Tool selection and execution loops - Parallel tool calling and tool chaining - Error handling in tool execution - Building custom tool libraries and registries - - - - - - Implementing planning frameworks that enable agents to decompose complex tasks into executable steps. Learners will explore ReAct, plan-and-execute, and other reasoning patterns that allow agents to think through problems systematically. Emphasis on balancing autonomy with reliability and observability. 
- - - - - - - - - - - Implement ReAct (Reasoning + Acting) agents that interleave thinking and action steps - Build plan-and-execute agents that create task plans before execution - Design observation and reflection loops that allow agents to learn from execution results - Apply task decomposition strategies for breaking complex goals into manageable subtasks - Implement guardrails and human-in-the-loop patterns for agent oversight - - - ReAct framework and reasoning traces - Plan-and-execute architectures - Task decomposition and subgoal generation - Observation, reflection, and self-correction loops - Agent frameworks (LangGraph, CrewAI, AutoGPT patterns) - Human-in-the-loop and approval workflows - - - - - - Building agents with persistent memory that can maintain context across interactions and learn from past experiences. Learners will implement conversation memory, semantic memory retrieval, and episodic memory patterns. Focus on designing memory systems that scale while maintaining relevant context. - - - - - - - - - - Implement conversation memory systems that maintain context across multi-turn interactions - Build semantic memory using vector databases for relevant information retrieval - Design episodic memory patterns that store and retrieve past agent experiences - Apply memory summarization and compression techniques for long-running agents - Architect memory hierarchies that balance detail, relevance, and context window constraints - - - Conversation memory and context management - Semantic memory with vector embeddings - Episodic memory and experience replay - Memory summarization and compression - Memory retrieval strategies and relevance ranking - State persistence and session management - - - - - - Architecting systems where multiple specialized agents collaborate to solve complex problems. Learners will design agent communication protocols, task delegation patterns, and coordination mechanisms. 
Emphasis on when multi-agent architectures provide value over single-agent systems and how to manage complexity. - - - - - - - - - - - Design multi-agent architectures with specialized agents for distinct capabilities - Implement agent communication protocols and message passing patterns - Build coordination mechanisms including supervisory agents and democratic consensus - Apply task delegation and routing strategies based on agent capabilities - Evaluate trade-offs between single-agent and multi-agent approaches for different problem types - - - Multi-agent architecture patterns - Agent communication protocols and message formats - Supervisory agents and hierarchical coordination - Democratic and consensus-based agent systems - Task routing and agent specialization - Debugging and observability in multi-agent systems - - - - - - - - - - Building systems that effectively store, retrieve, and reason over large knowledge bases using embeddings, vector databases, and retrieval-augmented generation (RAG). Learners will master the full lifecycle of knowledge engineering from ingestion to intelligent retrieval. - - - - - - - - - - - - This arc focuses on enabling AI systems to access and reason over vast amounts of domain-specific knowledge through retrieval-augmented generation and knowledge base engineering. Learners will master the technical stack for transforming unstructured data into queryable knowledge systems, from embedding generation to sophisticated retrieval strategies. The emphasis is on building systems that ground AI responses in factual information while maintaining performance at scale. - - - - The arc begins with embeddings and vector similarity as the foundation for semantic search, then progresses to building complete RAG pipelines with document ingestion and retrieval. 
It advances to sophisticated retrieval strategies that improve accuracy and relevance, and culminates in graph-based knowledge systems that capture relationships and enable complex reasoning. Each module adds layers of sophistication to how systems store, access, and leverage knowledge. - - - - - - - - - - - - - Synthesizing all learned techniques to build production-ready AI applications with focus on evaluation, observability, optimization, and deployment. Learners will master the operational aspects of running AI systems at scale with reliability and cost-effectiveness. - - - - - - - - - - - - This arc bridges the gap between prototype AI systems and production-ready applications by focusing on evaluation, monitoring, optimization, and operational excellence. Learners will develop the engineering discipline required to deploy AI systems that meet production standards for reliability, performance, cost, and safety. The emphasis shifts from building features to building systems that can be maintained, debugged, and improved over time. - - - - The arc progresses from establishing rigorous evaluation frameworks that measure AI system quality, to implementing comprehensive observability that enables debugging and improvement, and finally to optimization strategies that balance quality, cost, and latency. Each module addresses a critical gap between experimental AI systems and production deployments, ensuring learners can confidently operate AI applications at scale. - - - - - - - - - - - - Applying all learned techniques to design, build, and deploy a comprehensive AI application that demonstrates mastery of AI engineering principles. Learners will work through the complete lifecycle from requirements gathering to production deployment. - - - - - - - - - - - - The capstone arc provides learners with the opportunity to synthesize all techniques learned throughout the course into a single, production-ready AI application. 
Working individually or in small teams, learners will navigate the full product development lifecycle including requirements analysis, architecture design, implementation, evaluation, and deployment. This arc emphasizes real-world constraints, trade-offs, and the integration challenges that arise when combining multiple AI techniques into cohesive systems. - - - The capstone progresses through three major phases: planning and architecture where learners define requirements and design their systems, implementation and iteration where they build and refine their applications based on evaluation results, and deployment and presentation where they launch their systems and demonstrate their capabilities. Each phase builds on the previous one, with regular check-ins and peer feedback ensuring learners stay on track and learn from each other's approaches. - - - - - - - - - - - diff --git a/docs/archives/outputs/themis/v0_0_2-2025-10-17.xml b/docs/archives/outputs/themis/v0_0_2-2025-10-17.xml deleted file mode 100644 index b51b7f0..0000000 --- a/docs/archives/outputs/themis/v0_0_2-2025-10-17.xml +++ /dev/null @@ -1,696 +0,0 @@ - - - - - 5 - 18 - - - - - - - - - - - - 20 - 4 - - - - The AI Engineering Apprenticeship takes experienced developers on a transformative 42-week journey from AI consumers to AI builders. The course recognizes that modern AI engineering requires both practical implementation skills with frontier AI models and a foundational understanding of underlying machine learning principles. Learners progress from understanding AI fundamentals and prompt engineering, through building production-grade AI applications, to mastering advanced techniques like fine-tuning and RAG systems, culminating in a comprehensive capstone project that synthesizes their new expertise. 
- This apprenticeship is structured around hands-on, project-based learning that respects participants' existing software engineering experience while systematically building AI-specific competencies. Each arc introduces progressively sophisticated AI engineering challenges, moving from individual workflows and tools to complex system architecture and deployment. The facilitated structure ensures learners benefit from peer collaboration, code reviews, and guided exploration of rapidly evolving AI technologies, preparing them to lead AI initiatives within their organizations. - - - - The AI Engineering Apprenticeship follows a carefully sequenced progression that builds from foundational skills through advanced techniques to comprehensive project execution. Arc 1 establishes essential competencies in AI-enhanced workflows and API integration that learners can immediately apply in their current roles. Arc 2 builds on this foundation by teaching application architecture and development patterns specific to AI systems, enabling participants to build complete features rather than just integrate APIs. Arc 3 introduces advanced techniques like RAG and fine-tuning that allow for sophisticated, domain-specific AI systems, while Arc 4 addresses the critical production concerns of deployment, monitoring, and responsible AI practices that distinguish prototype systems from enterprise-ready solutions. - The capstone arc (Arc 5) provides the opportunity to synthesize all prior learning into a substantial project that demonstrates mastery across the full AI engineering lifecycle. Throughout the course, the facilitated structure enables peer learning, collaborative problem-solving, and exposure to diverse approaches and use cases from the cohort. By the conclusion, learners will have transformed from developers who use AI tools into AI engineers capable of architecting, building, and operating sophisticated AI-powered systems within their organizations. 
- - - - - Establish core competencies in AI-enhanced development workflows, prompt engineering, and fundamental machine learning concepts that underpin modern AI systems. - - - - - - - - - - - - This foundational arc introduces learners to the AI engineering landscape, bridging their existing development expertise with AI-specific tools and methodologies. Participants explore how AI transforms the software development lifecycle while building essential skills in prompt engineering and understanding the machine learning foundations that power modern AI systems. - - - - The arc begins with AI-enhanced development workflows that immediately impact daily productivity, then progresses to systematic prompt engineering techniques. Learners then explore the machine learning fundamentals necessary to understand model behavior, concluding with practical API integration skills that enable immediate application of AI capabilities in existing systems. - - - - - Explore how AI coding assistants, automated testing, and intelligent tooling transform the software development lifecycle. Learners establish best practices for leveraging AI tools while maintaining code quality, understanding when to trust AI suggestions, and integrating these tools into existing development processes. - - - - - - - - - - - Effectively integrate AI coding assistants (GitHub Copilot, Cursor, etc.) 
into daily development workflows - Evaluate and validate AI-generated code for correctness, security, and performance - Apply AI tools for automated testing, documentation generation, and code review - Establish team practices for responsible AI-assisted development - Measure productivity improvements while maintaining code quality standards - - - AI coding assistants and IDE integrations - Prompt engineering for code generation - AI-assisted debugging and refactoring - Automated test generation and documentation - Code quality validation and security considerations - Team collaboration with AI tools - - - - - - Master systematic approaches to prompt design, understanding how to elicit desired behaviors from large language models. This module covers prompt patterns, few-shot learning, chain-of-thought reasoning, and techniques for improving consistency and reliability of LLM outputs. - - - - - - - - - - - Design effective prompts using established patterns and frameworks - Apply few-shot learning and chain-of-thought techniques to improve output quality - Implement prompt chaining and decomposition for complex tasks - Evaluate and iterate on prompt performance systematically - Understand limitations and failure modes of LLM-based systems - - - Prompt engineering fundamentals and best practices - Few-shot and zero-shot learning techniques - Chain-of-thought and reasoning frameworks - Prompt templates and systematic iteration - Output parsing and structured generation - Evaluation metrics for prompt effectiveness - - - - - - Build essential understanding of machine learning concepts that inform AI engineering decisions. This module demystifies neural networks, training processes, and model evaluation without requiring deep mathematical expertise, focusing on practical knowledge needed to work effectively with AI systems. 
- - - - - - - - - - - Explain core machine learning concepts including training, inference, and generalization - Understand neural network architectures and their applications to different problem types - Interpret model evaluation metrics and make informed decisions about model selection - Recognize common ML challenges including overfitting, bias, and data quality issues - Apply transfer learning concepts to leverage pre-trained models effectively - - - Supervised, unsupervised, and reinforcement learning paradigms - Neural network fundamentals and architectures - Training processes, loss functions, and optimization - Model evaluation and performance metrics - Overfitting, regularization, and generalization - Transfer learning and pre-trained models - - - - - - Develop practical skills for integrating AI capabilities into applications through APIs from OpenAI, Anthropic, Google, and other providers. Learners explore API design patterns, rate limiting, cost optimization, error handling, and building robust applications on top of AI services. - - - - - - - - - - - Integrate multiple AI provider APIs (OpenAI, Anthropic, Google, etc.) into applications - Implement robust error handling, retry logic, and fallback strategies - Optimize API usage for cost efficiency and performance - Design abstraction layers for provider-agnostic AI integration - Monitor and log AI API interactions for debugging and improvement - - - Major AI provider APIs and their capabilities - Authentication, rate limiting, and quota management - Streaming responses and asynchronous processing - Cost optimization and token management - Error handling and resilience patterns - API abstraction and multi-provider strategies - - - - - - - - - - Progress from API integration to building complete AI-powered applications with proper architecture, state management, and user experience design for AI features. 
- - - - - - - - - - - - This arc focuses on the engineering practices required to build production-quality applications that incorporate AI as a core feature. Learners explore architectural patterns specific to AI applications, manage conversational state and context, and design user experiences that account for AI's probabilistic nature and potential failures. - - - - Beginning with application architecture patterns, learners establish structural foundations for AI systems. The arc then addresses the unique challenges of managing conversational context and state in AI interactions, followed by specialized techniques for building AI agents with tool use. Finally, learners explore UX patterns and testing strategies specific to AI-powered features. - - - - - Explore architectural patterns for AI-powered applications, including microservices for AI features, queue-based processing, caching strategies, and separating AI logic from business logic. Learn to design systems that are maintainable, testable, and scalable as AI capabilities evolve. - - - - - - - - - - - Design application architectures that cleanly separate AI components from core business logic - Implement queue-based and asynchronous processing patterns for AI workloads - Apply caching strategies to optimize performance and reduce API costs - Build modular systems that accommodate multiple AI providers and models - Design for observability and debugging in AI-powered systems - - - Microservices and modular architecture for AI features - Queue-based processing and background jobs - Caching strategies for AI responses - Database design for AI application data - Configuration management for models and prompts - Logging, monitoring, and observability patterns - - - - - - Master techniques for managing conversational context, memory, and state in AI applications. 
This module covers conversation history management, context window optimization, summarization strategies, and designing multi-turn interactions that maintain coherence and relevance. - - - - - - - - - - - Implement effective conversation history management within token limits - Apply context window optimization techniques including summarization and selective retention - Design multi-turn conversational flows with appropriate state management - Build memory systems that retain relevant information across sessions - Handle context switching and topic management in extended conversations - - - Conversation history and context window management - Summarization techniques for long conversations - Memory systems and persistent context - Session management and state persistence - Context relevance scoring and pruning - Multi-turn conversation design patterns - - - - - - Build AI agents that can use tools, call functions, and interact with external systems. Learners explore function calling, tool selection strategies, agent frameworks, and patterns for creating autonomous systems that accomplish complex tasks through iterative reasoning and action. - - - - - - - - - - - Implement function calling and tool use with LLM APIs - Design tool schemas and descriptions that enable effective agent behavior - Build agents that chain multiple tool calls to accomplish complex tasks - Apply agent frameworks (LangChain, LlamaIndex, etc.) appropriately - Implement safety constraints and validation for agent actions - - - Function calling and structured outputs - Tool schema design and documentation - Agent reasoning patterns (ReAct, Plan-and-Execute) - Agent frameworks and orchestration tools - Error recovery and retry strategies for agents - Safety constraints and action validation - - - - - - Explore user experience patterns specific to AI-powered features, including handling uncertainty, managing user expectations, and designing for AI failures. 
Learn testing strategies for non-deterministic systems, including evaluation frameworks and human-in-the-loop validation. - - - - - - - - - - - Design user interfaces that appropriately communicate AI capabilities and limitations - Implement progressive disclosure and confidence indicators for AI outputs - Create effective loading states and feedback mechanisms for AI processing - Develop testing strategies for non-deterministic AI features - Build evaluation frameworks that combine automated metrics and human judgment - - - UX patterns for AI-powered features - Managing user expectations and trust - Loading states and streaming UI updates - Error handling and graceful degradation - Testing strategies for non-deterministic systems - Evaluation frameworks and human feedback loops - - - - - - - - - - Master advanced techniques including retrieval-augmented generation (RAG), embeddings, vector databases, and fine-tuning to build sophisticated AI systems tailored to specific domains and use cases. - - - - - - - - - - - - This arc elevates learners from working with general-purpose AI models to building specialized systems optimized for specific domains and knowledge bases. Through embeddings and RAG architectures, participants learn to ground AI systems in proprietary data, while fine-tuning techniques enable customization of model behavior for specialized tasks. - - - - The arc begins with embeddings and semantic search as foundational technologies, then progresses to building complete RAG systems that combine retrieval with generation. Learners then explore advanced RAG patterns for production systems, concluding with fine-tuning techniques that enable deeper model customization when RAG alone is insufficient. - - - - - Understand vector embeddings as numerical representations of meaning and build semantic search systems. 
This module covers embedding models, vector similarity metrics, and practical applications including search, clustering, and recommendation systems based on semantic understanding. - - - - - - - - - - - Explain how embeddings represent semantic meaning in vector space - Generate and work with embeddings from text, code, and other modalities - Implement semantic search using vector similarity metrics - Apply embeddings to clustering, classification, and recommendation tasks - Evaluate embedding quality and select appropriate models for different use cases - - - Vector embeddings and semantic representation - Embedding models (OpenAI, Sentence Transformers, etc.) - Similarity metrics (cosine, dot product, Euclidean) - Semantic search implementation - Clustering and classification with embeddings - Multi-modal embeddings and applications - - - - - - Build retrieval-augmented generation (RAG) systems that ground AI responses in specific knowledge bases. Learners explore vector database technologies, chunking strategies, retrieval techniques, and basic RAG architectures that combine search with generation for accurate, sourced responses. - - - - - - - - - - - Design and implement vector databases for efficient similarity search at scale - Apply effective chunking and preprocessing strategies for different document types - Build basic RAG pipelines that retrieve relevant context and generate grounded responses - Optimize retrieval quality through query transformation and reranking - Evaluate RAG system performance using relevance and accuracy metrics - - - Vector database technologies (Pinecone, Weaviate, Qdrant, etc.) 
- Document chunking and preprocessing strategies - Basic RAG architecture and implementation - Query transformation and expansion techniques - Reranking and retrieval optimization - RAG evaluation metrics and methodologies - - - - - - Explore sophisticated RAG architectures including hybrid search, multi-stage retrieval, agentic RAG, and graph-based approaches. This module covers production considerations including incremental indexing, metadata filtering, handling diverse data sources, and monitoring RAG system performance at scale. - - - - - - - - - - - Implement hybrid search combining semantic and keyword-based retrieval - Build multi-stage RAG pipelines with query routing and specialized retrievers - Design agentic RAG systems that iteratively refine retrieval and generation - Integrate diverse data sources and handle real-time index updates - Monitor and optimize RAG systems in production environments - - - Hybrid search and multi-stage retrieval - Query routing and specialized retrievers - Agentic and iterative RAG patterns - Graph-based RAG and knowledge graphs - Incremental indexing and real-time updates - Production monitoring and optimization - - - - - - Learn when and how to fine-tune models for specialized tasks and domains. This module covers fine-tuning techniques including full fine-tuning, LoRA, and prompt tuning, along with dataset preparation, training strategies, and evaluating custom models against base models and RAG alternatives. - - - - - - - - - - - Determine when fine-tuning is appropriate versus RAG or prompt engineering approaches - Prepare high-quality datasets for fine-tuning specific tasks - Implement fine-tuning using LoRA and other parameter-efficient techniques - Train and evaluate custom models for specialized domains or behaviors - Deploy and serve fine-tuned models in production environments - - - Fine-tuning vs. RAG vs. 
prompt engineering tradeoffs - Dataset preparation and quality for fine-tuning - Parameter-efficient fine-tuning (LoRA, QLoRA) - Training infrastructure and optimization - Model evaluation and comparison methodologies - Deployment and serving of custom models - - - - - - - - - - Develop expertise in deploying, monitoring, and maintaining AI systems in production, including MLOps practices, evaluation frameworks, safety considerations, and cost optimization. - - - - - - - - - - - - This arc addresses the critical gap between prototype AI applications and production-ready systems. Learners explore the operational, safety, and governance challenges unique to AI systems, developing practices for reliable deployment, continuous evaluation, and responsible AI development that meets enterprise requirements. - - - - Beginning with MLOps and deployment infrastructure, learners establish operational foundations for AI systems. The arc then addresses evaluation and monitoring frameworks necessary for maintaining quality, followed by comprehensive coverage of AI safety, security, and ethical considerations. The arc concludes with cost optimization and scaling strategies for sustainable production systems. - - - - - Master deployment patterns for AI systems including containerization, model versioning, A/B testing, and continuous integration/deployment adapted for AI workloads. Learners build deployment pipelines that handle the unique challenges of AI systems including model updates, prompt versioning, and managing multiple AI providers. 
- - - - - - - - - - - Design and implement CI/CD pipelines for AI-powered applications - Apply containerization and orchestration strategies for AI services - Implement model and prompt versioning with rollback capabilities - Build A/B testing frameworks for evaluating AI system changes - Manage multi-environment deployments with appropriate configuration management - Establish incident response procedures for AI system failures - - - CI/CD for AI applications - Containerization and orchestration (Docker, Kubernetes) - Model and prompt versioning strategies - A/B testing and gradual rollouts - Environment management and configuration - Deployment monitoring and rollback procedures - Infrastructure as code for AI systems - - - - - - Build comprehensive evaluation and monitoring systems for AI applications. This module covers automated evaluation frameworks, human-in-the-loop review processes, real-time monitoring, and observability practices that provide visibility into AI system behavior and performance in production. - - - - - - - - - - - Design automated evaluation frameworks combining metrics and LLM-as-judge approaches - Implement human-in-the-loop review and feedback collection systems - Build real-time monitoring dashboards for AI system health and performance - Apply observability practices to trace AI requests and debug issues - Establish alerting strategies for AI system degradation and failures - Create continuous evaluation loops for detecting model drift and quality issues - - - Automated evaluation frameworks and metrics - LLM-as-judge evaluation techniques - Human feedback collection and review workflows - Real-time monitoring and alerting - Distributed tracing for AI requests - Model drift detection and quality monitoring - Observability tools and dashboards - - - - - - Address safety, security, and ethical considerations in AI systems. 
Learners explore prompt injection and jailbreak prevention, content filtering, bias mitigation, privacy protection, and establishing governance frameworks for responsible AI development aligned with organizational values and regulatory requirements. - - - - - - - - - - - Identify and mitigate security vulnerabilities including prompt injection and data leakage - Implement content filtering and safety guardrails for AI outputs - Detect and address bias in AI systems across the development lifecycle - Apply privacy-preserving techniques and ensure compliance with data regulations - Establish governance frameworks and review processes for AI features - Design systems with appropriate human oversight and intervention capabilities - - - Prompt injection and jailbreak prevention - Content filtering and moderation - Bias detection and mitigation strategies - Privacy protection and data handling - Compliance with AI regulations (GDPR, AI Act, etc.) - AI governance frameworks and ethics review - Human-in-the-loop safeguards and oversight - Transparency and explainability practices - - - - - - - - - - Synthesize learning from all previous arcs by designing, building, and deploying a complete AI-powered system that demonstrates mastery of AI engineering practices from conception through production deployment. - - - - - - - - - - - - The capstone arc provides learners with the opportunity to demonstrate comprehensive AI engineering competency through a substantial project. Working individually or in small teams, participants apply the full spectrum of techniques learned—from architecture and development through deployment and operations—while receiving peer feedback and facilitated guidance. - - - - Learners begin with project planning and architecture design, establishing clear objectives and technical approaches. The development phase allows for iterative building with regular peer reviews and feedback sessions. 
The arc concludes with deployment, documentation, and presentation of the complete system, along with reflection on lessons learned and future improvements. - - - - - Define project scope, establish success criteria, and design comprehensive system architecture. Learners identify a meaningful problem within their domain, select appropriate AI techniques, plan technical implementation, and establish evaluation criteria. This phase includes peer review of proposals and facilitated architecture discussions. - - - - - - - - - - - Define a clear problem statement and success criteria for an AI-powered system - Design comprehensive system architecture incorporating appropriate AI techniques - Plan technical implementation including technology stack and development approach - Establish evaluation frameworks and quality metrics for the project - Create project timeline with milestones and deliverables - Present and defend architectural decisions to peers and facilitators - - - Problem definition and scope setting - Requirements gathering and user story development - System architecture design and documentation - Technology selection and justification - Data strategy and knowledge base planning - Evaluation framework design - Project planning and risk assessment - Architecture review and peer feedback - - - - - - Build the core functionality of the capstone project through iterative development cycles. Learners implement key features, integrate AI components, establish evaluation processes, and refine based on testing and peer feedback. Regular check-ins and code reviews ensure projects stay on track and maintain quality standards. 
- - - - - - - - - - - Implement core AI functionality using techniques from previous arcs - Apply iterative development practices with regular evaluation and refinement - Integrate multiple AI components into a cohesive system - Conduct thorough testing including edge cases and failure modes - Incorporate peer feedback and facilitated guidance into development - Document code and design decisions for maintainability - - - Iterative development and agile practices - Feature implementation and integration - Testing strategies for AI components - Code review and peer feedback sessions - Performance optimization and debugging - Documentation and code quality - Progress tracking and milestone completion - - - - - - Deploy the capstone project to a production or production-like environment, complete comprehensive documentation, and present the system to the cohort. Learners demonstrate their complete AI engineering workflow, reflect on challenges and solutions, and provide peer feedback on other projects. 
- - - - - - - - - - - Deploy the complete system to a production or production-like environment - Create comprehensive documentation including architecture, deployment, and usage guides - Present the project effectively, demonstrating key features and technical decisions - Reflect critically on the development process, challenges faced, and lessons learned - Provide constructive peer feedback on other capstone projects - Identify future enhancements and scaling considerations - - - Production deployment and configuration - Documentation and knowledge transfer - Demonstration and presentation skills - Project retrospective and reflection - Peer review and feedback - Future roadmap and enhancement planning - Portfolio development and career application - Celebration and cohort closure - - - - - - - - - - diff --git a/docs/archives/outputs/themis/v0_0_3-2025-10-17.xml b/docs/archives/outputs/themis/v0_0_3-2025-10-17.xml deleted file mode 100644 index 4a1dfb0..0000000 --- a/docs/archives/outputs/themis/v0_0_3-2025-10-17.xml +++ /dev/null @@ -1,679 +0,0 @@ - - - - - 5 - 18 - - - - - - - - - - - - 20 - 4 - - - - This AI Engineering Apprenticeship takes experienced developers on a transformative journey from traditional software development to AI-enhanced engineering practices. The course recognizes that AI engineering in 2025 is fundamentally about combining software engineering excellence with frontier AI techniques—not replacing one with the other. Learners begin by mastering the foundations of working with large language models and prompt engineering, then progress through increasingly sophisticated patterns including retrieval-augmented generation and autonomous agent systems. The curriculum reflects current industry reality: that successful AI engineers are software engineers who understand how to architect, deploy, and maintain production AI systems, not just build impressive demos. 
Throughout the 42 weeks, learners work on progressively complex projects that mirror real-world applications, culminating in a capstone that demonstrates their ability to architect and deploy complete AI-powered systems. The course emphasizes practical skills that match the 2025 job market, where AI engineering roles have grown significantly and companies seek developers who can build reliable, scalable, and responsible AI applications. - - - - The course follows a carefully designed progression from foundational concepts to advanced autonomous systems, culminating in an integrative capstone. The first arc establishes essential AI engineering skills—understanding LLMs, prompt engineering, and basic application development—that form the basis for all subsequent work. The second arc builds on this foundation by introducing RAG systems, teaching learners how to ground AI in external knowledge, a critical capability for most production applications. The third arc explores the frontier of autonomous agents, showing how AI systems can reason, plan, and act with increasing independence. The fourth arc introduces complementary machine learning techniques, ensuring learners understand when and how to use traditional ML alongside LLMs. Finally, the capstone arc synthesizes all previous learning into a complete, production-ready application. While each arc is thematically independent and focuses on distinct AI engineering patterns, they build on each other temporally—concepts from earlier arcs are prerequisites for later ones, and the capstone requires integrating techniques from across the entire curriculum. This structure mirrors how AI engineers actually work: starting with foundations, progressively adding sophisticated capabilities, and ultimately combining multiple techniques to solve complex real-world problems. 
- - - - - This arc establishes the essential foundations for AI engineering, introducing learners to LLM capabilities, prompt engineering techniques, and the fundamental patterns for building AI-enhanced applications. Learners transition from traditional development practices to understanding how to effectively work with and integrate AI models into software systems. - - - - - - - - - - - - This arc focuses on building a solid foundation in AI engineering principles, starting with understanding LLM capabilities and limitations, mastering prompt engineering as a core skill, and learning to build simple but production-ready LLM applications. Learners develop the mindset shift from deterministic programming to working with probabilistic AI systems. - - - - The arc begins with hands-on exploration of LLM capabilities and limitations, establishing realistic expectations. It then progresses to systematic prompt engineering techniques that form the basis of all AI interactions. Next, learners build their first LLM applications using modern frameworks, before concluding with essential practices for evaluation, testing, and responsible AI deployment. - - - - - This module introduces learners to the landscape of large language models, their capabilities and limitations, and how they're transforming software development workflows. Learners gain hands-on experience with various LLM APIs and develop an intuition for what tasks are well-suited for AI assistance versus traditional programming approaches. 
- - - - - - - - - - - Evaluate the capabilities and limitations of current LLM models (GPT-4, Claude, Gemini) for different tasks - Integrate AI coding assistants effectively into development workflows while maintaining code quality - Distinguish between appropriate and inappropriate use cases for LLM-based solutions - Configure and use multiple LLM provider APIs with proper authentication and error handling - Analyze the cost-performance tradeoffs of different models for various application scenarios - - - Overview of LLM architectures and training approaches (transformers, attention mechanisms) - Comparing major LLM providers: OpenAI, Anthropic, Google, and open-source alternatives - Using AI coding assistants (GitHub Copilot, Cursor) effectively in development - Understanding context windows, tokens, and model parameters - Cost optimization strategies for LLM API usage - Common failure modes: hallucinations, context limitations, and reasoning errors - - - - - - This module teaches systematic prompt engineering techniques that are essential for reliable AI interactions. Learners move beyond casual prompting to master structured approaches including few-shot learning, chain-of-thought reasoning, and prompt templates that produce consistent, high-quality outputs. 
- - - - - - - - - - - Design effective prompts using established techniques (zero-shot, few-shot, chain-of-thought) - Create reusable prompt templates with proper variable substitution and formatting - Apply systematic prompt refinement processes to improve output quality iteratively - Implement prompt engineering best practices for different task types (summarization, extraction, generation) - Evaluate prompt effectiveness using quantitative metrics and qualitative assessment - - - Core prompting techniques: zero-shot, few-shot, and chain-of-thought prompting - Prompt structure: instructions, context, examples, and output formatting - Role-based prompting and persona assignment for specialized outputs - Prompt chaining and decomposition for complex tasks - Temperature, top-p, and other sampling parameters for controlling output - Common prompt engineering pitfalls and debugging strategies - - - - - - This module introduces learners to production-ready frameworks for building LLM applications, focusing on LangChain and similar tools that simplify common patterns. Learners build end-to-end applications that combine LLM calls with traditional software engineering practices, including proper error handling, logging, and API design. 
- - - - - - - - - - - Build complete LLM applications using LangChain or similar frameworks with proper architecture - Implement robust error handling and retry logic for LLM API calls - Design and develop RESTful APIs that expose LLM functionality to client applications - Manage application state and conversation history in multi-turn interactions - Deploy LLM applications using modern cloud platforms and containerization - - - LangChain fundamentals: chains, prompts, and output parsers - Building conversational interfaces with memory and context management - Integrating LLMs with traditional backend services and databases - API design patterns for LLM-powered applications - Environment management, configuration, and secrets handling - Containerization and deployment strategies for LLM applications - - - - - - This module addresses the critical challenge of evaluating and testing non-deterministic AI systems. Learners develop systematic approaches to measuring LLM performance, implementing guardrails, and ensuring responsible AI practices including bias detection and mitigation. The module emphasizes that production AI systems require rigorous testing beyond traditional software QA. 
- - - - - - - - - - - Design comprehensive evaluation strategies for LLM outputs using both automated metrics and human review - Implement testing frameworks specifically designed for non-deterministic AI systems - Apply responsible AI principles including bias detection, fairness assessment, and transparency - Create guardrails and safety mechanisms to prevent harmful or inappropriate outputs - Establish monitoring and observability practices for production LLM applications - - - Evaluation metrics for LLM outputs: accuracy, relevance, coherence, and safety - Creating golden datasets and evaluation benchmarks for specific use cases - Automated testing strategies for prompt templates and LLM chains - Bias detection and mitigation techniques in LLM applications - Content filtering, moderation, and safety guardrails - Observability tools: LangSmith, Weights & Biases, and custom logging solutions - - - - - - - - - - This arc focuses on building AI systems that can access and reason over external knowledge bases through retrieval-augmented generation (RAG). Learners master the complete RAG pipeline from document processing and embedding to vector search and context-aware generation, enabling them to build AI applications grounded in specific domain knowledge. - - - - - - - - - - - - This arc explores how to overcome LLM knowledge limitations by integrating external data sources through RAG architectures. Learners develop expertise in the complete data pipeline—from ingestion and chunking to embedding generation and semantic search—that enables AI systems to provide accurate, up-to-date, and domain-specific responses grounded in organizational knowledge. - - - - The arc begins with understanding vector embeddings and semantic search fundamentals, then progresses to implementing complete RAG pipelines with proper document processing. 
Learners then explore advanced retrieval strategies and optimization techniques before tackling the challenges of deploying and maintaining RAG systems in production environments. - - - - - This module introduces the mathematical foundations of vector embeddings and semantic similarity that power modern RAG systems. Learners gain hands-on experience with embedding models, vector databases, and similarity search algorithms, understanding both the theory and practical implementation considerations. - - - - - - - - - - - Explain how embedding models transform text into vector representations capturing semantic meaning - Implement semantic search using vector databases (Pinecone, Weaviate, ChromaDB, or FAISS) - Compare different embedding models and select appropriate ones for specific domains - Optimize vector search performance through indexing strategies and similarity metrics - Design chunking strategies that balance context preservation with retrieval precision - - - Embedding model architectures and how they capture semantic relationships - Vector database fundamentals: indexing, approximate nearest neighbor search - Comparing embedding models: OpenAI, Cohere, sentence-transformers, domain-specific models - Distance metrics: cosine similarity, Euclidean distance, dot product - Text chunking strategies: fixed-size, semantic, recursive splitting - Metadata enrichment and hybrid search combining vector and keyword approaches - - - - - - This module covers the end-to-end implementation of RAG systems, from document ingestion and preprocessing to retrieval and generation. Learners build complete RAG applications that can ingest various document formats, maintain up-to-date knowledge bases, and generate accurate responses grounded in retrieved context. 
- - - - - - - - - - - Design and implement complete RAG architectures from data ingestion through response generation - Process diverse document formats (PDFs, HTML, markdown) with proper text extraction and cleaning - Build data ingestion pipelines that handle updates and maintain vector database freshness - Implement context-aware generation that properly cites sources and acknowledges limitations - Handle edge cases including insufficient context, conflicting information, and out-of-scope queries - - - RAG architecture patterns: indexing pipeline vs. retrieval-generation pipeline - Document loaders and parsers for various formats (PDFs, Word, web pages) - Data preprocessing: OCR correction, table extraction, metadata extraction - Retrieval strategies: top-k search, MMR (maximum marginal relevance), reranking - Prompt engineering for RAG: incorporating retrieved context effectively - Source attribution and citation generation in RAG responses - - - - - - This module explores advanced techniques for improving RAG system performance including query transformation, hybrid search, reranking, and caching strategies. Learners optimize RAG systems for both quality and efficiency, addressing common challenges like retrieval accuracy and latency. 
- - - - - - - - - - - Implement advanced retrieval techniques including query expansion, hypothetical document embeddings, and multi-query retrieval - Apply reranking models to improve retrieval precision and relevance - Design multi-level caching strategies to optimize cost and latency in RAG systems - Evaluate RAG system performance using metrics for both retrieval and generation quality - Optimize RAG pipelines for production workloads balancing quality, cost, and speed - - - Query transformation techniques: expansion, rephrasing, hypothetical answers - Hybrid search combining dense and sparse retrieval methods - Cross-encoder reranking for improved retrieval precision - Caching strategies: embedding cache, retrieval cache, response cache - RAG evaluation metrics: retrieval accuracy, answer relevance, faithfulness - Performance optimization: batch processing, async operations, connection pooling - - - - - - This module addresses the operational challenges of maintaining RAG systems in production, including data freshness, monitoring, debugging, and scaling. Learners develop strategies for keeping knowledge bases current, troubleshooting retrieval issues, and ensuring consistent performance as data volumes grow. 
- - - - - - - - - - - Design data refresh strategies that maintain knowledge base currency without excessive reprocessing - Implement comprehensive monitoring and alerting for RAG system health and performance - Debug common RAG issues including poor retrieval, context overflow, and hallucinations - Scale RAG systems to handle growing document collections and user traffic - Establish processes for continuous improvement through user feedback and evaluation - - - Incremental indexing and update strategies for evolving knowledge bases - Monitoring RAG systems: retrieval quality, latency, cost, and error rates - Debugging tools and techniques for RAG pipelines - Horizontal scaling strategies for vector databases and retrieval systems - A/B testing and experimentation frameworks for RAG improvements - User feedback collection and integration into RAG refinement - - - - - - - - - - This arc explores the frontier of autonomous AI agents that can reason, plan, and execute actions using external tools. Learners build increasingly sophisticated agent systems that can interact with APIs, execute code, and orchestrate multi-step workflows, understanding both the capabilities and critical safety considerations of autonomous systems. - - - - - - - - - - - - This arc focuses on agentic AI—systems that can autonomously make decisions, use tools, and execute multi-step plans to accomplish goals. Learners develop skills in designing agent architectures, implementing tool-use capabilities, and managing the complexity and safety challenges that come with increased autonomy. The emphasis is on building reliable, controllable agents suitable for production deployment. - - - - The arc begins with understanding agent architectures and implementing basic tool-calling capabilities. It progresses to building reasoning and planning systems that enable multi-step problem solving. 
Learners then explore multi-agent coordination patterns before concluding with critical safety, reliability, and governance considerations for autonomous systems. - - - - - This module introduces the fundamental patterns for building AI agents, focusing on the ReAct (Reasoning and Acting) pattern and function calling capabilities. Learners implement agents that can select and use appropriate tools from a toolkit to accomplish tasks, understanding the architecture that enables autonomous decision-making. - - - - - - - - - - - Design agent architectures using established patterns (ReAct, tool-use, function calling) - Implement function calling with proper tool definitions and parameter validation - Build tool integrations that allow agents to interact with external APIs and services - Create agent memory systems that maintain context across multi-turn interactions - Debug agent reasoning loops and tool execution errors effectively - - - Agent architecture patterns: ReAct, tool-use, plan-and-execute - Function calling and tool definition specifications (OpenAI, Anthropic formats) - Building tool wrappers for APIs, databases, and external services - Agent memory: short-term (conversation) and long-term (vector store) memory - Agent execution loops: perception, reasoning, action, observation - Error handling and recovery in agent systems - - - - - - This module focuses on enabling agents to break down complex problems, generate plans, and execute code to solve tasks. Learners build agents capable of multi-step reasoning, code generation and execution, and iterative refinement based on feedback, mirroring how human developers approach problem-solving. 
- - - - - - - - - - - Implement planning algorithms that decompose complex tasks into executable steps - Build secure code execution environments for agent-generated code (sandboxing, containers) - Design feedback loops that allow agents to learn from execution results and iterate - Apply chain-of-thought and tree-of-thought reasoning techniques for complex problem-solving - Evaluate agent performance on reasoning benchmarks and real-world tasks - - - Task decomposition and hierarchical planning for agents - Code generation agents: from requirements to executable code - Secure code execution: sandboxing, Docker containers, E2B, and similar solutions - Iterative refinement: test-driven development with AI agents - Advanced reasoning patterns: chain-of-thought, tree-of-thought, self-consistency - Agent evaluation on coding benchmarks (SWE-bench, HumanEval) - - - - - - This module explores architectures where multiple specialized agents collaborate to accomplish complex goals. Learners design agent teams with different roles and expertise, implement communication protocols, and build orchestration layers that coordinate agent activities, mirroring how human teams work together. 
- - - - - - - - - - - Design multi-agent systems with specialized agents for different subtasks or domains - Implement agent communication protocols and message passing architectures - Build orchestration layers that coordinate agent activities and resolve conflicts - Apply delegation patterns where supervisor agents manage worker agent teams - Evaluate multi-agent system performance and debug coordination issues - - - Multi-agent architecture patterns: hierarchical, peer-to-peer, blackboard - Agent specialization and role assignment strategies - Communication protocols: direct messaging, shared memory, event-driven - Orchestration frameworks: LangGraph, AutoGen, CrewAI - Consensus and conflict resolution in multi-agent systems - Debugging and observability for multi-agent interactions - - - - - - This module addresses the critical challenges of deploying autonomous agents safely and reliably. Learners implement guardrails, human-in-the-loop approval workflows, monitoring systems, and governance frameworks that ensure agents operate within acceptable boundaries and can be audited, controlled, and improved over time. 
- - - - - - - - - - - Implement safety guardrails that constrain agent actions to acceptable boundaries - Design human-in-the-loop workflows for high-stakes agent decisions requiring approval - Build comprehensive monitoring and alerting systems for agent behavior and performance - Apply responsible AI principles to autonomous agent design including transparency and accountability - Create governance frameworks for agent deployment, auditing, and continuous improvement - - - Agent safety mechanisms: action whitelisting, approval workflows, kill switches - Human-in-the-loop patterns for agent oversight and intervention - Monitoring agent behavior: action logging, decision auditing, anomaly detection - Responsible AI for agents: transparency, explainability, accountability - Security considerations: preventing prompt injection, tool misuse, and data leakage - Agent governance: deployment policies, testing requirements, incident response - - - - - - - - - - This arc introduces essential machine learning concepts and techniques that complement LLM-based AI engineering. Learners gain practical experience with classical ML, fine-tuning approaches, and integrating specialized models into AI applications, understanding when traditional ML is more appropriate than LLMs and how to combine both approaches effectively. - - - - - - - - - - - - This arc bridges the gap between LLM-based AI engineering and traditional machine learning, recognizing that production AI systems often require both. Learners develop practical ML skills for tasks where LLMs aren't optimal—classification, prediction, recommendation—and learn to fine-tune models for specialized domains. The focus is on practical application rather than deep theoretical foundations. - - - - The arc begins with practical machine learning fundamentals for common tasks like classification and prediction. 
It progresses to model training and evaluation workflows, then explores fine-tuning techniques for adapting pre-trained models. The arc concludes with integrating diverse AI models into cohesive systems that leverage the right technique for each task. - - - - - This module introduces essential machine learning concepts and techniques through practical applications. Learners work with scikit-learn and similar libraries to build classification, regression, and clustering models, understanding when these approaches are more appropriate than LLMs for specific tasks like structured prediction or time-series forecasting. - - - - - - - - - - - Apply supervised learning techniques (classification, regression) to structured data problems - Implement feature engineering and preprocessing pipelines for ML models - Train, validate, and evaluate ML models using appropriate metrics and cross-validation - Distinguish between use cases best suited for traditional ML versus LLM-based approaches - Deploy scikit-learn models as APIs and integrate them with LLM-based applications - - - Supervised learning fundamentals: classification and regression algorithms - Feature engineering and selection for structured data - Model training workflows: train-validation-test splits, cross-validation - Evaluation metrics: accuracy, precision, recall, F1, RMSE, MAE - Scikit-learn pipelines for reproducible ML workflows - When to use ML vs. LLMs: structured prediction, time-series, recommendation systems - - - - - - This module covers the operational aspects of machine learning including experiment tracking, model versioning, and deployment pipelines. Learners build reproducible ML workflows using modern MLOps tools, understanding how to manage the lifecycle of ML models from experimentation through production deployment and monitoring. 
- - - - - - - - - - - Implement experiment tracking and model versioning using MLflow or Weights & Biases - Build reproducible ML pipelines with proper data versioning and dependency management - Deploy ML models using modern serving frameworks (FastAPI, BentoML, or similar) - Monitor ML model performance in production and detect model drift - Apply basic MLOps principles to maintain and improve deployed models over time - - - Experiment tracking and hyperparameter tuning with MLflow or W&B - Model versioning and registry systems for production ML - ML deployment patterns: batch prediction, real-time serving, edge deployment - Model serving frameworks and API design for ML models - Monitoring ML models: performance metrics, data drift, concept drift - CI/CD for machine learning: testing, validation, automated deployment - - - - - - This module explores techniques for adapting pre-trained models to specific domains and tasks through fine-tuning. Learners work with both embedding models and LLMs, understanding when fine-tuning is worthwhile versus prompt engineering, and implementing efficient fine-tuning approaches like LoRA that make adaptation practical for resource-constrained environments. 
- - - - - - - - - - - Prepare high-quality training datasets for fine-tuning specific tasks or domains - Fine-tune embedding models to improve retrieval performance for specialized domains - Apply parameter-efficient fine-tuning techniques (LoRA, QLoRA) to adapt LLMs - Evaluate fine-tuned models against base models and prompt-engineered alternatives - Make informed decisions about when fine-tuning provides value versus other approaches - - - Fine-tuning fundamentals: transfer learning, domain adaptation, task adaptation - Dataset preparation for fine-tuning: quality, quantity, format requirements - Fine-tuning embedding models for improved domain-specific retrieval - Parameter-efficient fine-tuning: LoRA, QLoRA, prefix tuning, adapter layers - Fine-tuning LLMs via API (OpenAI, Anthropic) versus local fine-tuning - Evaluating fine-tuning ROI: cost, performance improvement, maintenance burden - - - - - - - - - - This arc is the culminating experience where learners design, build, and deploy a complete AI-powered application that integrates techniques from across the course. Working individually or in small teams, learners tackle real-world problems, make architectural decisions, implement production-ready systems, and present their work, demonstrating mastery of AI engineering principles. - - - - - - - - - - - - The capstone arc challenges learners to synthesize everything they've learned into a production-quality AI application. This is where theory meets practice—learners must make real architectural decisions, balance competing concerns like cost and performance, implement proper testing and monitoring, and deliver a system that could actually be deployed. The emphasis is on end-to-end ownership and professional-grade execution. - - - - The arc begins with project scoping and architectural design, requiring learners to define requirements and make informed technology choices. 
Implementation follows with emphasis on production-ready code, testing, and documentation. The arc concludes with deployment, presentation, and reflection on lessons learned, preparing learners for real-world AI engineering roles. - - - - - This module guides learners through defining their capstone project, conducting requirements analysis, and designing system architecture. Learners identify real-world problems suitable for AI solutions, evaluate different technical approaches, and create detailed architectural plans that will guide their implementation, receiving feedback from instructors and peers. - - - - - - - - - - - Define a well-scoped AI application project with clear success criteria and deliverables - Conduct requirements analysis identifying functional and non-functional requirements - Design system architecture selecting appropriate AI techniques and technologies - Create technical specifications including data flows, API contracts, and deployment plans - Present and defend architectural decisions to technical audiences incorporating feedback - - - Project scoping: defining problem, success metrics, and constraints - Requirements gathering and analysis for AI applications - Architectural decision-making: choosing between RAG, agents, fine-tuning, or hybrid approaches - System design: component diagrams, data flows, API specifications - Technology selection and justification: models, frameworks, infrastructure - Risk assessment and mitigation planning for AI projects - - - - - - This module is the core development phase where learners implement their capstone projects with emphasis on production-quality code, comprehensive testing, proper error handling, and documentation. Learners apply software engineering best practices alongside AI-specific considerations, building systems that are maintainable, observable, and ready for real-world deployment. 
- - - - - - - - - - - Implement a complete AI application following software engineering best practices - Build comprehensive test suites covering unit, integration, and AI-specific evaluation tests - Implement proper error handling, logging, and observability throughout the application - Create production-grade documentation including setup guides, API docs, and architecture diagrams - Apply security best practices for AI applications including secrets management and input validation - - - Iterative development and version control workflows for AI projects - Testing strategies: unit tests, integration tests, LLM output evaluation - Error handling and resilience patterns for AI applications - Logging and observability: structured logging, tracing, metrics collection - Documentation: code comments, API documentation, deployment guides - Security considerations: API key management, input sanitization, rate limiting - - - - - - This module focuses on deploying the capstone project to production infrastructure, preparing professional presentations, and conducting thorough retrospectives. Learners deploy their applications using cloud platforms, create compelling demonstrations of their work, and reflect on lessons learned throughout the project, solidifying their understanding and preparing for career transitions. 
- - - - - - - - - - - Deploy AI applications to production cloud infrastructure with proper CI/CD pipelines - Create compelling technical presentations that communicate project value and design decisions - Demonstrate working applications handling real-world scenarios and edge cases - Conduct thorough retrospectives identifying successes, challenges, and lessons learned - Articulate the business value and technical innovations of their AI engineering work - - - Cloud deployment: containerization, orchestration, scaling strategies - CI/CD pipelines for AI applications: automated testing and deployment - Monitoring and alerting in production environments - Technical presentation skills: explaining complex AI systems to diverse audiences - Creating effective demos and documentation for portfolio purposes - Project retrospectives: identifying learnings and areas for future improvement - - - - - - - - - - diff --git a/docs/dev/ref/palettes/metisPalette.json b/docs/dev/ref/palettes/metisPalette.json new file mode 100644 index 0000000..4f61a08 --- /dev/null +++ b/docs/dev/ref/palettes/metisPalette.json @@ -0,0 +1,44 @@ +{ + "dark": { + "name": "void", + "colour": "#00121Fff" + }, + "background": { + "primary": { + "name": "cyan", + "main": "#5BA8CCff", + "subtle": "#E8F1F8ff", + "nav": "#D0E4F0ff" + }, + "alternate": { + "name": "coral", + "main": "#E88D56ff", + "subtle": "#FEF3EDff", + "nav": "#FCDFD0ff" + } + }, + "foreground": { + "primary": { + "name": "terracotta", + "dark": "#A0542Dff", + "midi": "#CC6D3Cff", + "lite": "#E0854Fff" + }, + "alternate": { + "name": "teal", + "dark": "#096A78ff", + "midi": "#0E9191ff", + "lite": "#17B8C4ff" + } + }, + "line": { + "primary": { + "name": "shadow", + "colour": "#010E14ff" + }, + "alternate": { + "name": "rust", + "colour": "#0D0703ff" + } + } +} diff --git a/docs/dev/ref/palettes/metisPalette.ts b/docs/dev/ref/palettes/metisPalette.ts new file mode 100644 index 0000000..c1a0325 --- /dev/null +++ 
b/docs/dev/ref/palettes/metisPalette.ts @@ -0,0 +1,49 @@ +const metisPalette = { + metadata: { + requiredImprovements: [] + }, + colours: { + dark: { + name: "void", + colour: "#00121Fff" + }, + background: { + primary: { + name: "cyan", + main: "#5BA8CCff", + subtle: "#E8F1F8ff", + nav: "#D0E4F0ff", + }, + alternate: { + name: "coral", + main: "#E88D56ff", + subtle: "#FEF3EDff", + nav: "#FCDFD0ff" + } + }, + foreground: { + primary: { + name: "terracotta", + dark: "#A0542Dff", + midi: "#CC6D3Cff", + lite: "#E0854Fff", + }, + alternate: { + name: "teal", + dark: "#096A78ff", + midi: "#0E9191ff", + lite: "#17B8C4ff" + } + }, + line: { + primary: { + name: "shadow", + colour: "#010E14ff" + }, + alternate: { + name: "rust", + colour: "#0D0703ff" + } + } + } +} diff --git a/docs/dev/ref/palettes.jsonc b/docs/dev/ref/palettes/palettes.jsonc similarity index 100% rename from docs/dev/ref/palettes.jsonc rename to docs/dev/ref/palettes/palettes.jsonc diff --git a/docs/dev/ref/palettes/rheaPalette.json b/docs/dev/ref/palettes/rheaPalette.json new file mode 100644 index 0000000..7197959 --- /dev/null +++ b/docs/dev/ref/palettes/rheaPalette.json @@ -0,0 +1,44 @@ +{ + "dark": { + "name": "depths", + "colour": "#00221Aff" + }, + "background": { + "primary": { + "name": "teal", + "main": "#0E6B68ff", + "subtle": "#F0F8F7ff", + "nav": "#D5E8E6ff" + }, + "alternate": { + "name": "gold", + "main": "#D4AF37ff", + "subtle": "#FEF9EBff", + "nav": "#FAF0D4ff" + } + }, + "foreground": { + "primary": { + "name": "bronze", + "dark": "#D1A927ff", + "midi": "#D4AA2Eff", + "lite": "#D7B130ff" + }, + "alternate": { + "name": "jade", + "dark": "#138078ff", + "midi": "#1A9B93ff", + "lite": "#22ABA4ff" + } + }, + "line": { + "primary": { + "name": "charcoal", + "colour": "#00100Eff" + }, + "alternate": { + "name": "umber", + "colour": "#0D0B03ff" + } + } +} diff --git a/docs/dev/ref/palettes/rheaPalette.ts b/docs/dev/ref/palettes/rheaPalette.ts new file mode 100644 index 0000000..601cef7 --- 
/dev/null +++ b/docs/dev/ref/palettes/rheaPalette.ts @@ -0,0 +1,49 @@ +const rheaPalette = { + metadata: { + requiredImprovements: [] + }, + colours: { + dark: { + name: "depths", + colour: "#00221Aff" + }, + background: { + primary: { + name: "teal", + main: "#0E6B68ff", + subtle: "#F0F8F7ff", + nav: "#D5E8E6ff", + }, + alternate: { + name: "gold", + main: "#D4AF37ff", + subtle: "#FEF9EBff", + nav: "#FAF0D4ff" + } + }, + foreground: { + primary: { + name: "bronze", + dark: "#D1A927ff", + midi: "#D4AA2Eff", + lite: "#D7B130ff", + }, + alternate: { + name: "jade", + dark: "#138078ff", + midi: "#1A9B93ff", + lite: "#22ABA4ff" + } + }, + line: { + primary: { + name: "charcoal", + colour: "#00100Eff" + }, + alternate: { + name: "umber", + colour: "#0D0B03ff" + } + } + } +} \ No newline at end of file diff --git a/docs/dev/ref/palettes/tethysPalette.json b/docs/dev/ref/palettes/tethysPalette.json new file mode 100644 index 0000000..c8b4121 --- /dev/null +++ b/docs/dev/ref/palettes/tethysPalette.json @@ -0,0 +1,44 @@ +{ + "dark": { + "name": "abyss", + "colour": "#03121Fff" + }, + "background": { + "primary": { + "name": "coral", + "main": "#F5985Eff", + "subtle": "#FFF4EDff", + "nav": "#FFE8D8ff" + }, + "alternate": { + "name": "sage", + "main": "#5CAA82ff", + "subtle": "#F0F9F5ff", + "nav": "#D8F0E6ff" + } + }, + "foreground": { + "primary": { + "name": "emerald", + "dark": "#2D7A5Eff", + "midi": "#3A9670ff", + "lite": "#4DB085ff" + }, + "alternate": { + "name": "amber", + "dark": "#A45818ff", + "midi": "#D47A17ff", + "lite": "#F1991Fff" + } + }, + "line": { + "primary": { + "name": "midnight", + "colour": "#021318ff" + }, + "alternate": { + "name": "rust", + "colour": "#0D0803ff" + } + } +} diff --git a/docs/dev/ref/palettes/tethysPalette.ts b/docs/dev/ref/palettes/tethysPalette.ts new file mode 100644 index 0000000..564da33 --- /dev/null +++ b/docs/dev/ref/palettes/tethysPalette.ts @@ -0,0 +1,49 @@ +const tethysPalette = { + metadata: { + requiredImprovements: [] + 
}, + colours: { + dark: { + name: "abyss", + colour: "#03121Fff" + }, + background: { + primary: { + name: "coral", + main: "#F5985Eff", + subtle: "#FFF4EDff", + nav: "#FFE8D8ff", + }, + alternate: { + name: "sage", + main: "#5CAA82ff", + subtle: "#F0F9F5ff", + nav: "#D8F0E6ff" + } + }, + foreground: { + primary: { + name: "emerald", + dark: "#2D7A5Eff", + midi: "#3A9670ff", + lite: "#4DB085ff", + }, + alternate: { + name: "amber", + dark: "#A45818ff", + midi: "#D47A17ff", + lite: "#F1991Fff" + } + }, + line: { + primary: { + name: "midnight", + colour: "#021318ff" + }, + alternate: { + name: "rust", + colour: "#0D0803ff" + } + } + }, +} \ No newline at end of file diff --git a/docs/dev/ref/palettes/theiaPalette.json b/docs/dev/ref/palettes/theiaPalette.json new file mode 100644 index 0000000..9e6384d --- /dev/null +++ b/docs/dev/ref/palettes/theiaPalette.json @@ -0,0 +1,44 @@ +{ + "dark": { + "name": "void", + "colour": "#1A0018ff" + }, + "background": { + "primary": { + "name": "rose", + "main": "#CC5BA6ff", + "subtle": "#F7ECF3ff", + "nav": "#EFD3ECff" + }, + "alternate": { + "name": "turquoise", + "main": "#47C9D6ff", + "subtle": "#EDFBFCff", + "nav": "#D1F4F7ff" + } + }, + "foreground": { + "primary": { + "name": "cyan", + "dark": "#1297A8ff", + "midi": "#11B5C6ff", + "lite": "#47D8E6ff" + }, + "alternate": { + "name": "magenta", + "dark": "#B0127Aff", + "midi": "#CC1A99ff", + "lite": "#E563BAff" + } + }, + "line": { + "primary": { + "name": "obsidian", + "colour": "#0A020Dff" + }, + "alternate": { + "name": "depths", + "colour": "#020D0Eff" + } + } +} diff --git a/docs/dev/ref/palettes/theiaPalette.ts b/docs/dev/ref/palettes/theiaPalette.ts new file mode 100644 index 0000000..547624f --- /dev/null +++ b/docs/dev/ref/palettes/theiaPalette.ts @@ -0,0 +1,49 @@ +const theiaPalette = { + metadata: { + requiredImprovements: [] + }, + colours: { + dark: { + name: "void", + colour: "#1A0018ff" + }, + background: { + primary: { + name: "rose", + main: "#CC5BA6ff", + 
subtle: "#F7ECF3ff", + nav: "#EFD3ECff", + }, + alternate: { + name: "turquoise", + main: "#47C9D6ff", + subtle: "#EDFBFCff", + nav: "#D1F4F7ff" + } + }, + foreground: { + primary: { + name: "cyan", + dark: "#1297A8ff", + midi: "#11B5C6ff", + lite: "#47D8E6ff", + }, + alternate: { + name: "magenta", + dark: "#B0127Aff", + midi: "#CC1A99ff", + lite: "#E563BAff" + } + }, + line: { + primary: { + name: "obsidian", + colour: "#0A020Dff" + }, + alternate: { + name: "depths", + colour: "#020D0Eff" + } + } + } +} diff --git a/docs/dev/ref/palettes/themisPalette.json b/docs/dev/ref/palettes/themisPalette.json new file mode 100644 index 0000000..6090601 --- /dev/null +++ b/docs/dev/ref/palettes/themisPalette.json @@ -0,0 +1,44 @@ +{ + "dark": { + "name": "midnight", + "colour": "#1A0E3Bff" + }, + "background": { + "primary": { + "name": "amethyst", + "main": "#7D4BB8ff", + "subtle": "#F0ECF8ff", + "nav": "#E0D8F0ff" + }, + "alternate": { + "name": "gold", + "main": "#D4AF37ff", + "subtle": "#FEF9EBff", + "nav": "#FAF0D4ff" + } + }, + "foreground": { + "primary": { + "name": "amber", + "dark": "#B8960Fff", + "midi": "#D4AF37ff", + "lite": "#E5C158ff" + }, + "alternate": { + "name": "violet", + "dark": "#7551BAff", + "midi": "#8B6BC8ff", + "lite": "#A085D6ff" + } + }, + "line": { + "primary": { + "name": "obsidian", + "colour": "#0A021Dff" + }, + "alternate": { + "name": "bronze", + "colour": "#0D0C03ff" + } + } +} diff --git a/docs/dev/ref/palettes/themisPalette.ts b/docs/dev/ref/palettes/themisPalette.ts new file mode 100644 index 0000000..bdc09fc --- /dev/null +++ b/docs/dev/ref/palettes/themisPalette.ts @@ -0,0 +1,49 @@ +const themisPalette = { + metadata: { + requiredImprovements: [] + }, + colours: { + dark: { + name: "midnight", + colour: "#1A0E3Bff" + }, + background: { + primary: { + name: "amethyst", + main: "#7D4BB8ff", + subtle: "#F0ECF8ff", + nav: "#E0D8F0ff", + }, + alternate: { + name: "gold", + main: "#D4AF37ff", + subtle: "#FEF9EBff", + nav: "#FAF0D4ff" + } + 
}, + foreground: { + primary: { + name: "amber", + dark: "#B8960Fff", + midi: "#D4AF37ff", + lite: "#E5C158ff", + }, + alternate: { + name: "violet", + dark: "#7551BAff", + midi: "#8B6BC8ff", + lite: "#A085D6ff" + } + }, + line: { + primary: { + name: "obsidian", + colour: "#0A021Dff" + }, + alternate: { + name: "bronze", + colour: "#0D0C03ff" + } + } + } +} \ No newline at end of file diff --git a/docs/dev/status/Theia-MVP.md b/docs/dev/status/Theia-MVP.md index 029cf40..239708a 100644 --- a/docs/dev/status/Theia-MVP.md +++ b/docs/dev/status/Theia-MVP.md @@ -10,9 +10,10 @@ #### 1.1.1. Due Tasks #### 1.1.2. Other Tasks -- [ ] Add PDF export format (currently marked "coming soon" in UI) -- [ ] Add export analytics/usage tracking -- [ ] Implement Module XML upload & workflow resume functionality (see section 2.1.1) +[ ] 1.1.2.1. When file is uploaded, UI should navigate to correct step automatically +[ ] 1.1.2.2. Add PDF export format (currently marked "coming soon" in UI) +[ ] 1.1.2.3. Add export analytics/usage tracking +[ ] 1.1.2.4. Implement Module XML upload & workflow resume functionality (see section 2.1.1) ### 1.2. Blocked Tasks diff --git a/docs/dev/status/Themis-MVP.md b/docs/dev/status/Themis-MVP.md index 6ff8416..b3c5b8f 100644 --- a/docs/dev/status/Themis-MVP.md +++ b/docs/dev/status/Themis-MVP.md @@ -56,58 +56,58 @@ [ ] 1.1.2.13. Optimise `ModuleGenerationList` Re-render - 1.1.2.13.1. Each store update triggers re-render of the entire ModuleGenerationList. With 20+ modules, this could feel sluggish. - 1.1.2.13.2. We could use `{#key moduleId}` blocks or extract `ModuleCard` to a separate component with `export let module` to leverage Svelte's granular reactivity. +[ ] 1.1.2.14. Improve handling of bullet lists within modules in `CourseOverview` ### 1.2. Blocked Tasks --- ## 2. MVP Milestones -[ ] 2.7. 
Build ModuleGenerationList Component (Step 4) 📋 PENDING - - Create `src/lib/components/themis/ModuleGenerationList.svelte` - - Display all modules from refined course structure +[x] 2.7. Build ModuleGenerationList Component (Step 5) ✅ COMPLETED + - Created `src/lib/components/themis/ModuleGenerationList.svelte` + - Displays all modules from refined course structure - Module-by-module generation using existing module workflow - - Track generation status per module (planned, generating, complete, error) - - Allow regeneration of individual modules - - Progress tracking across all modules - - **Why seventh:** Orchestrates the actual content generation using existing proven module generator - - **Status:** Not yet started - depends on completion of structure review workflow -[ ] 2.8. Create /api/themis/module/generate Endpoint 📋 PENDING - > [!IMPORTANT] - > Modules generated via Themis must be created by Metis (or an extension of Metis) - - - Create `src/routes/api/themis/module/+server.ts` - - Accept module data with course context - - Call existing module generation logic with course-aware prompts - - Return XML module spec - - **Why eighth:** API layer for course-aware module generation - - **Status:** API structure exists at `/api/themis/generate/`, needs module-specific endpoint -[ ] 2.9. Extend Module Generation with Course Context 📋 PENDING - - Modify prompt factories to accept optional course context parameter - - Include course narrative and progression in prompts when provided - - Ensure backward compatibility with standalone module generation - - **Why ninth:** Reuses existing module generation with course awareness - - **Status:** Prompt factories refactored and ready, needs course context integration -[ ] 2.10. 
Build CourseOverview Component (Step 5) 📋 PENDING - - Create `src/lib/components/themis/CourseOverview.svelte` - - Display complete course with all generated modules - - Show course narratives and module summaries - - Export functionality trigger - - Final review interface - - **Why tenth:** Final review and export interface - - **Status:** Not yet started + - Tracks generation status per module (planned, generating, complete, error) + - Allows regeneration of individual modules + - Progress tracking across all modules with SSE streaming + - Integrated ExportButton for early course exports + - **Status:** Complete - orchestrates actual content generation using proven module generator +[x] 2.8. Create /api/themis/module Endpoint ✅ COMPLETED + - Created `src/routes/api/themis/module/+server.ts` + - Accepts module slot data with course context + - Calls existing module generation logic with course-aware prompts + - Returns XML module spec via SSE streaming + - Supports retry logic and validation + - **Status:** Complete - API layer for course-aware module generation +[x] 2.9. Extend Module Generation with Course Context ✅ COMPLETED + - Added `buildCourseAwareModulePrompt()` to metisPromptFactory + - Includes course narrative, arc progression, and preceding modules in prompts + - Maintains backward compatibility with standalone module generation + - XML injection prevention via escapeXml utilities + - **Status:** Complete - reuses existing module generation with course awareness +[x] 2.10. 
Build CourseOverview Component (Step 6) ✅ COMPLETED (2025-10-25) + - Created `src/lib/components/themis/CourseOverview.svelte` + - Displays complete course with metadata, narratives, and all generated modules + - Arc-grouped collapsible sections with module previews + - Module XML preview modal + - Export functionality via Theia integration + - Course completion status banner + - Navigation: back to generation or reset workflow + - **Status:** Complete - final review and export interface operational [ ] 2.11. Add Course XML Schema and Validator 📋 PENDING - Define course-level XML schema wrapping multiple modules - Validation for complete course structure - Include course narratives and metadata - **Why eleventh:** Ensures exported courses meet quality standards - **Status:** Not yet started - will reuse existing validation patterns + - **Note:** Current export uses Theia service which handles course-to-markdown/HTML conversion [ ] 2.12. Implement Export Functionality 📋 PENDING - XML export for complete course - PDF export option (stretch goal) - Individual module file exports - Course metadata inclusion - **Why twelfth:** Delivers the final product to users - - **Status:** Not yet started + - **Status:** Partially complete - Theia export service operational, needs XML course schema --- @@ -186,6 +186,20 @@ - Save/load multiple courses ✅ - **Completed:** Implemented via `persistedStore()` utility in refactoring Phase 4 - **Location:** `src/lib/stores/themisStores.ts` using `src/lib/utils/state/persistenceUtils.ts` +[x] 4.9. 
Complete Module Generation Workflow (Steps 5-6) ✅ COMPLETED (2025-10-25) + - **Step 5 - Module Generation:** + - ModuleGenerationList component with arc-grouped display + - SSE streaming for real-time generation feedback + - Individual and batch generation capabilities + - Module regeneration and error handling + - Module XML preview functionality + - **Step 6 - Review & Export:** + - CourseOverview component for final review + - Complete course display with narratives and statistics + - Collapsible arc sections with module details + - Theia export integration (Markdown/HTML) + - Workflow navigation and reset functionality + - **Why completed:** Completes the end-to-end Themis MVP workflow from configuration to export ### 4.2. Completed Tasks #### 4.2.1. Record of Past Deadlines diff --git a/src/lib/components/themis/CourseOverview.svelte b/src/lib/components/themis/CourseOverview.svelte new file mode 100644 index 0000000..7f609a0 --- /dev/null +++ b/src/lib/components/themis/CourseOverview.svelte @@ -0,0 +1,1461 @@ + + +
+
+
+
+

Course Review & Export

+

Review your complete course structure and export when ready

+
+
+ +
+
+
+ + + {#if !allComplete} +
+ + +
+ {:else} +
+ + +
+ {/if} + + +
+

{courseData.title}

+

{courseData.description}

+ + +
+ + +
+
+ {totalModules} + Total Modules +
+
+ {completedModules} + Generated +
+
+ {completionPercentage.toFixed(0)}% + Complete +
+
+ {courseData.arcs.length} + Thematic Arcs +
+
+ + + {#if courseData.courseNarrative || courseData.progressionNarrative} +
+ {#if courseData.courseNarrative} +
+

Course Narrative

+

{courseData.courseNarrative}

+
+ {/if} + + {#if courseData.progressionNarrative} +
+

Progression Narrative

+

{courseData.progressionNarrative}

+
+ {/if} +
+ {/if} + + +
+

Course Structure

+ + {#each courseData.arcs as arc (arc.id)} +
+ + + {#if expandedArcId === arc.id} +
+ + {#if arc.arcThemeNarrative || arc.arcProgressionNarrative} +
+ {#if arc.arcThemeNarrative} +
+ Arc Theme: +

{arc.arcThemeNarrative}

+
+ {/if} + {#if arc.arcProgressionNarrative} +
+ Arc Progression: +

{arc.arcProgressionNarrative}

+
+ {/if} +
+ {/if} + + +
+ {#each arc.modules as module (module.id)} + {@const moduleContent = module.moduleData?.xmlContent ? parseModuleXML(module.moduleData.xmlContent) : null} +
+ + + {#if module.status === 'error'} +
+ Error: {module.errorMessage || 'Module generation failed'} +
+ {/if} + + {#if module.status === 'complete' && expandedModuleId === module.id && moduleContent} +
+ + {#if moduleContent.description} +
+

{moduleContent.description}

+
+ {/if} + + + {#if moduleContent.objectives.length > 0} +
+ + {#if expandedSections[module.id]?.has('objectives')} +
+ {#each moduleContent.objectives as objective} +
+ {objective.name} +

{objective.details}

+
+ {/each} +
+ {/if} +
+ {/if} + + + {#if moduleContent.research.primary.length > 0} +
+ + {#if expandedSections[module.id]?.has('research')} +
+ {#each moduleContent.research.primary as topic} +
+ {topic.name} +

{topic.description}

+ {#if topic.subtopics && topic.subtopics.length > 0} +
+ Subtopics: +
    + {#each topic.subtopics as subtopic} +
  • + {subtopic.name}: {subtopic.description} +
  • + {/each} +
+
+ {/if} +
+ {/each} + {#if moduleContent.research.stretch.length > 0} +
+ Stretch Topics: +
    + {#each moduleContent.research.stretch as topic} +
  • + {topic.name}: {topic.description} +
  • + {/each} +
+
+ {/if} +
+ {/if} +
+ {/if} + + + {#if moduleContent.projects.length > 0} +
+ + {#if expandedSections[module.id]?.has('projects')} +
+ {#each moduleContent.projects as project} +
+ {project.name} +

Task: {project.task}

+

Focus: {project.focus}

+

Criteria: {project.criteria}

+ {#if project.skills.length > 0} +
+ Skills: +
    + {#each project.skills as skill} +
  • + {skill.name}: {skill.content} +
  • + {/each} +
+
+ {/if} + {#if project.examples.length > 0} +
+ Examples: +
    + {#each project.examples as example} +
  • + {example.name}: {example.content} +
  • + {/each} +
+
+ {/if} +
+ {/each} +
+ {/if} +
+ {/if} + + + {#if moduleContent.twists.length > 0} +
+ + {#if expandedSections[module.id]?.has('twists')} +
+ {#each moduleContent.twists as twist} +
+ {twist.name} +

{twist.task}

+ {#if twist.examples.length > 0} +
+ Examples: +
    + {#each twist.examples as example} +
  • + {example.name}: {example.content} +
  • + {/each} +
+
+ {/if} +
+ {/each} +
+ {/if} +
+ {/if} + + + {#if moduleContent.additionalSkills.length > 0} +
+ + {#if expandedSections[module.id]?.has('skills')} +
+ {#each moduleContent.additionalSkills as category} +
+ {category.category} +
    + {#each category.skills as skill} +
  • + {skill.name}: {skill.content} +
  • + {/each} +
+
+ {/each} +
+ {/if} +
+ {/if} + + +
+ +
+
+ {/if} +
+ {/each} +
+
+ {/if} +
+ {/each} +
+ + + +
+ + +{#if previewModuleId} + {@const module = courseData.arcs + .flatMap(arc => arc.modules) + .find(m => m.id === previewModuleId)} + + {#if module && module.moduleData} + + {/if} +{/if} + + diff --git a/src/routes/themis/generate/+page.svelte b/src/routes/themis/generate/+page.svelte index 0d8142f..10b5086 100644 --- a/src/routes/themis/generate/+page.svelte +++ b/src/routes/themis/generate/+page.svelte @@ -5,6 +5,7 @@ import ModuleWithinArcPlanner from "$lib/components/themis/ModuleWithinArcPlanner.svelte"; import CourseStructureReview from "$lib/components/themis/CourseStructureReview.svelte"; import ModuleGenerationList from "$lib/components/themis/ModuleGenerationList.svelte"; + import CourseOverview from "$lib/components/themis/CourseOverview.svelte"; import { currentCourse, courseWorkflowStep } from "$lib/stores/themisStores"; let { data } = $props(); @@ -110,6 +111,15 @@ function handleModuleGenerationBack() { courseWorkflowStep.set(4); } + + function handleCourseOverviewBack() { + courseWorkflowStep.set(5); + } + + function handleCourseOverviewReset() { + // Reset handled by component, just go back to step 1 + courseWorkflowStep.set(1); + } @@ -175,6 +185,12 @@ on:submit={handleModuleGenerationSubmit} on:back={handleModuleGenerationBack} /> + {:else if $courseWorkflowStep === 6 && $currentCourse} + {:else}

Step {$courseWorkflowStep} - {steps[$courseWorkflowStep - 1]}