LangChain Fundamentals - Part 1: Core Components and Architecture

LangChain's architecture is built on composable components that can be combined to create sophisticated AI applications. Let's explore each component from an engineering perspective.

Core Components Overview

# Core imports structure
from langchain_core import (
    prompts,      # Prompt engineering
    messages,     # Message types
    runnables,    # LCEL components
    output_parsers, # Structured output
    documents,    # Document handling
    callbacks,    # Event handling
)

Models and LLMs

Model Abstraction Layer

LangChain provides a unified interface across different model providers:

from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.language_models import BaseChatModel

class ModelFactory:
    """Factory pattern for model initialization"""

    @staticmethod
    def create_model(
        provider: str,
        model_name: str,
        **kwargs
    ) -> BaseChatModel:
        models = {
            "openai": ChatOpenAI,
            "anthropic": ChatAnthropic,
            "google": ChatGoogleGenerativeAI,
        }

        model_class = models.get(provider)
        if not model_class:
            raise ValueError(f"Unknown provider: {provider}")

        # Default configurations (not every provider accepts every
        # option, e.g. streaming, so adjust these per provider)
        default_config = {
            "temperature": 0.7,
            "max_retries": 3,
            "timeout": 30,
            "streaming": True,
        }

        config = {**default_config, **kwargs}
        return model_class(model=model_name, **config)

# Usage
llm = ModelFactory.create_model(
    provider="openai",
    model_name="gpt-4-turbo-preview",
    temperature=0.5
)

Advanced Model Configuration

from langchain_core.callbacks import StreamingStdOutCallbackHandler
import tiktoken

class OptimizedLLM:
    """Production-ready LLM configuration"""

    def __init__(self, model_name: str = "gpt-4"):
        self.tokenizer = tiktoken.encoding_for_model(model_name)

        self.llm = ChatOpenAI(
            model=model_name,
            temperature=0,
            max_tokens=2000,
            model_kwargs={
                "top_p": 0.95,
                "frequency_penalty": 0.0,
                "presence_penalty": 0.0,
            },
            # Streaming configuration (callbacks= supersedes the
            # deprecated callback_manager argument)
            streaming=True,
            callbacks=[StreamingStdOutCallbackHandler()],
            # Retries and timeout
            max_retries=3,
            timeout=60,
        )

    def count_tokens(self, text: str) -> int:
        """Count tokens for cost estimation"""
        return len(self.tokenizer.encode(text))

    def estimate_cost(self, input_tokens: int, output_tokens: int) -> float:
        """Estimate API cost"""
        # GPT-4 pricing (as of 2024)
        input_cost = (input_tokens / 1000) * 0.03
        output_cost = (output_tokens / 1000) * 0.06
        return input_cost + output_cost
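
As a quick sanity check, we can combine these helpers to price a call before sending it. The prompt string and the 500-token output estimate below are made-up illustrations:

wrapper = OptimizedLLM(model_name="gpt-4")

prompt_text = "Summarize the following report..."  # hypothetical input
input_tokens = wrapper.count_tokens(prompt_text)

# Assume a summary-length response of roughly 500 tokens
print(f"~${wrapper.estimate_cost(input_tokens, 500):.4f} per call")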

Prompt Templates

Dynamic Prompt Engineering

from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    FewShotChatMessagePromptTemplate,
)
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

class PromptManager:
    """Manages complex prompt templates"""

    @staticmethod
    def create_few_shot_prompt(examples: list) -> ChatPromptTemplate:
        """Create few-shot learning prompt"""

        example_prompt = ChatPromptTemplate.from_messages([
            ("human", "{input}"),
            ("ai", "{output}"),
        ])

        few_shot_prompt = FewShotChatMessagePromptTemplate(
            example_prompt=example_prompt,
            examples=examples,
        )

        final_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert assistant. Learn from these examples:"),
            few_shot_prompt,
            ("human", "{query}"),
        ])

        return final_prompt

    @staticmethod
    def create_cot_prompt() -> ChatPromptTemplate:
        """Chain-of-thought prompting"""
        return ChatPromptTemplate.from_messages([
            ("system", """You are a logical reasoning assistant.
            Always think step-by-step:
            1. Understand the problem
            2. Break it down into steps
            3. Solve each step
            4. Combine the solutions
            5. Verify the answer"""),
            ("human", "{problem}"),
            ("ai", "Let me think through this step-by-step:"),
        ])

# Usage
examples = [
    {"input": "2+2", "output": "4"},
    {"input": "5*3", "output": "15"},
]

prompt = PromptManager.create_few_shot_prompt(examples)
chain = prompt | llm
result = chain.invoke({"query": "What is 7*8?"})
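
The chain-of-thought prompt composes the same way. A minimal sketch, reusing the llm instance created earlier:

cot_chain = PromptManager.create_cot_prompt() | llm

# The trailing AI message primes the model to reason step-by-step
result = cot_chain.invoke({
    "problem": "A train travels 120 km in 1.5 hours. What is its average speed?"
})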

Output Parsers

Structured Output Parsing

from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import datetime

class TaskOutput(BaseModel):
    """Structured task output"""

    task_id: str = Field(description="Unique task identifier")
    title: str = Field(description="Task title")
    priority: int = Field(description="Priority 1-5", ge=1, le=5)
    tags: List[str] = Field(description="Task tags")
    due_date: Optional[datetime] = Field(description="Due date")
    subtasks: List[str] = Field(default_factory=list)

class StructuredOutputChain:
    """Chain with guaranteed structured output"""

    def __init__(self):
        self.parser = PydanticOutputParser(pydantic_object=TaskOutput)

        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "Extract task information. {format_instructions}"),
            ("human", "{task_description}"),
        ])

        self.chain = self.prompt | llm | self.parser

    def process(self, task_description: str) -> TaskOutput:
        try:
            return self.chain.invoke({
                "task_description": task_description,
                "format_instructions": self.parser.get_format_instructions(),
            })
        except Exception:
            # Fallback: ask the LLM to repair the malformed output,
            # then re-parse it against the same Pydantic schema
            from langchain.output_parsers import OutputFixingParser

            fixing_parser = OutputFixingParser.from_llm(parser=self.parser, llm=llm)
            fixing_chain = self.prompt | llm | fixing_parser
            return fixing_chain.invoke({
                "task_description": task_description,
                "format_instructions": self.parser.get_format_instructions(),
            })
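
Wiring it together might look like this (the task description is invented for illustration):

extractor = StructuredOutputChain()
task = extractor.process(
    "Ship the Q3 report by Friday. High priority. Tag it finance and reporting."
)

# A validated TaskOutput instance, not raw text
print(task.title, task.priority, task.tags)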

Document Loaders and Text Splitters

Multi-Source Document Loading

from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredMarkdownLoader,
    CSVLoader,
)
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from datetime import datetime
from pathlib import Path
from typing import List
import asyncio

class DocumentPipeline:
    """Production document loading pipeline"""

    async def load_document(self, file_path: str) -> List[Document]:
        """Load single document with appropriate loader"""

        extension = Path(file_path).suffix.lower()
        loaders = {
            '.pdf': PyPDFLoader,
            '.md': UnstructuredMarkdownLoader,
            '.csv': CSVLoader,
        }

        loader_class = loaders.get(extension)
        if not loader_class:
            raise ValueError(f"Unsupported file type: {extension}")

        loader = loader_class(file_path)
        # Run the blocking loader off the event loop
        documents = await asyncio.to_thread(loader.load)

        # Enrich metadata
        for doc in documents:
            doc.metadata.update({
                'source_type': extension,
                'loaded_at': datetime.now().isoformat(),
                'char_count': len(doc.page_content),
            })

        return documents

class SmartTextSplitter:
    """Context-aware text splitting"""

    @staticmethod
    def split_by_token_limit(
        text: str,
        max_tokens: int = 1000,
        overlap_tokens: int = 100
    ) -> List[str]:
        """Split text by token count for LLM context limits"""

        from langchain_text_splitters import TokenTextSplitter

        splitter = TokenTextSplitter(
            chunk_size=max_tokens,
            chunk_overlap=overlap_tokens,
            model_name="gpt-4",
        )

        return splitter.split_text(text)
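
Here is one way the two classes fit together, assuming a local report.pdf exists:

async def ingest(path: str) -> List[str]:
    pipeline = DocumentPipeline()
    docs = await pipeline.load_document(path)

    # Re-chunk each page to respect a 1,000-token context budget
    chunks: List[str] = []
    for doc in docs:
        chunks.extend(SmartTextSplitter.split_by_token_limit(doc.page_content))
    return chunks

chunks = asyncio.run(ingest("report.pdf"))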

Building a Production RAG Pipeline

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import hashlib

class RAGPipeline:
    """End-to-end RAG implementation"""

    def __init__(self, persist_directory: str = "./chroma_db"):
        self.embeddings = OpenAIEmbeddings()
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings,
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )

    def add_documents(self, documents: List[Document]):
        """Add documents with deduplication"""

        # Generate unique IDs
        for doc in documents:
            content_hash = hashlib.md5(
                doc.page_content.encode()
            ).hexdigest()
            doc.metadata['content_hash'] = content_hash

        # Split documents
        splits = self.text_splitter.split_documents(documents)

        # Add to vector store with IDs
        ids = [f"{s.metadata['content_hash']}_{i}"
               for i, s in enumerate(splits)]

        self.vector_store.add_documents(splits, ids=ids)

    def query(
        self,
        question: str,
        k: int = 4
    ) -> str:
        """Query with context retrieval"""

        # Retrieve relevant documents
        retriever = self.vector_store.as_retriever(
            search_kwargs={"k": k}
        )

        # Build QA chain
        from langchain_core.runnables import RunnablePassthrough
        from langchain_core.output_parsers import StrOutputParser

        qa_prompt = ChatPromptTemplate.from_messages([
            ("system", "Answer based on context: {context}"),
            ("human", "{question}"),
        ])

        # Join retrieved documents into a single context string
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | qa_prompt
            | llm
            | StrOutputParser()
        )

        return chain.invoke(question)
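
End to end, the pipeline might be exercised like this (the documents variable stands in for the output of DocumentPipeline above, and the question is a placeholder):

rag = RAGPipeline(persist_directory="./chroma_db")
rag.add_documents(documents)  # e.g. documents loaded by DocumentPipeline
answer = rag.query("What were the key findings in the Q3 report?")
print(answer)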

Summary

In this first part, we've covered the fundamental building blocks of LangChain:

  • Model abstraction and configuration
  • Advanced prompt engineering techniques
  • Structured output parsing
  • Document loading pipelines
  • Intelligent text splitting strategies
  • A production-ready RAG pipeline

These components form the foundation for building sophisticated AI applications. In the next part, we'll dive into chains, agents, and memory systems.


Series Navigation

This is Part 1 of the LangChain Series.

Previous: ← Part 0 - Introduction to LangChain Ecosystem
Next: Part 2 - LangChain Fundamentals: Chains and Agents →

Tags: #LangChain #AI #LLM #Python #RAG