
Chapter 2: Embeddings and Search

Vector Representation and Similarity Search

📖 Study Time: 30-35 minutes 📊 Difficulty: Intermediate 💻 Code Examples: 6

This chapter explains how text is represented as vector embeddings, how similarity search over those vectors works, and how to build and query vector databases such as FAISS, ChromaDB, and Pinecone.

1. Vector Embeddings

1.1 Embedding Concepts

Embeddings represent texts as points in a high-dimensional vector space, where semantically similar texts are positioned close together.

Embedding Properties:

- Fixed dimensionality: every text maps to a vector of the same length, determined by the model (e.g., 1536 dimensions for text-embedding-3-small).
- Semantic proximity: texts with similar meanings produce vectors that lie close together.
- Comparability: closeness is measured with a metric such as cosine similarity, dot product, or Euclidean distance.

Cosine Similarity:

Similarity between two vectors \(\mathbf{u}\) and \(\mathbf{v}\):

$$\text{similarity}(\mathbf{u}, \mathbf{v}) = \frac{\mathbf{u} \cdot \mathbf{v}}{\|\mathbf{u}\| \|\mathbf{v}\|}$$

Range: -1 (opposite direction) to 1 (identical direction); a value of 0 means the vectors are orthogonal (unrelated).
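
As a quick sanity check of this range, the short NumPy snippet below (illustrative only, separate from the chapter's numbered examples) computes cosine similarity for identical, orthogonal, and opposite vectors.

import numpy as np

def cosine(u, v):
    """Cosine similarity between two vectors."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine(np.array([1.0, 0.0]), np.array([1.0, 0.0])))   # 1.0  -> identical direction
print(cosine(np.array([1.0, 0.0]), np.array([0.0, 1.0])))   # 0.0  -> orthogonal (unrelated)
print(cosine(np.array([1.0, 0.0]), np.array([-1.0, 0.0])))  # -1.0 -> opposite direction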

Implementation Example 1: Embedding Generation and Similarity Calculation

from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class EmbeddingGenerator:
    """Embedding generation and similarity calculation"""

    def __init__(self, api_key, model="text-embedding-3-small"):
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def get_embedding(self, text):
        """Get embedding for a single text"""
        response = self.client.embeddings.create(
            input=text,
            model=self.model
        )
        return np.array(response.data[0].embedding)

    def get_embeddings_batch(self, texts, batch_size=100):
        """Get embeddings with batch processing"""
        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            response = self.client.embeddings.create(
                input=batch,
                model=self.model
            )
            batch_embeddings = [
                np.array(data.embedding) for data in response.data
            ]
            embeddings.extend(batch_embeddings)

        return np.array(embeddings)

    def cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity"""
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def find_most_similar(self, query_text, document_texts, top_k=5):
        """Search for most similar documents"""
        # Get embeddings
        query_emb = self.get_embedding(query_text)
        doc_embs = self.get_embeddings_batch(document_texts)

        # Calculate similarities
        similarities = cosine_similarity([query_emb], doc_embs)[0]

        # Get Top-K
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = [
            {
                'text': document_texts[idx],
                'score': float(similarities[idx]),
                'rank': rank + 1
            }
            for rank, idx in enumerate(top_indices)
        ]

        return results

# Usage example
generator = EmbeddingGenerator(api_key="your-api-key")

documents = [
    "Machine learning is AI technology that learns from data",
    "Deep learning uses neural networks",
    "Natural language processing is a method for text analysis",
    "Computer vision specializes in image recognition"
]

query = "AI-based text analysis"
results = generator.find_most_similar(query, documents, top_k=3)

for result in results:
    print(f"Rank {result['rank']}: {result['text']}")
    print(f"Similarity: {result['score']:.4f}\n")

1.2 Choosing Embedding Models

Various embedding models exist, and the choice depends on the intended use case.

Implementation Example 2: Comparing Multiple Embedding Models

from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
import time

class EmbeddingComparison:
    """Comparison of multiple embedding models"""

    def __init__(self):
        self.models = {}

    def load_models(self, openai_api_key=None):
        """Load various models"""
        # OpenAI
        if openai_api_key:
            self.models['openai-small'] = OpenAIEmbeddings(
                model="text-embedding-3-small",
                openai_api_key=openai_api_key
            )
            self.models['openai-large'] = OpenAIEmbeddings(
                model="text-embedding-3-large",
                openai_api_key=openai_api_key
            )

        # Sentence Transformers (local)
        self.models['multilingual'] = SentenceTransformer(
            'paraphrase-multilingual-MiniLM-L12-v2'
        )
        self.models['distiluse-multilingual'] = SentenceTransformer(
            'sentence-transformers/distiluse-base-multilingual-cased-v1'
        )

    def benchmark_model(self, model_name, texts):
        """Benchmark a model"""
        model = self.models[model_name]

        start = time.time()

        if isinstance(model, SentenceTransformer):
            embeddings = model.encode(texts)
        else:
            embeddings = model.embed_documents(texts)

        elapsed = time.time() - start

        return {
            'model': model_name,
            'num_texts': len(texts),
            'time': elapsed,
            'time_per_text': elapsed / len(texts),
            'dimension': len(embeddings[0])
        }

    def compare_all_models(self, test_texts):
        """Compare all models"""
        results = []

        for model_name in self.models.keys():
            try:
                result = self.benchmark_model(model_name, test_texts)
                results.append(result)
                print(f"{model_name}: {result['time']:.2f}s "
                      f"(dimension: {result['dimension']})")
            except Exception as e:
                print(f"{model_name}: Error - {e}")

        return results

# Usage example
comparator = EmbeddingComparison()
comparator.load_models(openai_api_key="your-api-key")

test_texts = [
    "Learning the basics of machine learning",
    "Building deep learning models",
    "Applications of natural language processing"
] * 10  # 30 texts

results = comparator.compare_all_models(test_texts)

2. Similarity Search

2.1 Search Algorithms

A similarity search scans a collection of embeddings and returns the vectors closest to a query vector. With large collections, exact brute-force comparison becomes slow, so vector databases rely on approximate nearest neighbor (ANN) indexes.

Major Search Methods:

- Flat (brute force): compares the query against every stored vector; exact results, but cost grows linearly with collection size.
- IVF (Inverted File): clusters the vectors and searches only the most promising clusters.
- HNSW (Hierarchical Navigable Small World): graph-based approximate search offering high recall with low latency.
- PQ (Product Quantization): compresses vectors to reduce memory usage and speed up distance computation.
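
As a rough illustration of exact versus approximate search, the following sketch builds a flat (brute-force) index and an HNSW index over random vectors with FAISS and compares their top-5 results. The dimensions and index parameters are arbitrary values chosen only for demonstration.

import faiss
import numpy as np

dimension = 128
rng = np.random.default_rng(42)
vectors = rng.random((10_000, dimension), dtype=np.float32)
query = rng.random((1, dimension), dtype=np.float32)

# Exact search: compares the query against every stored vector
flat_index = faiss.IndexFlatL2(dimension)
flat_index.add(vectors)
exact_dist, exact_ids = flat_index.search(query, 5)

# Approximate search: HNSW graph trades a little recall for lower latency
hnsw_index = faiss.IndexHNSWFlat(dimension, 32)  # 32 = neighbors per graph node
hnsw_index.add(vectors)
approx_dist, approx_ids = hnsw_index.search(query, 5)

print("Exact top-5 IDs:      ", exact_ids[0])
print("Approximate top-5 IDs:", approx_ids[0])

On a collection this small, both indexes typically return the same neighbors; the latency difference becomes significant as the collection grows.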

3. Vector Databases

3.1 FAISS (Facebook AI Similarity Search)

A high-speed similarity search library developed by Meta that runs in local environments.

Implementation Example 3: FAISS Implementation

import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document

class FAISSVectorStore:
    """FAISS vector store implementation"""

    def __init__(self, embeddings):
        self.embeddings = embeddings
        self.vectorstore = None

    def create_index(self, documents, index_type='flat'):
        """Create index"""
        # Use Langchain FAISS
        self.vectorstore = FAISS.from_documents(
            documents,
            self.embeddings
        )

        # A standalone IVF index can also be built for larger collections
        # (note: it is created separately and is not attached to the LangChain store above)
        if index_type == 'ivf':
            self._create_ivf_index(documents)

        print(f"Index creation complete: {len(documents)} documents")

    def _create_ivf_index(self, documents):
        """Create IVF (Inverted File) index"""
        # Get embeddings
        texts = [doc.page_content for doc in documents]
        embeddings = self.embeddings.embed_documents(texts)
        embeddings_array = np.array(embeddings).astype('float32')

        # Number of dimensions
        dimension = embeddings_array.shape[1]

        # Create IVF index
        nlist = 100  # Number of clusters (must not exceed the number of training vectors)
        quantizer = faiss.IndexFlatL2(dimension)
        index = faiss.IndexIVFFlat(quantizer, dimension, nlist)

        # Train
        index.train(embeddings_array)
        index.add(embeddings_array)

        print(f"IVF index created: {nlist} clusters")
        return index

    def search(self, query, k=5, score_threshold=None):
        """Search for similar documents"""
        if score_threshold is not None:
            results = self.vectorstore.similarity_search_with_relevance_scores(
                query, k=k
            )
            # Filter by score
            filtered = [
                (doc, score) for doc, score in results
                if score >= score_threshold
            ]
            return filtered
        else:
            return self.vectorstore.similarity_search(query, k=k)

    def search_with_metadata_filter(self, query, k=5, filter_dict=None):
        """Search with metadata filter"""
        if filter_dict:
            return self.vectorstore.similarity_search(
                query, k=k, filter=filter_dict
            )
        return self.search(query, k=k)

    def save_local(self, path):
        """Save locally"""
        self.vectorstore.save_local(path)
        print(f"Saved: {path}")

    def load_local(self, path):
        """Load locally"""
        self.vectorstore = FAISS.load_local(
            path, self.embeddings
        )
        print(f"Loaded: {path}")

# Usage example
embeddings = OpenAIEmbeddings(openai_api_key="your-api-key")
faiss_store = FAISSVectorStore(embeddings)

# Prepare documents
documents = [
    Document(
        page_content="Python is a popular programming language",
        metadata={"category": "programming", "language": "en"}
    ),
    Document(
        page_content="Python is commonly used for machine learning",
        metadata={"category": "ml", "language": "en"}
    )
]

# Create index
faiss_store.create_index(documents)

# Search
results = faiss_store.search("programming language", k=2)
for doc in results:
    print(f"- {doc.page_content}")

# Save
faiss_store.save_local("./faiss_index")

3.2 ChromaDB

An open-source vector database that excels at metadata filtering.

Implementation Example 4: ChromaDB Implementation

import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

class ChromaVectorStore:
    """ChromaDB vector store implementation"""

    def __init__(self, embeddings, persist_directory="./chroma_db"):
        self.embeddings = embeddings
        self.persist_directory = persist_directory
        self.vectorstore = None

        # Client for collection management
        # (chromadb >= 0.4 removed the legacy Settings-based client with
        #  chroma_db_impl; PersistentClient handles on-disk persistence)
        self.client = chromadb.PersistentClient(path=persist_directory)

    def create_collection(self, documents, collection_name="default"):
        """Create collection"""
        self.vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            collection_name=collection_name,
            persist_directory=self.persist_directory
        )

        # Persist
        self.vectorstore.persist()
        print(f"Collection created: {collection_name}")

    def add_documents(self, documents):
        """Add documents"""
        if not self.vectorstore:
            raise ValueError("Collection not created")

        self.vectorstore.add_documents(documents)
        self.vectorstore.persist()
        print(f"{len(documents)} documents added")

    def search_with_filter(self, query, k=5, where=None, where_document=None):
        """Advanced filtering search"""
        # Metadata filter
        if where:
            results = self.vectorstore.similarity_search(
                query, k=k, filter=where
            )
        # Document content filter
        elif where_document:
            results = self.vectorstore.similarity_search(
                query, k=k, where_document=where_document
            )
        else:
            results = self.vectorstore.similarity_search(query, k=k)

        return results

    def mmr_search(self, query, k=5, fetch_k=20, lambda_mult=0.5):
        """MMR (Maximal Marginal Relevance) search

        Search that balances diversity and relevance
        """
        results = self.vectorstore.max_marginal_relevance_search(
            query,
            k=k,
            fetch_k=fetch_k,
            lambda_mult=lambda_mult  # 0=diversity focused, 1=relevance focused
        )
        return results

    def delete_collection(self, collection_name):
        """Delete collection"""
        self.client.delete_collection(collection_name)
        print(f"Deleted: {collection_name}")

# Usage example
embeddings = OpenAIEmbeddings(openai_api_key="your-api-key")
chroma_store = ChromaVectorStore(embeddings, persist_directory="./chroma_db")

documents = [
    Document(
        page_content="Introduction to Python machine learning",
        metadata={"type": "tutorial", "level": "beginner", "year": 2024}
    ),
    Document(
        page_content="Advanced deep learning techniques",
        metadata={"type": "advanced", "level": "expert", "year": 2024}
    ),
    Document(
        page_content="Data science fundamentals",
        metadata={"type": "tutorial", "level": "beginner", "year": 2023}
    )
]

# Create collection
chroma_store.create_collection(documents, collection_name="ml_docs")

# Metadata filter search
# Multiple metadata conditions are combined with the $and operator
results = chroma_store.search_with_filter(
    "machine learning",
    k=2,
    where={"$and": [{"level": "beginner"}, {"year": 2024}]}
)

for doc in results:
    print(f"- {doc.page_content}")
    print(f"  Metadata: {doc.metadata}")

# MMR search (diversity focused)
diverse_results = chroma_store.mmr_search(
    "learning machine learning",
    k=3,
    lambda_mult=0.3  # Diversity focused
)
print(f"\nMMR search results: {len(diverse_results)} items")

3.3 Pinecone

A cloud-native vector database that excels at scalability.

Implementation Example 5: Pinecone Implementation

import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import time

class PineconeVectorStore:
    """Pinecone vector store implementation"""

    def __init__(self, api_key, environment, embeddings):
        self.embeddings = embeddings

        # Initialize Pinecone (classic pinecone-client 2.x API;
        # newer client versions use the Pinecone class instead of init)
        pinecone.init(
            api_key=api_key,
            environment=environment
        )

    def create_index(self, index_name, dimension=1536, metric='cosine'):
        """Create index"""
        # Check for existing index
        if index_name not in pinecone.list_indexes():
            pinecone.create_index(
                name=index_name,
                dimension=dimension,
                metric=metric,
                pods=1,
                pod_type='p1.x1'
            )
            # Index creation can take time; a robust implementation would
            # poll pinecone.describe_index() until the index reports ready
            time.sleep(10)
            print(f"Index created: {index_name}")
        else:
            print(f"Using existing index: {index_name}")

    def upsert_documents(self, index_name, documents):
        """Upsert documents"""
        vectorstore = Pinecone.from_documents(
            documents,
            self.embeddings,
            index_name=index_name
        )
        print(f"{len(documents)} documents upserted")
        return vectorstore

    def search_with_namespace(self, index_name, query, k=5, namespace=None):
        """Search with namespace specification"""
        vectorstore = Pinecone.from_existing_index(
            index_name=index_name,
            embedding=self.embeddings,
            namespace=namespace
        )

        results = vectorstore.similarity_search_with_score(query, k=k)
        return results

    def hybrid_search(self, index_name, query, sparse_vector=None, k=5, alpha=0.5):
        """Hybrid search (dense vector + sparse vector)

        alpha: 0=keyword (sparse) search only, 1=vector (dense) search only.
        Pinecone's query API has no alpha parameter; the weighting is applied
        client-side by scaling the dense and sparse values. Requires an index
        created with the dotproduct metric and a sparse encoder (e.g. BM25)
        to produce sparse_vector.
        """
        index = pinecone.Index(index_name)

        # Dense query embedding, weighted by alpha
        query_vector = [v * alpha for v in self.embeddings.embed_query(query)]

        # Sparse vector ({'indices': [...], 'values': [...]}), weighted by (1 - alpha)
        if sparse_vector is not None:
            sparse_vector = {
                'indices': sparse_vector['indices'],
                'values': [v * (1 - alpha) for v in sparse_vector['values']]
            }

        # Execute hybrid search
        results = index.query(
            vector=query_vector,
            sparse_vector=sparse_vector,
            top_k=k,
            include_metadata=True
        )

        return results

    def delete_index(self, index_name):
        """Delete index"""
        if index_name in pinecone.list_indexes():
            pinecone.delete_index(index_name)
            print(f"Index deleted: {index_name}")

    def get_index_stats(self, index_name):
        """Get index statistics"""
        index = pinecone.Index(index_name)
        stats = index.describe_index_stats()
        return stats

# Usage example
embeddings = OpenAIEmbeddings(openai_api_key="your-openai-key")
pinecone_store = PineconeVectorStore(
    api_key="your-pinecone-key",
    environment="us-west1-gcp",
    embeddings=embeddings
)

# Create index
index_name = "ml-knowledge-base"
pinecone_store.create_index(index_name, dimension=1536)

# Upsert documents
documents = [
    Document(
        page_content="Fundamental theories of machine learning",
        metadata={"category": "ml", "level": "basic"}
    ),
    Document(
        page_content="Deep learning implementation methods",
        metadata={"category": "dl", "level": "advanced"}
    )
]

vectorstore = pinecone_store.upsert_documents(index_name, documents)

# Search
results = pinecone_store.search_with_namespace(
    index_name, "how to learn machine learning", k=3
)

for doc, score in results:
    print(f"Score: {score:.4f}")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}\n")

# Statistics
stats = pinecone_store.get_index_stats(index_name)
print(f"Total vector count: {stats['total_vector_count']}")

3.4 Vector DB Comparison and Selection

Implementation Example 6: Vector DB Performance Comparison

import time
from typing import List, Dict
from langchain.schema import Document

class VectorDBBenchmark:
    """Vector database performance comparison"""

    def __init__(self):
        self.results = []

    def benchmark_indexing(self, db_name, vectorstore, documents):
        """Measure index creation time"""
        start = time.time()

        if db_name == "FAISS":
            vectorstore.create_index(documents)
        elif db_name == "Chroma":
            vectorstore.create_collection(documents)
        elif db_name == "Pinecone":
            vectorstore.upsert_documents("benchmark", documents)

        elapsed = time.time() - start

        return {
            'db': db_name,
            'operation': 'indexing',
            'num_docs': len(documents),
            'time': elapsed,
            'docs_per_sec': len(documents) / elapsed
        }

    def benchmark_search(self, db_name, vectorstore, queries, k=5):
        """Measure search time"""
        start = time.time()

        for query in queries:
            if db_name == "FAISS":
                vectorstore.search(query, k=k)
            elif db_name == "Chroma":
                vectorstore.search_with_filter(query, k=k)
            elif db_name == "Pinecone":
                vectorstore.search_with_namespace("benchmark", query, k=k)

        elapsed = time.time() - start

        return {
            'db': db_name,
            'operation': 'search',
            'num_queries': len(queries),
            'time': elapsed,
            'queries_per_sec': len(queries) / elapsed,
            'avg_latency_ms': (elapsed / len(queries)) * 1000
        }

    def compare_features(self):
        """Feature comparison table"""
        comparison = {
            'FAISS': {
                'type': 'Local library',
                'deployment': 'Self-hosted',
                'scalability': 'Medium',
                'metadata_filter': 'Limited',
                'cost': 'Free (infrastructure costs only)',
                'best_for': 'Small to medium scale, offline environments'
            },
            'Chroma': {
                'type': 'Local/Server',
                'deployment': 'Self-hosted/Cloud',
                'scalability': 'Medium to High',
                'metadata_filter': 'Powerful',
                'cost': 'Free (open source)',
                'best_for': 'Medium scale, development environments'
            },
            'Pinecone': {
                'type': 'Cloud service',
                'deployment': 'Managed',
                'scalability': 'Very high',
                'metadata_filter': 'Powerful',
                'cost': 'Paid (usage-based)',
                'best_for': 'Large scale, production environments'
            }
        }
        return comparison

    def print_comparison(self):
        """Display comparison results"""
        features = self.compare_features()

        print("=" * 80)
        print("Vector Database Feature Comparison")
        print("=" * 80)

        for db_name, features_dict in features.items():
            print(f"\n[{db_name}]")
            for key, value in features_dict.items():
                print(f"  {key:20s}: {value}")

# Usage example
benchmark = VectorDBBenchmark()

# Display feature comparison
benchmark.print_comparison()

# Test data
test_documents = [
    Document(page_content=f"Document {i}")
    for i in range(1000)
]

test_queries = [f"Query {i}" for i in range(100)]

# Run benchmark for each DB
# faiss_result = benchmark.benchmark_indexing("FAISS", faiss_store, test_documents)
# chroma_result = benchmark.benchmark_indexing("Chroma", chroma_store, test_documents)

print("\nPerformance benchmark complete")

Vector DB Selection Guide:

- FAISS: small to medium collections, offline or air-gapped environments, where everything should run in-process at no licensing cost.
- ChromaDB: development and medium-scale applications that need strong metadata filtering and simple local persistence.
- Pinecone: large-scale production workloads that need a managed, elastically scalable service.

Summary

- Embeddings map text to high-dimensional vectors in which semantic similarity corresponds to geometric closeness, typically measured with cosine similarity.
- Embedding model choice trades off accuracy, dimensionality, speed, and cost; API-based and local models can be compared with the same benchmark harness.
- Large-scale similarity search relies on approximate nearest neighbor indexes such as IVF and HNSW.
- FAISS, ChromaDB, and Pinecone span the range from local library to managed cloud service; choose based on scale, filtering needs, and operational constraints.
