Create a BM25 Index in Python
#!/usr/bin/env python
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh import qparser
from whoosh.qparser import QueryParser
from whoosh.analysis import StemmingAnalyzer
import os
# Schema describing the fields recorded for every indexed document.
schema = Schema(
    # Title, stored so it can be displayed alongside a search hit.
    title=TEXT(stored=True),
    # Body text, stored and analyzed with stemming so that different
    # inflections of a word are indexed under the same term.
    content=TEXT(stored=True, analyzer=StemmingAnalyzer()),
    # Unique identifier of the document.
    path=ID(stored=True, unique=True),
)

# Ensure the index directory exists. exist_ok=True replaces the separate
# "check, then create" steps, which could fail if the directory appeared
# between the check and the mkdir call.
os.makedirs("indexdir", exist_ok=True)

# NOTE: create_in() always starts a new, empty index in the directory and
# clears any index that is already there; use whoosh.index.open_dir() to
# reuse an existing index instead.
index = create_in("indexdir", schema)
# Example documents to index; the keys of each dict are the schema field names.
documents = [
    {"title": "Document 1", "content": "The quick brown fox jumps over the lazy dog.", "path": "/a"},
    {"title": "Document 2", "content": "Whoosh is a fast search library implemented in pure Python.", "path": "/b"},
    {"title": "Document 3", "content": "The fox is quick and jumps high.", "path": "/c"},
]

# Use the writer as a context manager: it commits when the block finishes
# normally and cancels (releasing the index write lock) if adding a document
# raises, so a failure cannot leave the index locked.
with index.writer() as writer:
    for doc in documents:
        # Each dict's keys are exactly the field names, so it can be unpacked.
        writer.add_document(**doc)

print("Indexing completed.")
Search the BM25 Index
#!/usr/bin/env python
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
def search(query_str, *, index_dir="indexdir", limit=10):
    """Search the ``content`` field of an index and print the matching documents.

    Hits are ranked by the searcher's default weighting (BM25F in Whoosh).

    Args:
        query_str: Query text, parsed with Whoosh's default query syntax.
        index_dir: Directory containing the index to open.
        limit: Maximum number of hits to retrieve; passed straight to
            ``Searcher.search``.
    """
    ix = open_dir(index_dir)
    try:
        # Parsing only needs the schema, so do it before opening a searcher.
        query = QueryParser("content", ix.schema).parse(query_str)

        # The searcher holds open files; the context manager closes them.
        with ix.searcher() as searcher:
            results = searcher.search(query, limit=limit)

            # Stored fields are only readable while the searcher is open,
            # so the hits are printed inside the ``with`` block.
            print(f"Search results for '{query_str}':")
            for hit in results:
                print(f"Title: {hit['title']}, Path: {hit['path']}")
                print(f"Content: {hit['content']}")
                print("-" * 40)
    finally:
        # Release any resources held by the index object itself.
        ix.close()
# Run the demo queries, in order, against the index.
for example_query in ("quick fox", "search library"):
    search(example_query)
Output
Search results for 'quick fox':
Title: Document 3, Path: /c
Content: The fox is quick and jumps high.
----------------------------------------
Title: Document 1, Path: /a
Content: The quick brown fox jumps over the lazy dog.
----------------------------------------
Search results for 'search library':
Title: Document 2, Path: /b
Content: Whoosh is a fast search library implemented in pure Python.
----------------------------------------