This Python script performs the following tasks:

- Loads email data from a JSON file (full_emails.json).
- Cleans each email and wraps it in Document objects for further processing.
- Generates vector embeddings with OpenAIEmbeddings.
- Stores the embeddings in a FAISS vector database for fast similarity search.
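The script expects full_emails.json to contain a JSON array of email objects with content, id, and date fields. A minimal illustrative record (field values are made up; the date is a Unix timestamp in milliseconds, matching the conversion done later in the script):

[
  {
    "id": "msg-001",
    "date": 1697040000000,
    "content": "Hi team, the quarterly numbers are attached ..."
  }
]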
#!/usr/bin/env python
import json
import re
from datetime import datetime
from langchain_community.document_loaders import JSONLoader
from langchain_openai import OpenAIEmbeddings # Updated import
from langchain_community.vectorstores import FAISS
from langchain.schema import Document # Import Document class
# Define jq schema to extract relevant fields from the JSON
jq_schema = ".[] | {page_content: .content, metadata: {id: .id, date: .date}}"
# Initialize the JSONLoader with the schema
loader = JSONLoader("full_emails.json", jq_schema=jq_schema, text_content=False)
documents = loader.load()
# Initialize OpenAI embeddings (requires the OPENAI_API_KEY environment variable)
embeddings = OpenAIEmbeddings()
# Clean email content function
def clean_content(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'-{2,}', '-', text)  # Collapse runs of dashes into a single dash
    # Strip common footer boilerplate (unsubscribe notices and similar)
    text = re.sub(r'(unsubscribe.*|learn why we included.*|you are receiving.*)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[image:.*?\]|&\w+;', '', text)  # Remove image placeholders and HTML entities
    return re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace
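# For example (illustrative input), clean_content(
#     "Check http://example.com --- thanks! Unsubscribe at any time")
# returns "Check - thanks!": the URL is removed, the dash run is collapsed,
# and the unsubscribe footer is stripped.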
# Convert the cleaned content into Document objects
processed_docs = []
for doc in documents:
    try:
        content_data = json.loads(doc.page_content)  # Parse the serialized record
        content = clean_content(content_data.get("page_content", "No Content"))
        metadata = content_data.get("metadata", {})
        email_id = metadata.get("id", "Unknown ID")
        timestamp = int(metadata.get("date", 0)) / 1000  # Convert milliseconds to seconds
        date = datetime.fromtimestamp(timestamp).isoformat() if timestamp > 0 else "Unknown"
        # Create a Document object holding the cleaned content and metadata
        processed_docs.append(
            Document(page_content=content, metadata={"id": email_id, "date": date})
        )
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON content: {e}")
# Create FAISS vector store from the Document objects (embeds every document via the OpenAI API)
vector_store = FAISS.from_documents(processed_docs, embeddings)
# Save the FAISS index to disk
faiss_index_path = "faiss_index"
vector_store.save_local(faiss_index_path)
print(f"FAISS index saved to '{faiss_index_path}'")
This script processes a set of emails and builds a vector index for efficient search and retrieval using FAISS.
The clean_content() function removes unwanted elements like URLs, extra dashes, and boilerplate text (e.g., unsubscribe messages). Each cleaned email is stored as a Document object, which holds both the cleaned content and its metadata. When the script is run, it displays the following message:
FAISS index saved to 'faiss_index'
This script leverages OpenAI embeddings and the FAISS vector store to process and index email data, allowing for efficient similarity searches. This setup can be extended to support various document types or integrate with other machine learning models for advanced use cases.
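As a starting point for such extensions, the vector store can also be exposed as a LangChain retriever for use in retrieval chains. A brief sketch, assuming the vector_store object from the script above and a recent LangChain version (where retrievers support invoke()); the query is illustrative:

retriever = vector_store.as_retriever(search_kwargs={"k": 5})
relevant_emails = retriever.invoke("invoices from last quarter")  # Illustrative query
for email in relevant_emails:
    print(email.metadata["id"], email.page_content[:80])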