#!/usr/bin/env python
#-------------------------------------------------------------------------------------#
# TF-IDF (Term Frequency-Inverse Document Frequency)
#-------------------------------------------------------------------------------------#
from sklearn.feature_extraction.text import TfidfVectorizer
from prettytable import PrettyTable
# Example documents: a tiny corpus whose vocabulary only partially overlaps,
# so the effect of document frequency on the scores is easy to see.
docs = [
    "This is a sample document.",
    "This document is another sample document.",
    "Machine Learning Document",
]

# Initialize the vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

# Fit the model (learn the vocabulary and IDF weights) and transform the text
# data into a sparse TF-IDF matrix with one row per document.
tfidf_matrix = vectorizer.fit_transform(docs)

# Get feature names (terms); their order matches the matrix columns.
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array so rows can be indexed directly.
tfidf_array = tfidf_matrix.toarray()

# Initialize PrettyTable with one label column followed by one column per term.
table = PrettyTable()
table.field_names = ["Document"] + list(feature_names)

# Add one row per document: a 1-based label plus its TF-IDF values rounded to
# 4 decimals to keep the table readable.
for i, scores in enumerate(tfidf_array, start=1):
    row = [f"Document {i}"] + [round(value, 4) for value in scores]
    table.add_row(row)

print(table)
+------------+---------+----------+--------+----------+---------+--------+--------+
|  Document  | another | document |   is   | learning | machine | sample |  this  |
+------------+---------+----------+--------+----------+---------+--------+--------+
| Document 1 |   0.0   |  0.4091  | 0.5268 |   0.0    |   0.0   | 0.5268 | 0.5268 |
| Document 2 |  0.492  |  0.5812  | 0.3742 |   0.0    |   0.0   | 0.3742 | 0.3742 |
| Document 3 |   0.0   |  0.3854  |  0.0   |  0.6525  |  0.6525 |  0.0   |  0.0   |
+------------+---------+----------+--------+----------+---------+--------+--------+
Higher TF-IDF values: The higher the TF-IDF score for a term in a document, the more relevant or important that term is to that specific document.
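Lower TF-IDF values: Terms with lower TF-IDF scores either occur less often in that document or appear in many documents across the corpus, which reduces their usefulness for distinguishing between documents. For example, "document" appears in all three documents, so in Document 1 it scores lower (0.4091) than "is", "sample" and "this" (0.5268 each), which appear in only two.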
| Document | another | document | is | learning | machine | sample | this |
|------------|---------|----------|--------|----------|---------|--------|--------|
| Document 1 | 0.0 | 0.4091 | 0.5268 | 0.0 | 0.0 | 0.5268 | 0.5268 |
| Document | another | document | is | learning | machine | sample | this |
|------------|---------|----------|--------|----------|---------|--------|--------|
| Document 2 | 0.492 | 0.5812 | 0.3742 | 0.0 | 0.0 | 0.3742 | 0.3742 |
| Document | another | document | is | learning | machine | sample | this |
|------------|---------|----------|--------|----------|---------|--------|--------|
| Document 3 | 0.0 | 0.3854 | 0.0 | 0.6525 | 0.6525 | 0.0 | 0.0 |
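With the settings used here, each TF-IDF score lies between 0.0 and 1.0. The raw product of term frequency and inverse document frequency has no fixed upper bound and can exceed 1 depending on the data; in practice, however, scikit-learn's `TfidfVectorizer` L2-normalizes each document vector by default (`norm='l2'`), so the values stay in this range: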