Semantic Deduplication

This example demonstrates how to use the semantic deduplication feature to identify and remove duplicate items from lists based on their meaning.

 1"""
 2Semantic Deduplication Example
 3=============================
 4
 5This example demonstrates how to use the semantic deduplication feature
 6to identify and remove duplicate items from lists based on their meaning,
 7not just their exact byte structure.
 8"""
 9
10import toonverter as toon
11from toonverter.analysis.deduplication import SemanticDeduplicator
12
def main():
    """Run the semantic deduplication demo on a small employee dataset.

    Prints one placeholder line per original employee record, deduplicates
    the data through ``toon.deduplicate`` with a 0.7 threshold and prints one
    placeholder line per remaining record, then runs ``SemanticDeduplicator``
    directly with a 0.85 threshold and prints how many employees remain.
    """
    # Sample data with semantic duplicates
    # "Software Engineer" vs "Software Developer" vs "Programmer"
    # "AI Researcher" vs "Artificial Intelligence Scientist"
    data = {
        "company": "TechCorp",
        "employees": [
            {"id": 1, "role": "Software Engineer", "skills": ["Python", "Rust"]},
            {"id": 2, "role": "HR Manager", "skills": ["Recruiting", "Communication"]},
            {"id": 3, "role": "Software Developer", "skills": ["Python", "C++"]},  # Duplicate of 1
            {"id": 4, "role": "AI Researcher", "skills": ["PyTorch", "TensorFlow"]},
            {"id": 5, "role": "Programmer", "skills": ["Java", "Kotlin"]},  # Duplicate of 1
            {"id": 6, "role": "Artificial Intelligence Scientist", "skills": ["JAX"]},  # Duplicate of 4
        ],
    }

    print("--- Original Data ---")
    # Only the record count matters here; the record contents are not printed.
    for _ in data["employees"]:
        print("Employee record (details redacted)")

    print("\n--- Running Semantic Deduplication ---")
    # The model is passed explicitly (all-MiniLM-L6-v2).
    # Threshold 0.7 is relatively loose to catch "Programmer" vs "Software Engineer"
    optimized_data = toon.deduplicate(
        data,
        model_name="all-MiniLM-L6-v2",
        threshold=0.7,
    )

    print("\n--- Deduplicated Data ---")
    for i, _ in enumerate(optimized_data["employees"], 1):
        print(f"Employee #{i} (details redacted)")

    # You can also use the class directly for more control
    print("\n--- Using SemanticDeduplicator Class directly ---")
    deduplicator = SemanticDeduplicator(threshold=0.85)
    # Note: With a higher threshold, "Programmer" might not match "Software Engineer"
    # but "Software Engineer" and "Software Developer" likely will.

    # NOTE(review): `data` was already passed to toon.deduplicate above; if that
    # call mutates its input in place, this run starts from deduplicated data.
    # Confirm against the library before relying on the printed count.
    optimized_strict = deduplicator.optimize(data)
    print(f"Items remaining with 0.85 threshold: {len(optimized_strict['employees'])}")
54
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()