Semantic Deduplication
This example demonstrates how to use the semantic deduplication feature to identify and remove duplicate items from lists based on their meaning.
"""
Semantic Deduplication Example
==============================

This example demonstrates how to use the semantic deduplication feature
to identify and remove duplicate items from lists based on their meaning,
not just their exact byte structure.
"""

import toonverter as toon
from toonverter.analysis.deduplication import SemanticDeduplicator


def main() -> None:
    """Run the semantic deduplication demo on a small in-memory dataset.

    The demo builds a sample company record whose employee list contains
    semantically similar roles, then deduplicates it twice:

    1. through the module-level ``toon.deduplicate`` helper with a
       threshold of 0.7, and
    2. through a ``SemanticDeduplicator`` instance with a threshold of 0.85.

    Results are written to standard output; nothing is returned.
    """
    # Sample data with semantic duplicates
    # "Software Engineer" vs "Software Developer" vs "Programmer"
    # "AI Researcher" vs "Artificial Intelligence Scientist"
    data = {
        "company": "TechCorp",
        "employees": [
            {"id": 1, "role": "Software Engineer", "skills": ["Python", "Rust"]},
            {"id": 2, "role": "HR Manager", "skills": ["Recruiting", "Communication"]},
            {"id": 3, "role": "Software Developer", "skills": ["Python", "C++"]},  # Duplicate of 1
            {"id": 4, "role": "AI Researcher", "skills": ["PyTorch", "TensorFlow"]},
            {"id": 5, "role": "Programmer", "skills": ["Java", "Kotlin"]},  # Duplicate of 1
            {"id": 6, "role": "Artificial Intelligence Scientist", "skills": ["JAX"]},  # Duplicate of 4
        ],
    }

    print("--- Original Data ---")
    # One line per employee; the record contents are intentionally not shown.
    for _ in data["employees"]:
        print("Employee record (details redacted)")

    print("\n--- Running Semantic Deduplication ---")
    # Model name and threshold are passed explicitly here.
    # Threshold 0.7 is relatively loose to catch "Programmer" vs "Software Engineer"
    # NOTE(review): `data` is reused for the second run below; this assumes
    # toon.deduplicate returns a new structure instead of mutating its
    # input -- confirm against the library.
    optimized_data = toon.deduplicate(
        data,
        model_name="all-MiniLM-L6-v2",
        threshold=0.7,
    )

    print("\n--- Deduplicated Data ---")
    # Number the surviving employees starting from 1.
    for i, _ in enumerate(optimized_data["employees"], 1):
        print(f"Employee #{i} (details redacted)")

    # You can also use the class directly for more control
    print("\n--- Using SemanticDeduplicator Class directly ---")
    deduplicator = SemanticDeduplicator(threshold=0.85)
    # Note: With a higher threshold, "Programmer" might not match "Software Engineer"
    # but "Software Engineer" and "Software Developer" likely will.

    optimized_strict = deduplicator.optimize(data)
    print(f"Items remaining with 0.85 threshold: {len(optimized_strict['employees'])}")


if __name__ == "__main__":
    main()