# Indonesian Text Embedding Usage Examples

## 🔍 **Search & Retrieval**

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer("asmud/nomic-embed-indonesian", trust_remote_code=True)

# Indonesian search example
query = "search_query: Bagaimana cara memasak rendang?"
documents = [
    "search_document: Rendang adalah masakan Minangkabau yang dimasak dengan santan dan rempah-rempah",
    "search_document: Nasi goreng adalah makanan yang dibuat dari nasi yang digoreng dengan bumbu",
    "search_document: Sate adalah makanan yang terdiri dari daging yang ditusuk dan dibakar"
]

query_embedding = model.encode([query])
doc_embeddings = model.encode(documents)

similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
best_match = np.argmax(similarities)

print(f"Best match: {documents[best_match]}")
print(f"Similarity score: {similarities[best_match]:.3f}")
```

## 📊 **Text Classification**

```python
# Sentiment analysis
texts = [
    "classification: Produk ini sangat berkualitas dan sesuai dengan harapan saya",
    "classification: Saya sangat kecewa dengan pelayanan yang diberikan",
    "classification: Lumayan bagus, ada beberapa kekurangan tapi overall oke"
]

embeddings = model.encode(texts)

# The embeddings can now be fed to any classifier;
# here unsupervised KMeans groups them without labels as a quick demo
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2)  # Positive vs Negative
labels = kmeans.fit_predict(embeddings)
```

## 🎯 **Clustering Indonesian Content**

```python
# Group similar content
indonesian_texts = [
    "clustering: teknologi kecerdasan buatan dan machine learning",
    "clustering: perkembangan teknologi digital di Indonesia",
    "clustering: makanan tradisional Jawa seperti gudeg dan tahu gimbal",
    "clustering: kuliner khas Sumatera termasuk rendang dan gulai",
    "clustering: politik dan pemerintahan Indonesia",
    "clustering: kebijakan publik dan reformasi birokrasi"
]

embeddings = model.encode(indonesian_texts)

from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=3)
labels = clustering.fit_predict(embeddings)

# Group texts by cluster
for cluster_id in set(labels):
    print(f"\nCluster {cluster_id}:")
    for i, text in enumerate(indonesian_texts):
        if labels[i] == cluster_id:
            print(f"  - {text}")
```

## 🔗 **Semantic Similarity**

```python
# Find similar Indonesian sentences
sentences = [
    "Jakarta adalah ibukota Indonesia",
    "Ibukota negara Indonesia adalah Jakarta",
    "Saya suka makan nasi goreng",
    "Cuaca hari ini sangat panas",
    "Hari ini udaranya sangat panas"
]

embeddings = model.encode(sentences)
similarity_matrix = cosine_similarity(embeddings)

print("Similarity Matrix:")
for i, sent1 in enumerate(sentences):
    for j, sent2 in enumerate(sentences):
        if i < j:  # Only upper triangle
            sim = similarity_matrix[i][j]
            print(f"{sim:.3f}: '{sent1}' <-> '{sent2}'")
```

## 🏢 **Business Applications**

### Customer Support Ticket Routing

```python
# Route customer complaints to appropriate departments
support_tickets = [
    "search_query: Masalah pembayaran dengan kartu kredit tidak bisa diproses",
    "search_query: Aplikasi sering crash dan tidak bisa dibuka",
    "search_query: Pesanan belum sampai padahal sudah lewat estimasi"
]

departments = [
    "search_document: Tim finance menangani masalah pembayaran, refund, dan billing",
    "search_document: Tim technical support menangani bug aplikasi dan masalah teknis",
    "search_document: Tim logistics menangani pengiriman, tracking, dan fulfillment"
]

ticket_embeddings = model.encode(support_tickets)
dept_embeddings = model.encode(departments)

for i, ticket in enumerate(support_tickets):
    similarities = cosine_similarity([ticket_embeddings[i]], dept_embeddings)[0]
    best_dept = np.argmax(similarities)
    print(f"Ticket: {ticket}")
    print(f"Route to: {departments[best_dept]}")
    print(f"Confidence: {similarities[best_dept]:.3f}\n")
```

### Content Recommendation

```python
# Recommend similar articles
user_interest = "search_query: Teknologi AI untuk pendidikan"

articles = [
    "search_document: Penerapan machine learning dalam sistem pembelajaran adaptif di sekolah",
    "search_document: Resep masakan tradisional Indonesia yang mudah dibuat di rumah",
    "search_document: Startup EdTech Indonesia menggunakan AI untuk personalisasi belajar",
    "search_document: Tips kesehatan untuk menjaga imunitas tubuh di musim hujan"
]

interest_embedding = model.encode([user_interest])
article_embeddings = model.encode(articles)

similarities = cosine_similarity(interest_embedding, article_embeddings)[0]
ranked_articles = sorted(zip(articles, similarities), key=lambda x: x[1], reverse=True)

print("Recommended articles:")
for article, score in ranked_articles:
    print(f"{score:.3f}: {article}")
```

## 📈 **Performance Tips**

1. **Batch Processing**: Encode multiple texts at once for better performance

   ```python
   # Good: batch processing
   texts = ["text1", "text2", "text3", ...]
   embeddings = model.encode(texts)  # Process all at once

   # Avoid: one-by-one processing
   embeddings = [model.encode([text]) for text in texts]  # Slower
   ```

2. **Caching**: Cache embeddings for repeated use

   ```python
   import pickle

   # Compute once
   embeddings = model.encode(large_text_corpus)

   # Save for reuse
   with open('embeddings.pkl', 'wb') as f:
       pickle.dump(embeddings, f)

   # Load when needed
   with open('embeddings.pkl', 'rb') as f:
       cached_embeddings = pickle.load(f)
   ```

3. **GPU Acceleration**: Use GPU for faster inference (if available)

   ```python
   import torch

   device = 'cuda' if torch.cuda.is_available() else 'cpu'
   model = SentenceTransformer("asmud/nomic-embed-indonesian",
                               trust_remote_code=True, device=device)
   ```
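4. **Putting It Together**: A minimal sketch combining the tips above. The corpus, `batch_size=64`, and the top-5 cutoff are illustrative placeholders, not recommendations from the model card. With `normalize_embeddings=True`, every output vector has unit length, so cosine similarity reduces to a single dot product.

   ```python
   import numpy as np
   import torch
   from sentence_transformers import SentenceTransformer

   device = 'cuda' if torch.cuda.is_available() else 'cpu'
   model = SentenceTransformer("asmud/nomic-embed-indonesian",
                               trust_remote_code=True, device=device)

   # Illustrative corpus; replace with your own documents
   corpus = [f"search_document: dokumen contoh nomor {i}" for i in range(1000)]

   # Batched encoding with unit-length output vectors
   doc_embeddings = model.encode(corpus, batch_size=64,
                                 normalize_embeddings=True,
                                 show_progress_bar=True)

   query_embedding = model.encode(["search_query: contoh kueri"],
                                  normalize_embeddings=True)

   # Normalized vectors: cosine similarity == dot product
   scores = query_embedding @ doc_embeddings.T
   top_5 = np.argsort(-scores[0])[:5]
   for idx in top_5:
       print(f"{scores[0][idx]:.3f}: {corpus[idx]}")
   ```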