import numpy as np
import faiss
import os
import httpx
import time
from dotenv import load_dotenv
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
load_dotenv()

# ——————- Config ——————-
# OAuth2 client-credentials settings and service endpoints, all read from
# the environment (populated via .env by load_dotenv above).
client_id = os.getenv("client_id")
client_secret = os.getenv("client_secret")
token_url = os.getenv("token_url")
embeddings_url = os.getenv("embeddings_url")
proxy_url = os.getenv("proxy_url2")

# In-process cache for the bearer token; expires_at is a Unix timestamp.
access_token_info = {"access_token": None, "expires_at": 0}

# NOTE(review): printing the proxy URL at import time may leak internal
# infrastructure details into logs — consider removing before shipping.
print(proxy_url)
def get_access_token(client: httpx.Client):
    """Return a cached OAuth2 access token, refreshing it when near expiry.

    Performs the client-credentials grant against ``token_url`` using the
    module-level ``client_id``/``client_secret``. The token is cached in the
    module-level ``access_token_info`` dict and reused until 60 seconds
    before its reported expiry.

    Args:
        client: An open ``httpx.Client`` to issue the token request with.

    Returns:
        The access token string, or ``None`` if the endpoint response
        contained no ``access_token`` field.

    Raises:
        httpx.HTTPStatusError: If the token endpoint returns an error status.
    """
    global access_token_info
    # Reuse the cached token while still valid (60 s safety margin).
    if access_token_info["access_token"] and access_token_info["expires_at"] > time.time() + 60:
        return access_token_info["access_token"]
    response = client.post(
        token_url, data={"grant_type": "client_credentials"}, auth=(client_id, client_secret)
    )
    # Fail fast on HTTP errors instead of parsing an error body as a token.
    response.raise_for_status()
    token_info = response.json()
    access_token = token_info.get("access_token")
    expires_in = token_info.get("expires_in", 3600)
    access_token_info["access_token"] = access_token
    access_token_info["expires_at"] = time.time() + expires_in
    return access_token
def get_openai_embeddings_with_token(texts, model="text-embedding-ada-002", batch_size=256):
    """Return float32 embeddings for *texts* from the embeddings endpoint.

    Args:
        texts: A single string or a list of strings to embed.
        model: Embedding model name sent to the endpoint.
        batch_size: Maximum number of texts per request.

    Returns:
        ``np.ndarray`` of dtype float32 with one row per input text, or
        ``None`` when no access token could be obtained.

    Raises:
        httpx.HTTPStatusError: If an embeddings request fails.
    """
    # Accept a bare string for convenience.
    if isinstance(texts, str):
        texts = [texts]
    all_embeddings = []
    # SECURITY NOTE(review): verify=False disables TLS certificate checks;
    # only acceptable behind a trusted internal proxy — confirm before use.
    with httpx.Client(proxy=proxy_url, verify=False) as client:
        token = get_access_token(client)
        if not token:
            return None
        headers = {"Authorization": f"Bearer {token}"}
        # Send texts in batches to stay within request-size limits.
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            data = {"input": batch, "model": model}
            r = client.post(embeddings_url, json=data, headers=headers)
            r.raise_for_status()
            embs = [x["embedding"] for x in r.json().get("data", [])]
            all_embeddings.extend(embs)
    return np.array(all_embeddings, dtype="float32")
def get_data_multilabel():
    """Return the toy multilabel dataset as parallel lists.

    Returns:
        Tuple of (list[str], list[list[str]]): example sentences and the
        list of instance labels attached to each sentence.
    """
    all_examples = [
        ("Send processed user list for olympicLU.", ["OLY-LU"]),
        ("Do a security review for OLYHK and olympicSG.", ["OLY-HK", "OLY-SG"])
    ]
    sentences = [item[0] for item in all_examples]
    labels_list_of_lists = [item[1] for item in all_examples]
    return sentences, labels_list_of_lists
def build_vector_index(sentences):
    """Embed *sentences*, build an inner-product FAISS index, and persist it.

    The index is serialized to ``vector_data.bin`` in the working directory.

    Args:
        sentences: List of strings to embed and index.

    Returns:
        Tuple of (faiss.IndexFlatIP, np.ndarray): the built index and the
        float32 embedding matrix it was built from.

    Raises:
        ValueError: If embeddings could not be obtained (e.g. no token).
    """
    embeddings = get_openai_embeddings_with_token(sentences)
    if embeddings is None:
        # Fail loudly instead of crashing later with an AttributeError.
        raise ValueError("failed to obtain embeddings (no access token?)")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    faiss.write_index(index, "vector_data.bin")
    return index, embeddings
def load_vector_index():
    """Load the persisted FAISS index from ``vector_data.bin`` if present.

    Returns:
        The deserialized index, or ``None`` when the file does not exist.
    """
    if os.path.exists("vector_data.bin"):
        return faiss.read_index("vector_data.bin")
    return None
# NOTE(review): the imports below duplicate those at the top of the file
# (numpy, pandas, faiss, and the sklearn modules); only `tabulate` is new
# here. Consider consolidating everything into the top-level import block.
import numpy as np
import pandas as pd
import faiss
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, classification_report
from tabulate import tabulate
# —————————–
# Your existing helper functions
# —————————–
# get_data_multilabel()
# get_openai_embeddings_with_token()
# etc.
if __name__ == "__main__":
    # Load the dataset and binarize the per-sentence label lists into a
    # multilabel indicator matrix.
    sentences, labels_list = get_data_multilabel()

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(labels_list)
    class_names = mlb.classes_

    print(f"Total samples available: {len(sentences)}")
    print(f"Number of classes: {len(class_names)}\n")

    # =============================================
    # STEP 1 — Generate ALL embeddings once
    # =============================================
    print("Generating embeddings for ALL samples...")
    embeddings = get_openai_embeddings_with_token(sentences)
    print("Embeddings shape:", embeddings.shape)

    # Experiment dataset sizes; drop any larger than the dataset itself.
    sizes = [200, 400, 600, 800]
    sizes = [s for s in sizes if s <= len(sentences)]

    print("\n===== STARTING EXPERIMENTS =====\n")

    test_queries = [
        "monaco olympic instance approval",
        "oly sg and oly ch",
        "status for both monaco and germany instances",
        "marketing budget proposal q3",
        "check OLY-LU and OLY-DE status"
    ]

    # Summary rows: [size, recall@1, recall@3, recall@5, accuracy, hamming].
    results_table = []

    for size in sizes:
        print(f"\n\n##########################################")
        print(f"####### EXPERIMENT WITH {size} SAMPLES #####")
        print(f"##########################################\n")

        # Slice the dataset to the first `size` samples.
        X = embeddings[:size]
        Y = y[:size]
        subset_labels = labels_list[:size]

        # =============================================
        # STEP 2 — Build FAISS index for subset
        # =============================================
        print(f"Building FAISS index for {size} samples...")
        dim = X.shape[1]
        faiss_index = faiss.IndexFlatIP(dim)
        faiss_index.add(X)

        # ——- Compute FAISS Recall@1/3/5 ——-
        # Self-retrieval check: each vector should find itself in the top-k.
        D_all, I_all = faiss_index.search(X, 5)
        n = X.shape[0]
        recall1 = sum(I_all[i][0] == i for i in range(n)) / n * 100
        recall3 = sum(i in I_all[i][:3] for i in range(n)) / n * 100
        recall5 = sum(i in I_all[i][:5] for i in range(n)) / n * 100

        # =============================================
        # STEP 3 — Train classifier
        # =============================================
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )
        print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

        clf_base = MLPClassifier(
            hidden_layer_sizes=(256, 128),
            activation="relu",
            max_iter=1000,
            alpha=0.001,
            learning_rate_init=0.001,
            batch_size=4,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.1,
            verbose=False
        )
        # One binary MLP per label → multilabel prediction.
        clf = OneVsRestClassifier(clf_base)
        clf.fit(X_train, y_train)

        # =============================================
        # STEP 4 — Evaluate classifier
        # =============================================
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        h_loss = hamming_loss(y_test, y_pred)

        print("\n— CLASSIFIER METRICS —")
        print(f"Exact Match Accuracy: {acc:.4f}")
        print(f"Hamming Loss: {h_loss:.4f}")

        print("\nClassification Report:")
        print(classification_report(
            y_test, y_pred, target_names=class_names, zero_division=0
        ))

        # Store results for the final comparison table.
        results_table.append([
            size,
            round(recall1, 2),
            round(recall3, 2),
            round(recall5, 2),
            round(acc, 4),
            round(h_loss, 4)
        ])

        # =============================================
        # STEP 5 — FAISS + classifier for test queries
        # =============================================
        print("\n— FAISS + CLASSIFIER QUERY RESULTS —")
        for query in test_queries:
            print(f"\nQuery: '{query}'")

            # Embed the query; reshape is defensive (embedder already
            # returns a 2-D array for a single string — TODO confirm).
            q_emb = get_openai_embeddings_with_token(query)
            q_emb_2d = np.array(q_emb).reshape(1, -1)

            # ———- FAISS ———-
            D, I = faiss_index.search(q_emb_2d, k=3)
            print(" FAISS Top 3:")
            for rank, (score, idx) in enumerate(zip(D[0], I[0])):
                label_set = subset_labels[idx]
                print(f" {rank+1}. {label_set} (score: {score:.4f})")

            # ———- CLASSIFIER ———-
            probs = clf.predict_proba(q_emb_2d)[0]
            prob_series = pd.Series(probs, index=class_names).sort_values(ascending=False)

            # Adaptive threshold: keep labels clearly above the mean score.
            mean_prob = probs.mean()
            std_prob = probs.std()
            threshold = mean_prob + 0.5 * std_prob
            predicted_labels = prob_series[prob_series > threshold]
            if predicted_labels.empty:
                # Always emit at least the single most likely label.
                predicted_labels = prob_series.head(1)

            print(" Classifier Prediction:")
            for label, score in predicted_labels.items():
                print(f" - {label}: {score:.4f}")