# Text Share Online (web-paste header, commented out so the file parses)

import numpy as np
import faiss
import os
import httpx
import time
from dotenv import load_dotenv
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd

# ----------------------------- Config -----------------------------
# Pull credentials and endpoints from a .env file into the environment.
load_dotenv()

client_id = os.getenv("client_id")            # OAuth2 client-credentials id
client_secret = os.getenv("client_secret")    # OAuth2 client-credentials secret
token_url = os.getenv("token_url")            # OAuth2 token endpoint
embeddings_url = os.getenv("embeddings_url")  # OpenAI-compatible embeddings endpoint
proxy_url = os.getenv("proxy_url2")           # outbound HTTP proxy (may be None)

# In-process token cache: the token string and its absolute expiry (epoch secs).
access_token_info = {"access_token": None, "expires_at": 0}
print(proxy_url)  # NOTE(review): debug leftover — consider removing or logging instead
def get_access_token(client: httpx.Client):
    """Return a valid OAuth2 access token, refreshing it when necessary.

    Performs the client-credentials grant against ``token_url`` and caches
    the result in the module-level ``access_token_info`` dict.  A cached
    token is reused only while it still has more than 60 seconds of
    lifetime left, so callers never receive an about-to-expire token.

    Args:
        client: an open ``httpx.Client`` used for the token request.

    Returns:
        The access token string, or None when the endpoint's JSON response
        contains no ``access_token`` field.

    Raises:
        httpx.HTTPStatusError: if the token endpoint returns an error status.
    """
    global access_token_info

    # Fast path: cached token still valid (with a 60 s safety margin).
    if access_token_info["access_token"] and access_token_info["expires_at"] > time.time() + 60:
        return access_token_info["access_token"]

    response = client.post(
        token_url,
        data={"grant_type": "client_credentials"},
        auth=(client_id, client_secret),
    )
    # Fail fast instead of trying to parse an HTML error page as JSON.
    response.raise_for_status()

    token_info = response.json()
    access_token = token_info.get("access_token")
    expires_in = token_info.get("expires_in", 3600)  # default lifetime: 1 hour

    access_token_info["access_token"] = access_token
    access_token_info["expires_at"] = time.time() + expires_in
    return access_token

def get_openai_embeddings_with_token(texts, model="text-embedding-ada-002", batch_size=256):
    """Embed one or more texts via the OpenAI-compatible embeddings endpoint.

    Requests are sent in batches of ``batch_size`` through the configured
    proxy, authenticated with a bearer token from :func:`get_access_token`.

    Args:
        texts: a single string or a list of strings to embed.
        model: embedding model name sent to the endpoint.
        batch_size: number of texts per HTTP request.

    Returns:
        ``np.ndarray`` of shape (len(texts), dim), dtype float32, or
        None when no access token could be obtained.

    Raises:
        httpx.HTTPStatusError: if an embeddings request fails.
    """
    # Accept a bare string for convenience; normalise to a list.
    if isinstance(texts, str):
        texts = [texts]
    all_embeddings = []

    # HACK: verify=False disables TLS certificate checks — presumably needed
    # for an internal proxy; confirm and pin a CA bundle if possible.
    with httpx.Client(proxy=proxy_url, verify=False) as client:
        token = get_access_token(client)
        if not token:
            return None

        headers = {"Authorization": f"Bearer {token}"}

        # Batch the inputs to stay under request-size limits.
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            data = {"input": batch, "model": model}
            r = client.post(embeddings_url, json=data, headers=headers)
            r.raise_for_status()
            embs = [x["embedding"] for x in r.json().get("data", [])]
            all_embeddings.extend(embs)

    return np.array(all_embeddings, dtype="float32")

def get_data_multilabel():
    """Return the toy multi-label dataset as (sentences, label_lists).

    Each example pairs a sentence with the list of Olympic-instance labels
    it mentions; the two parallel lists share indices.
    """
    examples = (
        ("Send processed user list for olympicLU.", ["OLY-LU"]),
        ("Do a security review for OLYHK and olympicSG.", ["OLY-HK", "OLY-SG"]),
    )

    sentences = []
    label_lists = []
    for text, labels in examples:
        sentences.append(text)
        label_lists.append(labels)
    return sentences, label_lists

def build_vector_index(sentences, index_path="vector_data.bin"):
    """Embed ``sentences``, build an inner-product FAISS index, and persist it.

    Args:
        sentences: list of texts to embed and index.
        index_path: file the serialized index is written to.  Default kept
            as ``"vector_data.bin"`` for backward compatibility.

    Returns:
        (index, embeddings): the in-memory FAISS index and the float32
        embedding matrix it was built from.
    """
    embeddings = get_openai_embeddings_with_token(sentences)
    dimension = embeddings.shape[1]
    # NOTE(review): IndexFlatIP ranks by raw inner product; normalise the
    # embeddings first if cosine similarity is intended.
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    faiss.write_index(index, index_path)
    return index, embeddings

def load_vector_index(index_path="vector_data.bin"):
    """Load a previously persisted FAISS index from disk.

    Args:
        index_path: path of the serialized index file.  Default kept as
            ``"vector_data.bin"`` for backward compatibility.

    Returns:
        The deserialized FAISS index, or None when the file does not exist.
    """
    if os.path.exists(index_path):
        return faiss.read_index(index_path)
    return None

import numpy as np
import pandas as pd
import faiss
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, classification_report
from tabulate import tabulate

# —————————–

# Your existing helper functions

# —————————–

# get_data_multilabel()

# get_openai_embeddings_with_token()

# etc.

if __name__ == "__main__":
    sentences, labels_list = get_data_multilabel()

    # Binarize the label sets into a multi-hot matrix (one column per class).
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(labels_list)
    class_names = mlb.classes_

    print(f"Total samples available: {len(sentences)}")
    print(f"Number of classes: {len(class_names)}\n")

    # =============================================
    # STEP 1 -- Generate ALL embeddings once
    # =============================================
    print("Generating embeddings for ALL samples...")
    embeddings = get_openai_embeddings_with_token(sentences)
    print("Embeddings shape:", embeddings.shape)

    # Experiment dataset sizes, capped at the data we actually have.
    sizes = [200, 400, 600, 800]
    sizes = [s for s in sizes if s <= len(sentences)]

    print("\n===== STARTING EXPERIMENTS =====\n")

    test_queries = [
        "monaco olympic instance approval",
        "oly sg and oly ch",
        "status for both monaco and germany instances",
        "marketing budget proposal q3",
        "check OLY-LU and OLY-DE status",
    ]

    # One row per experiment size: [size, R@1, R@3, R@5, exact acc, hamming].
    results_table = []

    for size in sizes:
        print("\n\n##########################################")
        print(f"####### EXPERIMENT WITH {size} SAMPLES #####")
        print("##########################################\n")

        # Slice dataset to the first `size` samples.
        X = embeddings[:size]
        Y = y[:size]
        subset_labels = labels_list[:size]

        # =============================================
        # STEP 2 -- Build FAISS index for subset
        # =============================================
        print(f"Building FAISS index for {size} samples...")
        dim = X.shape[1]
        faiss_index = faiss.IndexFlatIP(dim)
        faiss_index.add(X)

        # ------- Compute FAISS Recall@1/3/5 (self-retrieval) -------
        n = X.shape[0]
        # Cap k at the index size; FAISS pads missing neighbours with -1,
        # which would never match i but wastes work.
        D_all, I_all = faiss_index.search(X, min(5, n))
        recall1 = sum(I_all[i][0] == i for i in range(n)) / n * 100
        recall3 = sum(i in I_all[i][:3] for i in range(n)) / n * 100
        recall5 = sum(i in I_all[i][:5] for i in range(n)) / n * 100

        # =============================================
        # STEP 3 -- Train classifier
        # =============================================
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )
        print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

        clf_base = MLPClassifier(
            hidden_layer_sizes=(256, 128),
            activation="relu",
            max_iter=1000,
            alpha=0.001,
            learning_rate_init=0.001,
            batch_size=4,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.1,
            verbose=False,
        )
        # One binary MLP per label -> proper multi-label prediction.
        clf = OneVsRestClassifier(clf_base)
        clf.fit(X_train, y_train)

        # =============================================
        # STEP 4 -- Evaluate classifier
        # =============================================
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        h_loss = hamming_loss(y_test, y_pred)
        print("\n--- CLASSIFIER METRICS ---")
        print(f"Exact Match Accuracy: {acc:.4f}")
        print(f"Hamming Loss: {h_loss:.4f}")
        print("\nClassification Report:")
        print(classification_report(
            y_test, y_pred, target_names=class_names, zero_division=0
        ))

        # Store results for the final comparison table.
        results_table.append([
            size,
            round(recall1, 2),
            round(recall3, 2),
            round(recall5, 2),
            round(acc, 4),
            round(h_loss, 4),
        ])

        # =============================================
        # STEP 5 -- FAISS + classifier for test queries
        # =============================================
        print("\n--- FAISS + CLASSIFIER QUERY RESULTS ---")
        for query in test_queries:
            print(f"\nQuery: '{query}'")
            # Embed the query (already a (1, dim) float32 array).
            q_emb = get_openai_embeddings_with_token(query)
            q_emb_2d = np.array(q_emb).reshape(1, -1)

            # ---------- FAISS ----------
            # Cap k at the index size: FAISS pads with -1 when k > ntotal,
            # and subset_labels[-1] would silently report the wrong item.
            k = min(3, faiss_index.ntotal)
            D, I = faiss_index.search(q_emb_2d, k=k)
            print(" FAISS Top 3:")
            for rank, (score, idx) in enumerate(zip(D[0], I[0])):
                if idx < 0:  # -1 marks "no neighbour"
                    continue
                label_set = subset_labels[idx]
                print(f" {rank+1}. {label_set} (score: {score:.4f})")

            # ---------- CLASSIFIER ----------
            probs = clf.predict_proba(q_emb_2d)[0]
            prob_series = pd.Series(probs, index=class_names).sort_values(ascending=False)
            # Adaptive threshold: mean + 0.5 std of the per-class probabilities.
            mean_prob = probs.mean()
            std_prob = probs.std()
            threshold = mean_prob + 0.5 * std_prob
            predicted_labels = prob_series[prob_series > threshold]

            # Fall back to the single most likely label if nothing clears it.
            if predicted_labels.empty:
                predicted_labels = prob_series.head(1)

            print(" Classifier Prediction:")
            for label, score in predicted_labels.items():
                print(f" - {label}: {score:.4f}")

    # =============================================
    # FINAL COMPARISON TABLE
    # =============================================
    # The table was collected but never displayed (and `tabulate` was
    # imported for exactly this) — print the summary across all sizes.
    print("\n\n===== SUMMARY ACROSS DATASET SIZES =====")
    print(tabulate(
        results_table,
        headers=["Samples", "R@1 %", "R@3 %", "R@5 %", "Exact Acc", "Hamming"],
        tablefmt="github",
    ))

 

 

# Share This: (web-paste footer, commented out so the file parses)