# Quick Start
import os

import pandas as pd
from openai import OpenAI

import dbnl
from dbnl.eval import evaluate
from dbnl.eval.llm import OpenAILLMClient

# 1. Create the LLM client that powers the LLM-as-judge metrics.
base_oai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
oai_client = OpenAILLMClient.from_existing_client(
    base_oai_client, llm_model="gpt-3.5-turbo-0125"
)

# Small demo dataset: three prediction / ground-truth pairs repeated 4x (12 rows).
eval_df = pd.DataFrame(
    [
        {
            "prediction": "France has no capital",
            "ground_truth": "The capital of France is Paris",
        },
        {
            "prediction": "The capital of France is Toronto",
            "ground_truth": "The capital of France is Paris",
        },
        {
            "prediction": "Paris is the capital",
            "ground_truth": "The capital of France is Paris",
        },
    ]
    * 4
)

# 2. Build the text metrics that compare `prediction` against the target
#    (`ground_truth`), including LLM-as-judge metrics driven by oai_client.
text_metrics = dbnl.eval.metrics.text_metrics(
    prediction="prediction", target="ground_truth", eval_llm_client=oai_client
)

# 3. Run the metrics; `evaluate` returns the input frame augmented with one
#    column per metric.
aug_eval_df = evaluate(eval_df, text_metrics)

# 4. Publish the augmented results to dbnl.
dbnl.login(api_token=os.environ["DBNL_API_TOKEN"])
project = dbnl.get_or_create_project(name="DEAL_testing")
cols = dbnl.experimental.get_column_schemas_from_dataframe(aug_eval_df)
run_config = dbnl.create_run_config(project=project, columns=cols)
run = dbnl.create_run(project=project, run_config=run_config)
dbnl.report_results(run=run, data=aug_eval_df)
# Fixed: the original line had a fused scrape artifact ("...close_run(run=run)idx")
# that made the script a SyntaxError; `idx` belongs to the results table below.
dbnl.close_run(run=run)
The augmented dataframe contains the following columns: `idx`, `prediction`, `ground_truth`, and `llm_text_similarity_v0__prediction__ground_truth`.
Was this helpful?

