Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix evaluations and test set generation to use json_object output format from LLMs #787

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,15 @@ internal sealed class AnswerCorrectnessEvaluator : EvaluationEngine
/// <summary>
/// Prompt function built from the "Extraction"/"Statements" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// </remarks>
private KernelFunction ExtractStatements
{
    get
    {
        var prompt = this.GetSKPrompt("Extraction", "Statements");
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        return this._kernel.CreateFunctionFromPrompt(prompt, settings, functionName: nameof(this.ExtractStatements));
    }
}

/// <summary>
/// Prompt function built from the "Evaluation"/"Correctness" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// </remarks>
private KernelFunction EvaluateCorrectness
{
    get
    {
        var prompt = this.GetSKPrompt("Evaluation", "Correctness");
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        return this._kernel.CreateFunctionFromPrompt(prompt, settings, functionName: nameof(this.EvaluateCorrectness));
    }
}

public AnswerCorrectnessEvaluator(Kernel kernel)
Expand All @@ -42,7 +46,7 @@ internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Di
{ "answer", answer.Result }
}).ConfigureAwait(false);

return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
return JsonSerializer.Deserialize<StatementExtraction>(extraction.GetValue<string>()!);
}).ConfigureAwait(false);

if (statements is null)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.

#pragma warning disable IDE0130 // reduce number of "using" statements
// ReSharper disable CheckNamespace
using System.Collections.Generic;
using System.Text.Json.Serialization;

namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;

#pragma warning disable CA1812 // Created through JsonSerializer.Deserialize rather than a direct constructor call.
/// <summary>
/// JSON payload wrapper for a list of extracted statements.
/// Maps the "statements" property of a JSON object to <see cref="Statements"/>.
/// </summary>
internal sealed class StatementExtraction
#pragma warning restore CA1812
{
    /// <summary>
    /// Extracted statements; an empty list until a value is assigned.
    /// </summary>
    [JsonPropertyName("statements")]
    public List<string> Statements { get; set; } = new();
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ internal sealed class ContextRecallEvaluator : EvaluationEngine
/// <summary>
/// Prompt function built from the "Evaluation"/"ContextRecall" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// </remarks>
private KernelFunction EvaluateContextRecall
{
    get
    {
        var prompt = this.GetSKPrompt("Evaluation", "ContextRecall");
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        return this._kernel.CreateFunctionFromPrompt(prompt, settings, functionName: nameof(this.EvaluateContextRecall));
    }
}

public ContextRecallEvaluator(Kernel kernel)
Expand All @@ -29,7 +31,7 @@ public ContextRecallEvaluator(Kernel kernel)

internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
{
var evaluations = await this.Try(3, async (remainingTry) =>
var classification = await this.Try(3, async (remainingTry) =>
{
var extraction = await this.EvaluateContextRecall.InvokeAsync(this._kernel, new KernelArguments
{
Expand All @@ -38,16 +40,16 @@ internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Di
{ "ground_truth", testSet.GroundTruth }
}).ConfigureAwait(false);

return JsonSerializer.Deserialize<IEnumerable<GroundTruthClassification>>(extraction.GetValue<string>()!);
return JsonSerializer.Deserialize<GroundTruthClassifications>(extraction.GetValue<string>()!);
}).ConfigureAwait(false);

if (evaluations is null)
if (classification is null)
{
return 0;
}

metadata.Add($"{nameof(ContextRecallEvaluator)}-Evaluation", evaluations);
metadata.Add($"{nameof(ContextRecallEvaluator)}-Evaluation", classification);

return (float)evaluations.Count(c => c.Attributed > 0) / (float)evaluations.Count();
return (float)classification.Evaluations.Count(c => c.Attributed > 0) / (float)classification.Evaluations.Count;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.

#pragma warning disable IDE0130 // reduce number of "using" statements
// ReSharper disable CheckNamespace
using System.Collections.Generic;
using System.Text.Json.Serialization;

namespace Microsoft.KernelMemory.Evaluators.ContextRecall;

#pragma warning disable CA1812 // Created through JsonSerializer.Deserialize rather than a direct constructor call.
/// <summary>
/// JSON payload wrapper for a list of ground-truth classifications.
/// Maps the "evaluations" property of a JSON object to <see cref="Evaluations"/>.
/// </summary>
internal sealed class GroundTruthClassifications
#pragma warning restore CA1812
{
    /// <summary>
    /// Per-statement classifications; an empty list until a value is assigned.
    /// </summary>
    [JsonPropertyName("evaluations")]
    public List<GroundTruthClassification> Evaluations { get; set; } = new List<GroundTruthClassification>();
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ internal sealed class ContextRelevancyEvaluator : EvaluationEngine
/// <summary>
/// Prompt function built from the "Evaluation"/"ContextPrecision" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// The function is given an explicit name so it is identified by the property
/// name instead of being left unnamed.
/// </remarks>
private KernelFunction EvaluateContext => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "ContextPrecision"), new OpenAIPromptExecutionSettings
{
    Temperature = 1e-8f,
    Seed = 0,
    ResponseFormat = "json_object"
}, functionName: nameof(this.EvaluateContext));

public ContextRelevancyEvaluator(Kernel kernel)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.

#pragma warning disable IDE0130 // reduce number of "using" statements
// ReSharper disable CheckNamespace
using System.Collections.Generic;
using System.Text.Json.Serialization;

namespace Microsoft.KernelMemory.Evaluators.Faithfulness;

#pragma warning disable CA1812 // Created through JsonSerializer.Deserialize rather than a direct constructor call.
/// <summary>
/// JSON payload wrapper for a list of statement evaluations.
/// Maps the "evaluations" property of a JSON object to <see cref="Evaluations"/>.
/// </summary>
internal sealed class FaithfulnessEvaluations
#pragma warning restore CA1812
{
    /// <summary>
    /// Per-statement evaluations; an empty list until a value is assigned.
    /// </summary>
    [JsonPropertyName("evaluations")]
    public List<StatementEvaluation> Evaluations { get; set; } = new List<StatementEvaluation>();
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Text.Json;
using System.Threading.Tasks;
using Microsoft.KernelMemory.Evaluation;
using Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;

Expand All @@ -20,11 +21,15 @@ internal sealed class FaithfulnessEvaluator : EvaluationEngine
/// <summary>
/// Prompt function built from the "Extraction"/"Statements" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// </remarks>
private KernelFunction ExtractStatements
{
    get
    {
        var prompt = this.GetSKPrompt("Extraction", "Statements");
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        return this._kernel.CreateFunctionFromPrompt(prompt, settings, functionName: nameof(this.ExtractStatements));
    }
}

/// <summary>
/// Prompt function built from the "Evaluation"/"Faithfulness" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// </remarks>
private KernelFunction FaithfulnessEvaluation
{
    get
    {
        var prompt = this.GetSKPrompt("Evaluation", "Faithfulness");
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        return this._kernel.CreateFunctionFromPrompt(prompt, settings, functionName: nameof(this.FaithfulnessEvaluation));
    }
}

public FaithfulnessEvaluator(Kernel kernel)
Expand All @@ -34,18 +39,18 @@ public FaithfulnessEvaluator(Kernel kernel)

internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, object?> metadata)
{
var statements = await this.Try(3, async (remainingTry) =>
var extraction = await this.Try(3, async (remainingTry) =>
{
var extraction = await this.ExtractStatements.InvokeAsync(this._kernel, new KernelArguments
{
{ "question", answer.Question },
{ "answer", answer.Result }
}).ConfigureAwait(false);

return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
return JsonSerializer.Deserialize<StatementExtraction>(extraction.GetValue<string>()!);
}).ConfigureAwait(false);

if (statements is null)
if (extraction is null)
{
return 0;
}
Expand All @@ -56,10 +61,10 @@ internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, obje
{
{ "context", string.Join(Environment.NewLine, answer.RelevantSources.SelectMany(c => c.Partitions.Select(p => p.Text))) },
{ "answer", answer.Result },
{ "statements", JsonSerializer.Serialize(statements) }
{ "statements", JsonSerializer.Serialize(extraction) }
}).ConfigureAwait(false);

var faithfulness = JsonSerializer.Deserialize<IEnumerable<StatementEvaluation>>(evaluation.GetValue<string>()!);
var faithfulness = JsonSerializer.Deserialize<FaithfulnessEvaluations>(evaluation.GetValue<string>()!);

return faithfulness;
}).ConfigureAwait(false);
Expand All @@ -71,6 +76,6 @@ internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, obje

metadata.Add($"{nameof(FaithfulnessEvaluator)}-Evaluation", faithfulness);

return faithfulness.Count(c => c.Verdict > 0) / (float)statements.Count();
return faithfulness.Evaluations.Count(c => c.Verdict > 0) / (float)extraction.Statements.Count;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ internal sealed class RelevanceEvaluator : EvaluationEngine
/// <summary>
/// Prompt function built from the "Extraction"/"Question" prompt.
/// Configured with a near-zero temperature, a fixed seed of 0 and the
/// "json_object" response format.
/// </summary>
/// <remarks>
/// The getter creates a new <see cref="KernelFunction"/> on every access.
/// </remarks>
private KernelFunction ExtractQuestion
{
    get
    {
        var prompt = this.GetSKPrompt("Extraction", "Question");
        var settings = new OpenAIPromptExecutionSettings
        {
            Temperature = 1e-8f,
            Seed = 0,
            ResponseFormat = "json_object"
        };

        return this._kernel.CreateFunctionFromPrompt(prompt, settings, functionName: nameof(this.ExtractQuestion));
    }
}

public RelevanceEvaluator(Kernel kernel)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ verification: {

## Instructions

Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with output.
Given a question, an answer and a context, verify if the context was useful in arriving at the given answer. Give the verdict as "1" if useful and "0" if not. Always return your output as a valid JSON object.
Let's do it with real data.

question: {{$question}}
context: {{$context}}
answer: {{$answer}}
verification:
verification:
75 changes: 40 additions & 35 deletions applications/evaluation/Prompts/Evaluation/ContextRecall.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,67 @@
question: What can you tell me about Albert Einstein?
context: Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
classification:
[
{
"Statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
"Reason": "The date of birth of Einstein is mentioned clearly in the context.",
"Attributed": 1,
},
{
"Statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
"Reason": "The exact sentence is present in the given context.",
"Attributed": 1,
},
{
"Statement": "He published 4 papers in 1905.",
"Reason": "There is no mention about papers he wrote in the given context.",
"Attributed": 0,
},
{
"Statement": "Einstein moved to Switzerland in 1895.",
"Reason": "There is no supporting evidence for this in the given context.",
"Attributed": 0
},
]
{
"evaluations":
[
{
"Statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
"Reason": "The date of birth of Einstein is mentioned clearly in the context.",
"Attributed": 1
},
{
"Statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
"Reason": "The exact sentence is present in the given context.",
"Attributed": 1
},
{
"Statement": "He published 4 papers in 1905.",
"Reason": "There is no mention about papers he wrote in the given context.",
"Attributed": 0
},
{
"Statement": "Einstein moved to Switzerland in 1895.",
"Reason": "There is no supporting evidence for this in the given context.",
"Attributed": 0
}
]
}

question: who won 2020 icc world cup?
context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.
answer: England
classification:
[
{
"Statement": "England won the 2022 ICC Men's T20 World Cup.",
"Reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
"Attributed": 1
},
]

{
"evaluations":
[
{
"Statement": "England won the 2022 ICC Men's T20 World Cup.",
"Reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
"Attributed": 1
}
]
}

question: What is the primary fuel for the Sun?
context: NULL
answer: Hydrogen
classification:
{
"evaluations":
[
{
"Statement": "The Sun's primary fuel is hydrogen.",
"Reason": "The context contains no information",
"Attributed": 0
}
]
}

## Instructions

Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification.
Given a context and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Always return your output as a valid JSON object.

Let's do it with real data.

question: {{$question}}
context: {{$context}}
answer: {{$ground_truth}}
classification:
classification:
4 changes: 2 additions & 2 deletions applications/evaluation/Prompts/Evaluation/Correctness.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ Given a ground truth and an answer statements, analyze each statement and classi
- FP (false positive): statements present in the answer but not directly supported by any statement in ground truth,
- FN (false negative): statements found in the ground truth but not present in answer.

Each statement can only belong to one of the categories. Provide a reason for each classification.
Each statement can only belong to one of the categories. Provide a reason for each classification. Always return your output as a valid JSON object.

question: {{$question}}
answer: {{$answer}}
ground_truth: {{$ground_truth}}
extracted_statements:
extracted_statements:
Loading
Loading