microsoft · dluc · Sep 15, 2024 · Sep 15, 2024
@@ -20,11 +20,15 @@ internal sealed class AnswerCorrectnessEvaluator : EvaluationEngine
  private KernelFunction ExtractStatements => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Statements"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  }, functionName: nameof(this.ExtractStatements));
 
  private KernelFunction EvaluateCorrectness => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "Correctness"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  }, functionName: nameof(this.EvaluateCorrectness));
 
  public AnswerCorrectnessEvaluator(Kernel kernel)
@@ -42,7 +46,7 @@ internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Di
  { "answer", answer.Result }
  }).ConfigureAwait(false);
 
- return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
+ return JsonSerializer.Deserialize<StatementExtraction>(extraction.GetValue<string>()!);
  }).ConfigureAwait(false);
 
  if (statements is null)

@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+#pragma warning disable IDE0130 // reduce number of "using" statements
+// ReSharper disable CheckNamespace
+using System.Collections.Generic;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
+
+#pragma warning disable CA1812 // 'StatementExtraction' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+internal sealed class StatementExtraction
+#pragma warning restore CA1812 // 'StatementExtraction' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+{
+ [JsonPropertyName("statements")]
+ public List<string> Statements { get; set; } = new List<string>();
+}
@@ -20,6 +20,8 @@ internal sealed class ContextRecallEvaluator : EvaluationEngine
  private KernelFunction EvaluateContextRecall => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "ContextRecall"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  }, functionName: nameof(this.EvaluateContextRecall));
 
  public ContextRecallEvaluator(Kernel kernel)
@@ -29,7 +31,7 @@ public ContextRecallEvaluator(Kernel kernel)
 
  internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Dictionary<string, object?> metadata)
  {
- var evaluations = await this.Try(3, async (remainingTry) =>
+ var classification = await this.Try(3, async (remainingTry) =>
  {
  var extraction = await this.EvaluateContextRecall.InvokeAsync(this._kernel, new KernelArguments
  {
@@ -38,16 +40,16 @@ internal async Task<float> Evaluate(TestSetItem testSet, MemoryAnswer answer, Di
  { "ground_truth", testSet.GroundTruth }
  }).ConfigureAwait(false);
 
- return JsonSerializer.Deserialize<IEnumerable<GroundTruthClassification>>(extraction.GetValue<string>()!);
+ return JsonSerializer.Deserialize<GroundTruthClassifications>(extraction.GetValue<string>()!);
  }).ConfigureAwait(false);
 
- if (evaluations is null)
+ if (classification is null)
  {
  return 0;
  }
 
- metadata.Add($"{nameof(ContextRecallEvaluator)}-Evaluation", evaluations);
+ metadata.Add($"{nameof(ContextRecallEvaluator)}-Evaluation", classification);
 
- return (float)evaluations.Count(c => c.Attributed > 0) / (float)evaluations.Count();
+ return (float)classification.Evaluations.Count(c => c.Attributed > 0) / (float)classification.Evaluations.Count;
  }
 }
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+#pragma warning disable IDE0130 // reduce number of "using" statements
+// ReSharper disable CheckNamespace
+using System.Collections.Generic;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.KernelMemory.Evaluators.ContextRecall;
+
+#pragma warning disable CA1812 // 'GroundTruthClassifications' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+internal sealed class GroundTruthClassifications
+#pragma warning restore CA1812 // 'GroundTruthClassifications' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+{
+ [JsonPropertyName("evaluations")]
+ public List<GroundTruthClassification> Evaluations { get; set; } = new();
+}
@@ -19,6 +19,8 @@ internal sealed class ContextRelevancyEvaluator : EvaluationEngine
  private KernelFunction EvaluateContext => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "ContextPrecision"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  });
 
  public ContextRelevancyEvaluator(Kernel kernel)

@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+#pragma warning disable IDE0130 // reduce number of "using" statements
+// ReSharper disable CheckNamespace
+using System.Collections.Generic;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.KernelMemory.Evaluators.Faithfulness;
+
+#pragma warning disable CA1812 // 'FaithfulnessEvaluations' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+internal sealed class FaithfulnessEvaluations
+#pragma warning restore CA1812 // 'FaithfulnessEvaluations' is an internal class that is apparently never instantiated. If so, remove the code from the assembly. If this class is intended to contain only static members, make it 'static' (Module in Visual Basic). (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1812)
+{
+ [JsonPropertyName("evaluations")]
+ public List<StatementEvaluation> Evaluations { get; set; } = new();
+}
@@ -6,6 +6,7 @@
 using System.Text.Json;
 using System.Threading.Tasks;
 using Microsoft.KernelMemory.Evaluation;
+using Microsoft.KernelMemory.Evaluators.AnswerCorrectness;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 
@@ -20,11 +21,15 @@ internal sealed class FaithfulnessEvaluator : EvaluationEngine
  private KernelFunction ExtractStatements => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Statements"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  }, functionName: nameof(this.ExtractStatements));
 
  private KernelFunction FaithfulnessEvaluation => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Evaluation", "Faithfulness"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  }, functionName: nameof(this.FaithfulnessEvaluation));
 
  public FaithfulnessEvaluator(Kernel kernel)
@@ -34,18 +39,18 @@ public FaithfulnessEvaluator(Kernel kernel)
 
  internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, object?> metadata)
  {
- var statements = await this.Try(3, async (remainingTry) =>
+ var extraction = await this.Try(3, async (remainingTry) =>
  {
  var extraction = await this.ExtractStatements.InvokeAsync(this._kernel, new KernelArguments
  {
  { "question", answer.Question },
  { "answer", answer.Result }
  }).ConfigureAwait(false);
 
- return JsonSerializer.Deserialize<IEnumerable<string>>(extraction.GetValue<string>()!);
+ return JsonSerializer.Deserialize<StatementExtraction>(extraction.GetValue<string>()!);
  }).ConfigureAwait(false);
 
- if (statements is null)
+ if (extraction is null)
  {
  return 0;
  }
@@ -56,10 +61,10 @@ internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, obje
  {
  { "context", string.Join(Environment.NewLine, answer.RelevantSources.SelectMany(c => c.Partitions.Select(p => p.Text))) },
  { "answer", answer.Result },
- { "statements", JsonSerializer.Serialize(statements) }
+ { "statements", JsonSerializer.Serialize(extraction) }
  }).ConfigureAwait(false);
 
- var faithfulness = JsonSerializer.Deserialize<IEnumerable<StatementEvaluation>>(evaluation.GetValue<string>()!);
+ var faithfulness = JsonSerializer.Deserialize<FaithfulnessEvaluations>(evaluation.GetValue<string>()!);
 
  return faithfulness;
  }).ConfigureAwait(false);
@@ -71,6 +76,6 @@ internal async Task<float> Evaluate(MemoryAnswer answer, Dictionary<string, obje
 
  metadata.Add($"{nameof(FaithfulnessEvaluator)}-Evaluation", faithfulness);
 
- return faithfulness.Count(c => c.Verdict > 0) / (float)statements.Count();
+ return faithfulness.Evaluations.Count(c => c.Verdict > 0) / (float)extraction.Statements.Count;
  }
 }
@@ -25,6 +25,8 @@ internal sealed class RelevanceEvaluator : EvaluationEngine
  private KernelFunction ExtractQuestion => this._kernel.CreateFunctionFromPrompt(this.GetSKPrompt("Extraction", "Question"), new OpenAIPromptExecutionSettings
  {
  Temperature = 1e-8f,
+ Seed = 0,
+ ResponseFormat = "json_object"
  }, functionName: nameof(this.ExtractQuestion));
 
  public RelevanceEvaluator(Kernel kernel)

@@ -26,10 +26,10 @@ verification: {
 
 ## Instructions
 
-Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with output.
+Given question, answer and context, verify if the context was useful in arriving at the given answer. Give the verdict as "1" if useful and "0" if not. Your output must always be valid JSON.
 Let's do it with real data.
 
 question: {{$question}}
 context: {{$context}}
 answer: {{$answer}}
-verification: 
+verification: 
@@ -3,62 +3,67 @@
 question: What can you tell me about Albert Einstein?
 context: Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
 answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
-classification: 
- [
- {
- "Statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
- "Reason": "The date of birth of Einstein is mentioned clearly in the context.",
- "Attributed": 1,
- },
- {
- "Statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
- "Reason": "The exact sentence is present in the given context.",
- "Attributed": 1,
- },
- {
- "Statement": "He published 4 papers in 1905.",
- "Reason": "There is no mention about papers he wrote in the given context.",
- "Attributed": 0,
- },
- {
- "Statement": "Einstein moved to Switzerland in 1895.",
- "Reason": "There is no supporting evidence for this in the given context.",
- "Attributed": 0
- },
- ]
+{
+ "evaluations":
+ [
+ {
+ "Statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
+ "Reason": "The date of birth of Einstein is mentioned clearly in the context.",
+ "Attributed": 1
+ },
+ {
+ "Statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
+ "Reason": "The exact sentence is present in the given context.",
+ "Attributed": 1
+ },
+ {
+ "Statement": "He published 4 papers in 1905.",
+ "Reason": "There is no mention about papers he wrote in the given context.",
+ "Attributed": 0
+ },
+ {
+ "Statement": "Einstein moved to Switzerland in 1895.",
+ "Reason": "There is no supporting evidence for this in the given context.",
+ "Attributed": 0
+ }
+ ]
+}
 
 question: who won 2020 icc world cup?
 context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.
 answer: England
-classification: 
- [
- {
- "Statement": "England won the 2022 ICC Men's T20 World Cup.",
- "Reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
- "Attributed": 1
- },
- ]
-
+{
+ "evaluations":
+ [
+ {
+ "Statement": "England won the 2022 ICC Men's T20 World Cup.",
+ "Reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
+ "Attributed": 1
+ }
+ ]
+}
 
 question: What is the primary fuel for the Sun?
 context: NULL
 answer: Hydrogen
-classification:
+{
+ "evaluations":
  [
  {
  "Statement": "The Sun's primary fuel is hydrogen.",
  "Reason": "The context contains no information",
  "Attributed": 0
  }
  ]
+}
 
 ## Instructions
 
-Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification.
+Given a context and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Your output must always be valid JSON.
 
 Let's do it with real data.
 
 question: {{$question}}
 context: {{$context}}
 answer: {{$ground_truth}}
-classification: 
+classification: 
@@ -84,9 +84,9 @@ Given a ground truth and an answer statements, analyze each statement and classi
 - FP (false positive): statements present in the answer but not directly supported by any statement in ground truth,
 - FN (false negative): statements found in the ground truth but not present in answer.
 
-Each statement can only belong to one of the categories. Provide a reason for each classification.
+Each statement can only belong to one of the categories. Provide a reason for each classification. Your output must always be valid JSON.
 
 question: {{$question}}
 answer: {{$answer}}
 ground_truth: {{$ground_truth}}
-extracted_statements: 
+extracted_statements: