Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VectorStoreIndexCreator, error fixes, tweaking, Documents QnA test #50

Merged
merged 2 commits into from
Nov 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion LangChain.Sources.slnf
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"src\\libs\\Providers\\LangChain.Providers.LLamaSharp\\LangChain.Providers.LLamaSharp.csproj",
"src\\libs\\Providers\\LangChain.Providers.HuggingFace\\LangChain.Providers.HuggingFace.csproj",
"src\\tests\\LangChain.Providers.LLamaSharp.IntegrationTests\\LangChain.Providers.LLamaSharp.IntegrationTests.csproj",
"src\\tests\\LangChain.UnitTest\\LangChain.UnitTest.csproj"
"src\\tests\\LangChain.UnitTest\\LangChain.UnitTest.csproj",
"src\\libs\\Databases\\LangChain.Databases.InMemory\\LangChain.Databases.InMemory.csproj"
]
}
}
7 changes: 7 additions & 0 deletions LangChain.sln
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LangChain.Providers.LLamaSh
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LangChain.Providers.LLamaSharp.IntegrationTests", "src\tests\LangChain.Providers.LLamaSharp.IntegrationTests\LangChain.Providers.LLamaSharp.IntegrationTests.csproj", "{B89821C6-D8E9-4D0E-9BB6-13E3C0A24D9D}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LangChain.Databases.InMemory", "src\libs\Databases\LangChain.Databases.InMemory\LangChain.Databases.InMemory.csproj", "{A71579D1-CCAF-4F4C-B2E7-01968F11484F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -316,6 +318,10 @@ Global
{B89821C6-D8E9-4D0E-9BB6-13E3C0A24D9D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B89821C6-D8E9-4D0E-9BB6-13E3C0A24D9D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B89821C6-D8E9-4D0E-9BB6-13E3C0A24D9D}.Release|Any CPU.Build.0 = Release|Any CPU
{A71579D1-CCAF-4F4C-B2E7-01968F11484F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A71579D1-CCAF-4F4C-B2E7-01968F11484F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A71579D1-CCAF-4F4C-B2E7-01968F11484F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A71579D1-CCAF-4F4C-B2E7-01968F11484F}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -369,6 +375,7 @@ Global
{302CD326-ADC3-484E-8F41-A54934A01D70} = {FDEE2E22-C239-4921-83B2-9797F765FD6A}
{955F163E-8330-4240-934E-3849F6E366C7} = {E55391DE-F8F3-4CC2-A0E3-2406C76E9C68}
{B89821C6-D8E9-4D0E-9BB6-13E3C0A24D9D} = {FDEE2E22-C239-4921-83B2-9797F765FD6A}
{A71579D1-CCAF-4F4C-B2E7-01968F11484F} = {C58D122C-808F-43F9-BB23-4E517046F533}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {5C00D0F1-6138-4ED9-846B-97E43D6DFF1C}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ public ChromaVectorStore(
HttpClient httpClient,
string endpoint,
IEmbeddings embeddings,
string collectionName = LangchainDefaultCollectionName)
string collectionName = LangchainDefaultCollectionName):base(embeddings)
{
_client = new ChromaClient(httpClient, endpoint);
Embeddings = embeddings;

_collectionName = collectionName;

_store = new ChromaMemoryStore(_client);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace LangChain.Databases.InMemory
{
/// <summary>
/// Distance measures that can be selected for the in-memory vector store.
/// </summary>
public enum EDistanceMetrics
{
    /// <summary>Euclidean distance. Declared first, so it is also the enum's default value.</summary>
    Euclidean = 0,

    /// <summary>Manhattan distance.</summary>
    Manhattan = 1,
}
}
111 changes: 111 additions & 0 deletions src/libs/Databases/LangChain.Databases.InMemory/InMemoryVectorStore.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using LangChain.Abstractions.Embeddings.Base;
using LangChain.Docstore;
using LangChain.VectorStores;

namespace LangChain.Databases.InMemory
{
/// <summary>
/// <see cref="VectorStore"/> implementation that keeps embedded documents in an in-memory list
/// and answers similarity queries by scanning that list.
/// </summary>
public class InMemoryVectorStore : VectorStore
{
    private readonly Func<float[], float[], float> _distanceFunction;

    // One entry per stored document: its embedding, the id generated for it, and the document itself.
    private readonly List<(float[] vec, string id, Document doc)> _storage = new List<(float[] vec, string id, Document doc)>();

    /// <summary>
    /// Creates an empty store.
    /// </summary>
    /// <param name="embeddings">Embedding provider; forwarded to the base class and used to embed documents and queries.</param>
    /// <param name="distanceMetrics">
    /// Distance used to rank stored vectors. <see cref="EDistanceMetrics.Euclidean"/> selects the Euclidean
    /// distance; any other value selects the Manhattan distance.
    /// </param>
    public InMemoryVectorStore(IEmbeddings embeddings, EDistanceMetrics distanceMetrics = EDistanceMetrics.Euclidean) : base(embeddings)
    {
        if (distanceMetrics == EDistanceMetrics.Euclidean)
        {
            _distanceFunction = Utils.ComputeEuclideanDistance;
        }
        else
        {
            _distanceFunction = Utils.ComputeManhattanDistance;
        }
    }

    /// <summary>
    /// Embeds the page content of every document and appends the results to the store.
    /// </summary>
    /// <param name="documents">Documents to store.</param>
    /// <param name="cancellationToken">Accepted for signature compatibility; not consulted by this method.</param>
    /// <returns>The newly generated GUID-string ids, in the same order as <paramref name="documents"/>.</returns>
    public override async Task<IEnumerable<string>> AddDocumentsAsync(IEnumerable<Document> documents, CancellationToken cancellationToken = default)
    {
        // Materialize once: the sequence is both projected for embedding and indexed below.
        var docs = documents.ToArray();

        var embeddings = await Embeddings.EmbedDocumentsAsync(docs.Select(x => x.PageContent).ToArray()).ConfigureAwait(false);

        var ids = new List<string>(docs.Length);
        for (int i = 0; i < docs.Length; i++)
        {
            var id = Guid.NewGuid().ToString();
            ids.Add(id);
            _storage.Add((embeddings[i], id, docs[i]));
        }

        return ids;
    }

    /// <summary>
    /// Wraps each text in a <see cref="Document"/> and stores it via <see cref="AddDocumentsAsync"/>.
    /// </summary>
    /// <param name="texts">Texts to store.</param>
    /// <param name="metadatas">
    /// Optional metadata, paired with <paramref name="texts"/> by position. When supplied, pairing stops
    /// at the shorter of the two sequences.
    /// </param>
    /// <param name="cancellationToken">Forwarded to <see cref="AddDocumentsAsync"/>.</param>
    /// <returns>The ids generated for the stored documents.</returns>
    public override async Task<IEnumerable<string>> AddTextsAsync(IEnumerable<string> texts, IEnumerable<Dictionary<string, object>> metadatas = null, CancellationToken cancellationToken = default)
    {
        var docs = metadatas != null
            ? texts.Zip(metadatas, (d, m) => new Document(d, m)).ToArray()
            : texts.Select(d => new Document(d)).ToArray();

        return await AddDocumentsAsync(docs, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Removes every stored entry whose id appears in <paramref name="ids"/>.
    /// </summary>
    /// <param name="ids">Ids to remove; ids that are not present are ignored.</param>
    /// <param name="cancellationToken">Accepted for signature compatibility; not consulted by this method.</param>
    /// <returns>A completed task whose result is always <c>true</c>.</returns>
    public override Task<bool> DeleteAsync(IEnumerable<string> ids, CancellationToken cancellationToken = default)
    {
        // Build the lookup once so each stored entry costs a single hash probe instead of
        // a fresh enumeration of 'ids'.
        var idSet = new HashSet<string>(ids);
        _storage.RemoveAll(s => idSet.Contains(s.id));
        return Task.FromResult(true);
    }

    /// <summary>
    /// Embeds <paramref name="query"/> and returns the documents closest to it.
    /// </summary>
    /// <param name="query">Query text.</param>
    /// <param name="k">Maximum number of documents to return.</param>
    /// <param name="cancellationToken">Forwarded to <see cref="SimilaritySearchByVectorAsync"/>.</param>
    /// <returns>Up to <paramref name="k"/> documents, nearest first.</returns>
    public override async Task<IEnumerable<Document>> SimilaritySearchAsync(string query, int k = 4, CancellationToken cancellationToken = default)
    {
        var embedding = await Embeddings.EmbedQueryAsync(query).ConfigureAwait(false);
        return await SimilaritySearchByVectorAsync(embedding, k, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the stored documents whose vectors are closest to <paramref name="embedding"/>.
    /// </summary>
    /// <param name="embedding">Query vector.</param>
    /// <param name="k">Maximum number of documents to return.</param>
    /// <param name="cancellationToken">Accepted for signature compatibility; not consulted by this method.</param>
    /// <returns>Up to <paramref name="k"/> documents, ordered by ascending distance.</returns>
    public override Task<IEnumerable<Document>> SimilaritySearchByVectorAsync(IEnumerable<float> embedding, int k = 4, CancellationToken cancellationToken = default)
    {
        var arr = embedding.ToArray();

        // Materialize immediately: a deferred query would re-sort on every enumeration and would
        // throw if the store were modified between this call and the caller's enumeration.
        IEnumerable<Document> nearest = _storage
            .OrderBy(s => _distanceFunction(s.vec, arr))
            .Take(k)
            .Select(s => s.doc)
            .ToList();

        return Task.FromResult(nearest);
    }

    /// <summary>
    /// Embeds <paramref name="query"/> and returns the closest documents together with their distances.
    /// </summary>
    /// <param name="query">Query text.</param>
    /// <param name="k">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Accepted for signature compatibility; not consulted by this method.</param>
    /// <returns>
    /// Up to <paramref name="k"/> (document, distance) pairs ordered by ascending distance.
    /// The score is the raw distance, so smaller means closer.
    /// </returns>
    public override async Task<IEnumerable<(Document, float)>> SimilaritySearchWithScoreAsync(string query,
        int k = 4, CancellationToken cancellationToken = default)
    {
        var embedding = await Embeddings.EmbedQueryAsync(query).ConfigureAwait(false);
        var arr = embedding.ToArray();

        // Rank before truncating; taking the first k entries unsorted would just return the
        // oldest k documents regardless of how close they are to the query.
        return _storage
            .Select(s => new
            {
                doc = s.doc,
                distance = _distanceFunction(s.vec, arr)
            })
            .OrderBy(d => d.distance)
            .Take(k)
            .Select(d => new ValueTuple<Document, float>(d.doc, d.distance))
            .ToList();
    }

    /// <summary>Not implemented by this store.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public override Task<IEnumerable<Document>> MaxMarginalRelevanceSearchByVector(IEnumerable<float> embedding, int k = 4, int fetchK = 20, float lambdaMult = 0.5f,
        CancellationToken cancellationToken = default)
    {
        throw new NotImplementedException();
    }

    /// <summary>Not implemented by this store.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public override Task<IEnumerable<Document>> MaxMarginalRelevanceSearch(string query, int k = 4, int fetchK = 20, float lambdaMult = 0.5f,
        CancellationToken cancellationToken = default)
    {
        throw new NotImplementedException();
    }

    /// <summary>Not implemented by this store.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    protected override Func<float, float> SelectRelevanceScoreFn()
    {
        throw new NotImplementedException();
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<Project Sdk="Microsoft.NET.Sdk">

<!-- Frameworks this library is built for. -->
<PropertyGroup>
<TargetFrameworks>net4.6.2;netstandard2.0;net6.0;net7.0</TargetFrameworks>
</PropertyGroup>

<!-- NuGet package metadata; PackageTags appends "inmemory" to whatever value is already defined. -->
<PropertyGroup Label="NuGet">
<Description>InMemory db for LangChain.</Description>
<PackageTags>$(PackageTags);inmemory</PackageTags>
</PropertyGroup>

<!-- NOTE(review): no Version attribute is given here; presumably versions are managed centrally. Confirm. -->
<!-- NOTE(review): confirm this package is actually needed by the in-memory store. -->
<ItemGroup>
<PackageReference Include="System.Net.Http" />
</ItemGroup>

<!-- Reference to the LangChain.Core project. -->
<ItemGroup>
<ProjectReference Include="..\..\LangChain.Core\LangChain.Core.csproj" />

</ItemGroup>

</Project>
30 changes: 30 additions & 0 deletions src/libs/Databases/LangChain.Databases.InMemory/Utils.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using System;

namespace LangChain.Databases.InMemory
{

/// <summary>
/// Distance functions for comparing two float vectors of equal length.
/// </summary>
public class Utils
{
    /// <summary>
    /// Computes the Euclidean distance: the square root of the sum of squared component differences.
    /// </summary>
    /// <param name="vector1">First vector.</param>
    /// <param name="vector2">Second vector; must have the same length as <paramref name="vector1"/>.</param>
    /// <returns>The distance, accumulated in double precision and narrowed to <see cref="float"/>. Zero for two empty vectors.</returns>
    /// <exception cref="ArgumentNullException">Either vector is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">The vectors differ in length.</exception>
    public static float ComputeEuclideanDistance(float[] vector1, float[] vector2)
    {
        EnsureComparable(vector1, vector2);

        double sum = 0.0;
        for (int i = 0; i < vector1.Length; i++)
        {
            // Plain multiplication avoids a Math.Pow call per component.
            double diff = vector1[i] - vector2[i];
            sum += diff * diff;
        }

        return (float)Math.Sqrt(sum);
    }

    /// <summary>
    /// Computes the Manhattan distance: the sum of absolute component differences.
    /// </summary>
    /// <param name="vector1">First vector.</param>
    /// <param name="vector2">Second vector; must have the same length as <paramref name="vector1"/>.</param>
    /// <returns>The distance, accumulated in double precision and narrowed to <see cref="float"/>. Zero for two empty vectors.</returns>
    /// <exception cref="ArgumentNullException">Either vector is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">The vectors differ in length.</exception>
    public static float ComputeManhattanDistance(float[] vector1, float[] vector2)
    {
        EnsureComparable(vector1, vector2);

        double sum = 0.0;
        for (int i = 0; i < vector1.Length; i++)
        {
            sum += Math.Abs(vector1[i] - vector2[i]);
        }

        return (float)sum;
    }

    // Rejects null or differently sized vectors up front. Without this check a shorter
    // vector1 silently ignores the tail of vector2, and a longer one fails mid-loop with
    // an IndexOutOfRangeException.
    private static void EnsureComparable(float[] vector1, float[] vector2)
    {
        if (vector1 == null)
        {
            throw new ArgumentNullException(nameof(vector1));
        }

        if (vector2 == null)
        {
            throw new ArgumentNullException(nameof(vector2));
        }

        if (vector1.Length != vector2.Length)
        {
            throw new ArgumentException(
                $"Vectors must have the same length, but got {vector1.Length} and {vector2.Length}.",
                nameof(vector2));
        }
    }
}
}
2 changes: 1 addition & 1 deletion src/libs/LangChain.Core/Base/BaseChain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public virtual async Task<string> Run(Dictionary<string, object> input)
{
var keysLengthDifferent = InputKeys.Length != input.Count;

if (!keysLengthDifferent)
if (keysLengthDifferent)
{
throw new ArgumentException($"Chain {ChainType()} expects {InputKeys.Length} but, received {input.Count}");
}
Expand Down
15 changes: 15 additions & 0 deletions src/libs/LangChain.Core/Base/ISource.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
using LangChain.Docstore;

namespace LangChain.Base;

/// <summary>
/// A source from which documents can be loaded asynchronously.
/// </summary>
public interface ISource
{
/// <summary>
/// Loads the documents provided by this source.
/// </summary>
/// <param name="cancellationToken">Token handed to the implementation for cancelling the load; defaults to <see cref="CancellationToken.None"/>.</param>
/// <returns>A task whose result is a read-only collection of the loaded documents.</returns>
Task<IReadOnlyCollection<LangChain.Docstore.Document>> LoadAsync(CancellationToken cancellationToken = default);
}
2 changes: 1 addition & 1 deletion src/libs/LangChain.Core/Base/TextSplitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@
return documents;
}

public List<Document> SplitDocuments(List<Document> documents)
public List<Document> SplitDocuments(IEnumerable<Document> documents)
{
var texts = documents.Select(doc => doc.PageContent).ToList();

Check warning on line 74 in src/libs/LangChain.Core/Base/TextSplitter.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Possible multiple enumerations of 'IEnumerable' collection. Consider using an implementation that avoids multiple enumerations. (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1851)
var metadatas = documents.Select(doc => doc.Metadata).ToList();

Check warning on line 75 in src/libs/LangChain.Core/Base/TextSplitter.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Possible multiple enumerations of 'IEnumerable' collection. Consider using an implementation that avoids multiple enumerations. (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1851)

Check warning on line 75 in src/libs/LangChain.Core/Base/TextSplitter.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Possible multiple enumerations of 'IEnumerable' collection. Consider using an implementation that avoids multiple enumerations. (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1851)

return CreateDocuments(texts, metadatas);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public override async Task<IChainValues> CallAsync(IChainValues values)
.Where(kv => kv.Key != InputKey)
.ToDictionary(kv => kv.Key, kv => kv.Value);

var (output, returnDict) = await CombineDocsAsync(docs as List<Document>, otherKeys);
var (output, returnDict) = await CombineDocsAsync((docs as List<Document>), otherKeys);

returnDict[OutputKey] = output;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ public override async Task<IChainValues> CallAsync(IChainValues values)
{
var question = values.Value[_inputKey].ToString();

var docs = await GetDocsAsync(question);
var docs = (await GetDocsAsync(question)).ToList();

var input = new Dictionary<string, object>
{
["input_documents"] = docs,
["question"] = question
[fields.DocumentsKey] = docs,
[_inputKey] = question
};

var answer = await _combineDocumentsChain.Run(input);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ public class BaseRetrievalQaChainInput(BaseCombineDocumentsChain combineDocument
/// <summary> Return the source documents or not. </summary>
public bool ReturnSourceDocuments { get; set; }

public string InputKey { get; set; } = "input_documents";
public string InputKey { get; set; } = "question";
public string DocumentsKey { get; set; } = "input_documents";
public string OutputKey { get; set; } = "output_text";
public bool? Verbose { get; set; }
public CallbackManager? CallbackManager { get; set; }
Expand Down
7 changes: 6 additions & 1 deletion src/libs/LangChain.Core/Docstore/Document.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,17 @@
/// </summary>
public class Document
{
public Document(string content, Dictionary<string, object> metadata)
public Document(string content, Dictionary<string, object>? metadata=null)
{
metadata ??= new Dictionary<string, object>();
PageContent = content;
Metadata = metadata;
}

public static Document Empty { get; } = new(
content: string.Empty,
metadata: new Dictionary<string, object>());

public string PageContent { get; set; }
public int LookupIndex { get; set; }
public string LookupStr { get; set; }
Expand All @@ -27,7 +32,7 @@
/// </summary>
public List<string> Paragraphs()
{
return PageContent.Split(new []{"\n\n"},StringSplitOptions.None).ToList();

Check warning on line 35 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Prefer 'static readonly' fields over constant array arguments if the called method is called repeatedly and is not mutating the passed array (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1861)

Check warning on line 35 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Prefer 'static readonly' fields over constant array arguments if the called method is called repeatedly and is not mutating the passed array (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1861)
}
/// <summary>
/// Summary of the page (the first paragraph)
Expand All @@ -43,9 +48,9 @@
public string Lookup(string searchString)
{
// if there is a new search string, reset the index
if (searchString.ToLower(CultureInfo.InvariantCulture) != LookupStr)

Check warning on line 51 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Prefer using 'string.Equals(string, StringComparison)' to perform a case-insensitive comparison, but keep in mind that this might cause subtle changes in behavior, so make sure to conduct thorough testing after applying the suggestion, or if culturally sensitive comparison is not required, consider using 'StringComparison.OrdinalIgnoreCase' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1862)

Check warning on line 51 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

In method 'Lookup', replace the call to 'ToLower' with 'ToUpperInvariant' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1308)
{
LookupStr = searchString.ToLower(CultureInfo.InvariantCulture);

Check warning on line 53 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

In method 'Lookup', replace the call to 'ToLower' with 'ToUpperInvariant' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1308)
LookupIndex = 0;
}
else
Expand All @@ -54,7 +59,7 @@
}

// get all the paragraphs that contain the search string
var lookups = Paragraphs().Where(p => p.ToLower(CultureInfo.InvariantCulture).Contains(LookupStr)).ToList();

Check warning on line 62 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

Prefer the string comparison method overload of 'string.Contains(string)' that takes a 'StringComparison' enum value to perform a case-insensitive comparison, but keep in mind that this might cause subtle changes in behavior, so make sure to conduct thorough testing after applying the suggestion, or if culturally sensitive comparison is not required, consider using 'StringComparison.OrdinalIgnoreCase' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1862)

Check warning on line 62 in src/libs/LangChain.Core/Docstore/Document.cs

View workflow job for this annotation

GitHub Actions / Build abd test / Build, test and publish

In method 'Lookup', replace the call to 'ToLower' with 'ToUpperInvariant' (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1308)

if (lookups.Count == 0)
{
Expand Down
46 changes: 46 additions & 0 deletions src/libs/LangChain.Core/Indexes/VectorStoreIndexCreator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
using LangChain.Abstractions.Embeddings.Base;
using LangChain.Base;
using LangChain.Docstore;
using LangChain.VectorStores;

namespace LangChain.Indexes;

/// <summary>
/// Logic for creating a vectorstore index.
/// </summary>
public class VectorStoreIndexCreator
{
    /// <summary>Vector store that receives the split documents.</summary>
    public VectorStore VectorStore { get; }

    /// <summary>Splitter applied to documents before they are added to <see cref="VectorStore"/>.</summary>
    public TextSplitter TextSplitter { get; }

    /// <summary>
    /// Creates an index creator over the given store and splitter.
    /// </summary>
    /// <param name="vectorStore">Store that documents are added to.</param>
    /// <param name="textSplitter">Splitter used to break documents up before they are stored.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    // embeddings are not needed here because VectorStore already has them
    public VectorStoreIndexCreator(VectorStore vectorStore, TextSplitter textSplitter)
    {
        // Fail at construction rather than with a NullReferenceException on first use.
        VectorStore = vectorStore ?? throw new ArgumentNullException(nameof(vectorStore));
        TextSplitter = textSplitter ?? throw new ArgumentNullException(nameof(textSplitter));
    }

    /// <summary>
    /// Create a vectorstore index from loaders.
    /// </summary>
    /// <param name="loaders">Loaders whose documents are collected, in order, and then indexed.</param>
    /// <returns>A wrapper around <see cref="VectorStore"/> after the loaded documents have been added.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="loaders"/> is <c>null</c>.</exception>
    public async Task<VectorStoreIndexWrapper> FromLoaders(List<BaseLoader> loaders)
    {
        if (loaders == null)
        {
            throw new ArgumentNullException(nameof(loaders));
        }

        List<Document> documents = new();
        foreach (var loader in loaders)
        {
            documents.AddRange(loader.Load());
        }

        return await FromDocumentsAsync(documents).ConfigureAwait(false);
    }

    /// <summary>
    /// Create a vectorstore index from documents.
    /// </summary>
    /// <param name="documents">Documents to split with <see cref="TextSplitter"/> and add to <see cref="VectorStore"/>.</param>
    /// <returns>A wrapper around <see cref="VectorStore"/> after the split documents have been added.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="documents"/> is <c>null</c>.</exception>
    public async Task<VectorStoreIndexWrapper> FromDocumentsAsync(List<Document> documents)
    {
        if (documents == null)
        {
            throw new ArgumentNullException(nameof(documents));
        }

        var subDocs = TextSplitter.SplitDocuments(documents);
        await VectorStore.AddDocumentsAsync(subDocs).ConfigureAwait(false);
        return new VectorStoreIndexWrapper(VectorStore);
    }
}
Loading