diff --git a/src/dotnet/Common/Common.csproj b/src/dotnet/Common/Common.csproj index 3bc49d15c9..5a8eace780 100644 --- a/src/dotnet/Common/Common.csproj +++ b/src/dotnet/Common/Common.csproj @@ -10,7 +10,9 @@ + + diff --git a/src/dotnet/Common/Constants/AppConfigurationKeys.cs b/src/dotnet/Common/Constants/AppConfigurationKeys.cs index c25489c124..dac3cffce8 100644 --- a/src/dotnet/Common/Constants/AppConfigurationKeys.cs +++ b/src/dotnet/Common/Constants/AppConfigurationKeys.cs @@ -630,9 +630,13 @@ public static class AppConfigurationKeys /// public const string FoundationaLLM_SemanticKernelAPI_OpenAI_ShortSummaryPromptName = "FoundationaLLM:SemanticKernelAPI:OpenAI.ShortSummaryPromptName"; /// - /// The key for the FoundationaLLM:Vectorization:WorkerSettings app configuration setting. + /// The key section for the FoundationaLLM:Vectorization:ContentSourceManagerService app configuration setting. /// - public const string FoundationaLLM_Vectorization_WorkerSettings = "FoundationaLLM:Vectorization:WorkerSettings"; + public const string FoundationaLLM_Vectorization_ContentSourceManagerService = "FoundationaLLM:Vectorization:ContentSourceManagerService"; + /// + /// The key section for the FoundationaLLM:Vectorization:VectorizationWorker app configuration setting. + /// + public const string FoundationaLLM_Vectorization_VectorizationWorker = "FoundationaLLM:Vectorization:VectorizationWorker"; } /// @@ -780,16 +784,20 @@ public static class AppConfigurationKeySections /// public const string FoundationaLLM_BlobStorageMemorySource = "FoundationaLLM:BlobStorageMemorySource"; /// - /// The key section for the FoundationaLLM:Vectorization:Queues app configuration settings. + /// The key section for the FoundationaLLM:Vectorization:ContentSources app configuration settings. /// - public const string FoundationaLLM_Vectorization_Queues = "FoundationaLLM:Vectorization:Queues"; + public const string FoundationaLLM_Vectorization_ContentSources = "FoundationaLLM:Vectorization:ContentSources"; /// - /// The key section for the FoundationaLLM:Vectorization:StateServiceSettings app configuration settings. + /// The key section for the FoundationaLLM:Vectorization:Steps app configuration settings. /// - public const string FoundationaLLM_Vectorization_StateServiceSettings = "FoundationaLLM:Vectorization:StateServiceSettings"; + public const string FoundationaLLM_Vectorization_Steps = "FoundationaLLM:Vectorization:Steps"; + /// + /// The key section for the FoundationaLLM:Vectorization:Queues app configuration settings. + /// + public const string FoundationaLLM_Vectorization_Queues = "FoundationaLLM:Vectorization:Queues"; /// - /// The key section for the FoundationaLLM:Vectorization:WorkerSettings app configuration settings. + /// The key section for the FoundationaLLM:Vectorization:StateService app configuration settings. /// - public const string FoundationaLLM_Vectorization_WorkerSettings = "FoundationaLLM:Vectorization:WorkerSettings"; + public const string FoundationaLLM_Vectorization_StateService = "FoundationaLLM:Vectorization:StateService"; } } diff --git a/src/dotnet/Common/Constants/FileExtensions.cs b/src/dotnet/Common/Constants/FileExtensions.cs new file mode 100644 index 0000000000..56d24be06d --- /dev/null +++ b/src/dotnet/Common/Constants/FileExtensions.cs @@ -0,0 +1,43 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Constants +{ + /// + /// Name constants used to identify file extension. + /// + public static class FileExtensions + { + /// + /// File extension for text files. + /// + public const string Text = ".txt"; + /// + /// File extension for JSON files. + /// + public const string JSON = ".json"; + /// + /// File extension for Markdown files. + /// + public const string Markdown = ".md"; + /// + /// File extension for Microsoft Office Word files. + /// + public const string Word = ".docx"; + /// + /// File extension for Microsoft Office PowerPoint files. + /// + public const string PowerPoint = ".pptx"; + /// + /// File extension for Microsoft Office Excel files. + /// + public const string Excel = ".xlsx"; + /// + /// File extension for PDF files. + /// + public const string PDF = ".pdf"; + } +} diff --git a/src/dotnet/Common/Exceptions/ConfigurationValueException.cs b/src/dotnet/Common/Exceptions/ConfigurationValueException.cs new file mode 100644 index 0000000000..d09f219bc4 --- /dev/null +++ b/src/dotnet/Common/Exceptions/ConfigurationValueException.cs @@ -0,0 +1,38 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Exceptions +{ + /// + /// Represents an error with a configuration value. + /// + public class ConfigurationValueException : Exception + { + /// + /// Initializes a new instance of the class with a default message. + /// + public ConfigurationValueException() + { + } + + /// + /// Initializes a new instance of the class with its message set to . + /// + /// A string that describes the error. + public ConfigurationValueException(string? message) : base(message) + { + } + + /// + /// Initializes a new instance of the class with its message set to . + /// + /// A string that describes the error. + /// The exception that is the cause of the current exception. + public ConfigurationValueException(string? message, Exception? innerException) : base(message, innerException) + { + } + } +} diff --git a/src/dotnet/Common/Exceptions/ContentException.cs b/src/dotnet/Common/Exceptions/ContentException.cs new file mode 100644 index 0000000000..e1fc7a3e2e --- /dev/null +++ b/src/dotnet/Common/Exceptions/ContentException.cs @@ -0,0 +1,38 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Exceptions +{ + /// + /// Represents an error with accessing content. + /// + public class ContentException : Exception + { + /// + /// Initializes a new instance of the class with a default message. + /// + public ContentException() + { + } + + /// + /// Initializes a new instance of the class with its message set to . + /// + /// A string that describes the error. + public ContentException(string? message) : base(message) + { + } + + /// + /// Initializes a new instance of the class with its message set to . + /// + /// A string that describes the error. + /// The exception that is the cause of the current exception. + public ContentException(string? message, Exception? innerException) : base(message, innerException) + { + } + } +} diff --git a/src/dotnet/Common/Interfaces/IStorageService.cs b/src/dotnet/Common/Interfaces/IStorageService.cs index ec9f9df689..becfe899c7 100644 --- a/src/dotnet/Common/Interfaces/IStorageService.cs +++ b/src/dotnet/Common/Interfaces/IStorageService.cs @@ -11,5 +11,42 @@ namespace FoundationaLLM.Common.Interfaces /// public interface IStorageService { + /// + /// Reads the binary content of a specified file from the storage. + /// + /// The name of the container where the file is located. + /// The path of the file to read. + /// The cancellation token that signals that operations should be cancelled. + /// The binary content of the file. + Task ReadFileAsync(string containerName, string filePath, CancellationToken cancellationToken); + + /// + /// Writes the binary content to a specified file from the storage. + /// + /// The name of the container where the file is located. + /// The path of the file to read. + /// The binary content written to the file. + /// The cancellation token that signals that operations should be cancelled. + /// + Task WriteFileAsync(string containerName, string filePath, Stream fileContent, CancellationToken cancellationToken); + + /// + /// Writes the string content to a specified file from the storage. + /// + /// The name of the container where the file is located. + /// The path of the file to read. + /// The string content written to the file. + /// The cancellation token that signals that operations should be cancelled. + /// + Task WriteFileAsync(string containerName, string filePath, string fileContent, CancellationToken cancellationToken); + + /// + /// Checks if a file exists on the storage. + /// + /// The name of the container where the file is located. + /// The path of the file to read. + /// The cancellation token that signals that operations should be cancelled. + /// + Task FileExistsAsync(string containerName, string filePath, CancellationToken cancellationToken); } } diff --git a/src/dotnet/Common/Models/ModelRegistry.cs b/src/dotnet/Common/Models/ModelRegistry.cs index 6e2d5366ef..f430142449 100644 --- a/src/dotnet/Common/Models/ModelRegistry.cs +++ b/src/dotnet/Common/Models/ModelRegistry.cs @@ -11,15 +11,15 @@ public class ModelRegistry /// /// Dictionary of model names and their corresponding entries in the registry. /// - public static Dictionary Models = new Dictionary - { + public static readonly Dictionary Models = new() + { { nameof(Customer), new ModelRegistryEntry { Type = typeof(Customer), - TypeMatchingProperties = new List { "customerId", "firstName" }, - NamingProperties = new List { "firstName", "lastName" }, + TypeMatchingProperties = ["customerId", "firstName"], + NamingProperties = ["firstName", "lastName"], } }, { @@ -27,8 +27,8 @@ public class ModelRegistry new ModelRegistryEntry { Type = typeof(Product), - TypeMatchingProperties = new List { "sku" }, - NamingProperties = new List { "name" } + TypeMatchingProperties = ["sku"], + NamingProperties = ["name"] } }, { @@ -36,8 +36,8 @@ public class ModelRegistry new ModelRegistryEntry { Type = typeof(SalesOrder), - TypeMatchingProperties = new List { "orderDate", "shipDate" }, - NamingProperties = new List { "id" } + TypeMatchingProperties = ["orderDate", "shipDate"], + NamingProperties = ["id"] } }, { @@ -45,8 +45,8 @@ public class ModelRegistry new ModelRegistryEntry { Type = typeof(ShortTermMemory), - TypeMatchingProperties = new List { "memory__" }, - NamingProperties = new List() + TypeMatchingProperties = ["memory__"], + NamingProperties = [] } } }; @@ -61,7 +61,7 @@ public class ModelRegistry var result = ModelRegistry .Models .Select(m => m.Value) - .SingleOrDefault(x => objProps.Intersect(x.TypeMatchingProperties!).Count() == x.TypeMatchingProperties!.Count()); + .SingleOrDefault(x => objProps.Intersect(x.TypeMatchingProperties!).Count() == x.TypeMatchingProperties!.Count); return result; } diff --git a/src/dotnet/Common/Services/BlobStorageService.cs b/src/dotnet/Common/Services/BlobStorageService.cs new file mode 100644 index 0000000000..9c561ae2eb --- /dev/null +++ b/src/dotnet/Common/Services/BlobStorageService.cs @@ -0,0 +1,114 @@ +using Azure; +using Azure.Identity; +using Azure.Storage; +using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Models; +using FoundationaLLM.Common.Exceptions; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Settings; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using System.Text; + +namespace FoundationaLLM.Common.Services +{ + /// + /// Provides access to Azure blob storage. + /// + /// + /// Initializes a new instance of the with the specified options and logger. + /// + /// The options object containing the object with the settings. + /// The logger used for logging. +#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable. + public class BlobStorageService( +#pragma warning restore CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable. + IOptions options, + ILogger logger) : StorageServiceBase(options, logger), IStorageService + { + private BlobServiceClient _blobServiceClient; + + /// + public async Task ReadFileAsync( + string containerName, + string filePath, + CancellationToken cancellationToken = default) + { + var containerClient = _blobServiceClient.GetBlobContainerClient(containerName); + var blobClient = containerClient.GetBlobClient(filePath); + + try + { + Response? content = await blobClient.DownloadContentAsync(cancellationToken).ConfigureAwait(false); + + if (content != null && content.HasValue) + { + return content.Value.Content; + } + + throw new ContentException($"Cannot read file {filePath} from container {containerName}."); + } + catch (RequestFailedException e) when (e.Status == 404) + { + _logger.LogWarning("File not found: {FilePath}", filePath); + throw new ContentException("File not found.", e); + } + } + + /// + public async Task WriteFileAsync( + string containerName, + string filePath, + Stream fileContent, + CancellationToken cancellationToken = default) + { + var containerClient = _blobServiceClient.GetBlobContainerClient(containerName); + var blobClient = containerClient.GetBlobClient(filePath); + + fileContent.Seek(0, SeekOrigin.Begin); + + BlobUploadOptions options = new(); + await blobClient.UploadAsync(fileContent, options, cancellationToken).ConfigureAwait(false); + } + + /// + public async Task WriteFileAsync( + string containerName, + string filePath, + string fileContent, + CancellationToken cancellationToken = default) => + await WriteFileAsync( + containerName, + filePath, + new MemoryStream(Encoding.UTF8.GetBytes(fileContent)), + cancellationToken).ConfigureAwait(false); + + /// + public async Task FileExistsAsync( + string containerName, + string filePath, + CancellationToken cancellationToken = default) + { + var containerClient = _blobServiceClient.GetBlobContainerClient(containerName); + var blobClient = containerClient.GetBlobClient(filePath); + + return await blobClient.ExistsAsync(cancellationToken).ConfigureAwait(false); + } + + /// + protected override void CreateClientFromAccountKey(string accountName, string accountKey) => + _blobServiceClient = new BlobServiceClient( + new Uri($"https://{accountName}.dfs.core.windows.net"), + new StorageSharedKeyCredential(accountName, accountKey)); + + /// + protected override void CreateClientFromConnectionString(string connectionString) => + _blobServiceClient = new BlobServiceClient(connectionString); + + /// + protected override void CreateClientFromIdentity(string accountName) => + _blobServiceClient = new BlobServiceClient( + new Uri($"https://{accountName}.dfs.core.windows.net"), + new DefaultAzureCredential()); + } +} diff --git a/src/dotnet/Common/Services/DataLakeStorageService.cs b/src/dotnet/Common/Services/DataLakeStorageService.cs index 82495f4f33..9ab4785bd1 100644 --- a/src/dotnet/Common/Services/DataLakeStorageService.cs +++ b/src/dotnet/Common/Services/DataLakeStorageService.cs @@ -1,4 +1,9 @@ -using FoundationaLLM.Common.Interfaces; +using Azure; +using Azure.Identity; +using Azure.Storage; +using Azure.Storage.Files.DataLake; +using FoundationaLLM.Common.Exceptions; +using FoundationaLLM.Common.Interfaces; using FoundationaLLM.Common.Settings; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; @@ -9,16 +14,87 @@ namespace FoundationaLLM.Common.Services /// Provides access to Azure Data Lake blob storage. /// /// - /// Initializes a new instance of the with the specified options and logger. + /// Initializes a new instance of the with the specified options and logger. /// /// The options object containing the object with the settings. /// The logger used for logging. +#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable. public class DataLakeStorageService( +#pragma warning restore CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable. IOptions options, - ILogger logger) : IStorageService + ILogger logger) : StorageServiceBase(options, logger), IStorageService { -#pragma warning disable IDE0052 // Remove unread private members - private readonly BlobStorageServiceSettings _settings = options.Value; - private readonly ILogger _logger = logger; + private DataLakeServiceClient _dataLakeClient; + + /// + public async Task ReadFileAsync( + string containerName, + string filePath, + CancellationToken cancellationToken = default) + { + var fileSystemClient = _dataLakeClient.GetFileSystemClient(containerName); + var fileClient = fileSystemClient.GetFileClient(filePath); + + try + { + var memoryStream = new MemoryStream(); + var result = await fileClient.ReadToAsync(memoryStream, null, cancellationToken).ConfigureAwait(false); + + if (result.IsError) + throw new ContentException($"Cannot read file {filePath} from file system {containerName}."); + + memoryStream.Seek(0, SeekOrigin.Begin); + return BinaryData.FromStream(memoryStream); + } + catch (RequestFailedException e) when (e.Status == 404) + { + _logger.LogWarning("File not found: {FilePath}", filePath); + throw new ContentException("File not found.", e); + } + } + + /// + public Task WriteFileAsync( + string containerName, + string filePath, + Stream fileContent, + CancellationToken cancellationToken) => + throw new NotImplementedException(); + + /// + public Task WriteFileAsync( + string containerName, + string filePath, + string fileContent, + CancellationToken cancellationToken) => + throw new NotImplementedException(); + + /// + public async Task FileExistsAsync( + string containerName, + string filePath, + CancellationToken cancellationToken = default) + { + var fileSystemClient = _dataLakeClient.GetFileSystemClient(containerName); + var fileClient = fileSystemClient.GetFileClient(filePath); + + return await fileClient.ExistsAsync(cancellationToken).ConfigureAwait(false); + } + + /// + protected override void CreateClientFromAccountKey(string accountName, string accountKey) => + _dataLakeClient = new DataLakeServiceClient( + new Uri($"https://{accountName}.dfs.core.windows.net"), + new StorageSharedKeyCredential(accountName, accountKey)); + + /// + protected override void CreateClientFromConnectionString(string connectionString) => + _dataLakeClient = new DataLakeServiceClient(connectionString); + + /// + protected override void CreateClientFromIdentity(string accountName) => + _dataLakeClient = new DataLakeServiceClient( + new Uri($"https://{accountName}.dfs.core.windows.net"), + new DefaultAzureCredential()); } } diff --git a/src/dotnet/Common/Services/StorageServiceBase.cs b/src/dotnet/Common/Services/StorageServiceBase.cs new file mode 100644 index 0000000000..f25112188a --- /dev/null +++ b/src/dotnet/Common/Services/StorageServiceBase.cs @@ -0,0 +1,107 @@ +using Azure.Identity; +using Azure.Storage.Files.DataLake; +using Azure.Storage; +using Azure; +using FoundationaLLM.Common.Exceptions; +using FoundationaLLM.Common.Settings; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using System; +using System.Collections.Generic; +using System.ComponentModel; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Services +{ + /// + /// Provides access to storage. + /// + public abstract class StorageServiceBase + { + private readonly BlobStorageServiceSettings _settings; + /// + /// The logger used for logging. + /// + protected readonly ILogger _logger; + + /// + /// Initializes a new instance of the with the specified options and logger. + /// + /// The options object containing the object with the settings. + /// The logger used for logging. + public StorageServiceBase( + IOptions options, + ILogger logger) + { + _settings = options.Value; + _logger = logger; + + switch (_settings.AuthenticationType) + { + case BlobStorageAuthenticationTypes.ConnectionString: + ValidateConnectionString(_settings.ConnectionString); + CreateClientFromConnectionString(_settings.ConnectionString!); + break; + case BlobStorageAuthenticationTypes.AccountKey: + ValidateAccountName(_settings.AccountName); + ValidateAccountKey(_settings.AccountKey); + CreateClientFromAccountKey(_settings.AccountName!, _settings.AccountKey!); + break; + case BlobStorageAuthenticationTypes.AzureIdentity: + ValidateAccountName(_settings.AccountName); + CreateClientFromIdentity(_settings.AccountName!); + break; + default: + throw new InvalidEnumArgumentException($"The authentication type {_settings.AuthenticationType} is not supported."); + } + } + + /// + /// Creates a storage client from a connection string. + /// + /// The storage connection string. + protected abstract void CreateClientFromConnectionString(string connectionString); + + /// + /// Creates a storage client from an account name and an account key. + /// + /// The storage account name. + /// The storage account key. + protected abstract void CreateClientFromAccountKey(string accountName, string accountKey); + + /// + /// Create a storage client from an account name using the default identity for authentication. + /// + /// The storage account name. + protected abstract void CreateClientFromIdentity(string accountName); + + private void ValidateAccountName(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure blob storage account name is invalid."); + throw new ConfigurationValueException("The Azure blob storage account name is invalid."); + } + } + + private void ValidateAccountKey(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure blob storage account key is invalid."); + throw new ConfigurationValueException("The Azure blob storage account key is invalid."); + } + } + + private void ValidateConnectionString(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure blob storage account connection string is invalid."); + throw new ConfigurationValueException("The Azure blob storage account connection string is invalid."); + } + } + } +} diff --git a/src/dotnet/Common/Settings/BlobStorageServiceSettings.cs b/src/dotnet/Common/Settings/BlobStorageServiceSettings.cs index 4b2d302476..f111240e45 100644 --- a/src/dotnet/Common/Settings/BlobStorageServiceSettings.cs +++ b/src/dotnet/Common/Settings/BlobStorageServiceSettings.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using System.Text.Json.Serialization; namespace FoundationaLLM.Common.Settings { @@ -12,14 +8,15 @@ namespace FoundationaLLM.Common.Settings public record BlobStorageServiceSettings { /// - /// The name of the blob storage account. + /// A value indicating the type of authentication used. /// - public required string AccountName { get; set; } + [JsonConverter(typeof(JsonStringEnumConverter))] + public required BlobStorageAuthenticationTypes AuthenticationType { get; set; } /// - /// A value indicating the type of authentication used. + /// The name of the blob storage account. /// - public required BlobStorageAuthenticationTypes AuthenticationType { get; set; } + public string? AccountName { get; set; } /// /// The account key used for authentication. diff --git a/src/dotnet/Vectorization/DataFormats/PDF/PDFTextExtractor.cs b/src/dotnet/Vectorization/DataFormats/PDF/PDFTextExtractor.cs new file mode 100644 index 0000000000..6a941961ee --- /dev/null +++ b/src/dotnet/Vectorization/DataFormats/PDF/PDFTextExtractor.cs @@ -0,0 +1,30 @@ +using System.Text; +using UglyToad.PdfPig; +using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; + +namespace FoundationaLLM.Vectorization.DataFormats.PDF +{ + /// + /// Extracts text from PDF files. + /// + public class PDFTextExtractor + { + /// + /// Extracts the text content from a PDF document. + /// + /// The binary content of the PDF document. + /// The text content of the PDF document. + public static string GetText(BinaryData binaryContent) + { + StringBuilder sb = new(); + using var pdfDocument = PdfDocument.Open(binaryContent.ToStream()); + foreach (var page in pdfDocument.GetPages()) + { + var text = ContentOrderTextExtractor.GetText(page); + sb.Append(text); + } + + return sb.ToString().Trim(); + } + } +} diff --git a/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs b/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs index a9ac0d8e50..5d629c6b1b 100644 --- a/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs +++ b/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs @@ -1,18 +1,31 @@ using FoundationaLLM.Common.Constants; +using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; namespace FoundationaLLM.Vectorization.Handlers { - public class EmbeddingHandler : VectorizationStepHandlerBase + /// + /// Handles the embedding stage of the vectorization pipeline. + /// + /// The dictionary of named parameters used to configure the handler. + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// The that manages content sources. + /// The that manages vectorization state. + /// The logger factory used to create loggers for logging. + public class EmbeddingHandler( + Dictionary parameters, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + IVectorizationStateService stateService, + ILoggerFactory loggerFactory) : VectorizationStepHandlerBase(VectorizationSteps.Embed, parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory) { - public EmbeddingHandler( - Dictionary parameters) : base(VectorizationSteps.Embed, parameters) - { - } - - protected override async Task ProcessRequest(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) - { - await Task.Delay(TimeSpan.FromSeconds(10)); - } + /// + protected override async Task ProcessRequest( + VectorizationRequest request, + VectorizationState state, + IConfigurationSection? stepConfiguration, + CancellationToken cancellationToken) => await Task.Delay(TimeSpan.FromSeconds(10)); } } diff --git a/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs b/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs index abc784ac09..865e21a023 100644 --- a/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs +++ b/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs @@ -1,18 +1,44 @@ using FoundationaLLM.Common.Constants; +using FoundationaLLM.Vectorization.Exceptions; +using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; namespace FoundationaLLM.Vectorization.Handlers { - public class ExtractionHandler : VectorizationStepHandlerBase + /// + /// Handles the extraction stage of the vectorization pipeline. + /// + /// The dictionary of named parameters used to configure the handler. + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// The that manages content sources. + /// The that manages vectorization state. + /// The logger factory used to create loggers for logging. + public class ExtractionHandler( + Dictionary parameters, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + IVectorizationStateService stateService, + ILoggerFactory loggerFactory) : VectorizationStepHandlerBase(VectorizationSteps.Extract, parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory) { - public ExtractionHandler( - Dictionary parameters) : base(VectorizationSteps.Extract, parameters) + /// + protected override async Task ProcessRequest( + VectorizationRequest request, + VectorizationState state, + IConfigurationSection? stepConfiguration, + CancellationToken cancellationToken) { - } + var contentSource = _contentSourceManagerService.GetContentSource(_parameters["content_source_name"]); - protected override async Task ProcessRequest(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) - { - await Task.Delay(TimeSpan.FromSeconds(10)); + var textContent = await contentSource.ExtractTextFromFileAsync(request.ContentIdentifier.MultipartId, cancellationToken); + + state.AddOrReplaceArtifact(new VectorizationArtifact + { + Type = VectorizationArtifactType.ExtractedText, + Position = 1, + Content = textContent + }); } } } diff --git a/src/dotnet/Vectorization/Handlers/IndexingHandler.cs b/src/dotnet/Vectorization/Handlers/IndexingHandler.cs index 0262a71f41..ab62e008e2 100644 --- a/src/dotnet/Vectorization/Handlers/IndexingHandler.cs +++ b/src/dotnet/Vectorization/Handlers/IndexingHandler.cs @@ -1,18 +1,31 @@ using FoundationaLLM.Common.Constants; +using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; namespace FoundationaLLM.Vectorization.Handlers { - public class IndexingHandler : VectorizationStepHandlerBase + /// + /// Handles the indexing stage of the vectorization pipeline. + /// + /// The dictionary of named parameters used to configure the handler. + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// The that manages content sources. + /// The that manages vectorization state. + /// The logger factory used to create loggers for logging. + public class IndexingHandler( + Dictionary parameters, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + IVectorizationStateService stateService, + ILoggerFactory loggerFactory) : VectorizationStepHandlerBase(VectorizationSteps.Index, parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory) { - public IndexingHandler( - Dictionary parameters) : base(VectorizationSteps.Index, parameters) - { - } - - protected override async Task ProcessRequest(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) - { - await Task.Delay(TimeSpan.FromSeconds(10)); - } + /// + protected override async Task ProcessRequest( + VectorizationRequest request, + VectorizationState state, + IConfigurationSection? stepConfiguration, + CancellationToken cancellationToken) => await Task.Delay(TimeSpan.FromSeconds(10)); } } diff --git a/src/dotnet/Vectorization/Handlers/PartitionHandler.cs b/src/dotnet/Vectorization/Handlers/PartitionHandler.cs index 78e43df070..6e044d846b 100644 --- a/src/dotnet/Vectorization/Handlers/PartitionHandler.cs +++ b/src/dotnet/Vectorization/Handlers/PartitionHandler.cs @@ -1,18 +1,47 @@ using FoundationaLLM.Common.Constants; +using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; +using Microsoft.ML.Tokenizers; namespace FoundationaLLM.Vectorization.Handlers { - public class PartitionHandler : VectorizationStepHandlerBase + /// + /// Handles the partitioning stage of the vectorization pipeline. + /// + /// The dictionary of named parameters used to configure the handler. + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// The that manages content sources. + /// The that manages vectorization state. + /// The logger factory used to create loggers for logging. + public class PartitionHandler( + Dictionary parameters, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + IVectorizationStateService stateService, + ILoggerFactory loggerFactory) : VectorizationStepHandlerBase(VectorizationSteps.Partition, parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory) { - public PartitionHandler( - Dictionary parameters) : base(VectorizationSteps.Partition, parameters) + /// + protected override async Task ProcessRequest( + VectorizationRequest request, + VectorizationState state, + IConfigurationSection? stepConfiguration, + CancellationToken cancellationToken) { - } + await _stateService.LoadArtifacts(state, VectorizationArtifactType.ExtractedText); - protected override async Task ProcessRequest(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) - { - await Task.Delay(TimeSpan.FromSeconds(10)); + var extractedTextArtifact = state.Artifacts.SingleOrDefault(a => a.Type == VectorizationArtifactType.ExtractedText + && a.Position == 1 && !string.IsNullOrWhiteSpace(a.Content)); + + if (extractedTextArtifact == null) + { + state.Log(this, request.Id, "The extracted text artifact was not found."); + return; + } + + var tokenizer = new Tokenizer(new Bpe()); + var tokens = tokenizer.Encode(extractedTextArtifact.Content!); } } } diff --git a/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs b/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs index 2e1608ef47..0c0acedf1d 100644 --- a/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs +++ b/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs @@ -1,39 +1,94 @@ using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; namespace FoundationaLLM.Vectorization.Handlers { - public class VectorizationStepHandlerBase : IVectorizationStepHandler + /// + /// Implements basic vectorization step handler functionality. + /// + /// The identifier of the vectorization step. + /// The dictionary of named parameters used to configure the handler. + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// The that manages content sources. + /// The that manages vectorization state. + /// The logger factory used to create loggers for logging. + public class VectorizationStepHandlerBase( + string stepId, + Dictionary parameters, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + IVectorizationStateService stateService, + ILoggerFactory loggerFactory) : IVectorizationStepHandler { - protected readonly string _stepId = string.Empty; - protected readonly Dictionary _parameters; + /// + /// The identifier of the vectorization step. + /// + protected readonly string _stepId = stepId; + /// + /// The dictionary of named parameters used to configure the handler. + /// + protected readonly Dictionary _parameters = parameters; + /// + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// + protected readonly IConfigurationSection? _stepsConfiguration = stepsConfiguration; + /// + /// The content source manager service. + /// + protected readonly IContentSourceManagerService _contentSourceManagerService = contentSourceManagerService; + /// + /// The vectorization state service. + /// + protected readonly IVectorizationStateService _stateService = stateService; + /// + /// The logger used for logging. + /// + protected readonly ILogger _logger = + loggerFactory.CreateLogger(); /// public string StepId => _stepId; - public VectorizationStepHandlerBase( - string stepId, Dictionary parameters) - { - _stepId = stepId; - _parameters = parameters; - } - /// public async Task Invoke(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) { try { state.LogHandlerStart(this, request.Id); + _logger.LogInformation("Starting handler {HandlerId} for request {RequestId}", _stepId, request.Id); + + var stepConfiguration = default(IConfigurationSection); + + if (_parameters.TryGetValue("configuration_section", out string? configurationSection)) + { + stepConfiguration = _stepsConfiguration!.GetSection(configurationSection); + + if (stepConfiguration == null + || ( + stepConfiguration.Value == null + && !stepConfiguration.GetChildren().Any() + )) + { + _logger.LogError("The configuration section {ConfigurationSection} expected by the {StepId} handler is not available.", + configurationSection, _stepId); + throw new VectorizationException( + $"The configuration section {configurationSection} expected by the {_stepId} handler is not available."); + } + } ValidateRequest(request); - await ProcessRequest(request, state, cancellationToken); + await ProcessRequest(request, state, stepConfiguration, cancellationToken); state.LogHandlerEnd(this, request.Id); + _logger.LogInformation("Finished handler {HandlerId} for request {RequestId}", _stepId, request.Id); } catch (Exception ex) { state.LogHandlerError(this, request.Id, ex); + _logger.LogError(ex, "Error in executing [extract] step handler for request {VectorizationRequestId}.", request.Id); } } @@ -43,7 +98,20 @@ private void ValidateRequest(VectorizationRequest request) throw new VectorizationException($"The request with id {request.Id} does not contain a step with id {_stepId}."); } - protected virtual async Task ProcessRequest(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) => + /// + /// Processes a vectorization request. + /// The vectorization state will be updated with the result(s) of the processing. + /// + /// The to be processed. + /// The associated with the vectorization request. + /// The providing the configuration required by the step. + /// The that signals stopping the processing. + /// + protected virtual async Task ProcessRequest( + VectorizationRequest request, + VectorizationState state, + IConfigurationSection? stepConfiguration, + CancellationToken cancellationToken) => await Task.Delay(TimeSpan.FromSeconds(30), cancellationToken); } } diff --git a/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerFactory.cs b/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerFactory.cs index 557c8c9148..850a8fb2a7 100644 --- a/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerFactory.cs +++ b/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerFactory.cs @@ -1,6 +1,8 @@ using FoundationaLLM.Common.Constants; using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; namespace FoundationaLLM.Vectorization.Handlers { @@ -14,22 +16,25 @@ public class VectorizationStepHandlerFactory /// /// The identifier of the vectorization pipeline step for which the handler is created. /// The parameters used to initialize the vectorization step handler. + /// The app configuration section containing the configuration for vectorization pipeline steps. + /// The that manages content sources. + /// The that manages vectorization state. + /// The logger factory used to create loggers. /// A class implementing . - public static IVectorizationStepHandler Create(string step, Dictionary parameters) - { - switch (step) + public static IVectorizationStepHandler Create( + string step, + Dictionary parameters, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + IVectorizationStateService stateService, + ILoggerFactory loggerFactory) => + step switch { - case VectorizationSteps.Extract: - return new ExtractionHandler(parameters); - case VectorizationSteps.Partition: - return new PartitionHandler(parameters); - case VectorizationSteps.Embed: - return new EmbeddingHandler(parameters); - case VectorizationSteps.Index: - return new IndexingHandler(parameters); - default: - throw new VectorizationException($"There is no handler available for the vectorization pipeline step [{step}]."); - } - } + VectorizationSteps.Extract => new ExtractionHandler(parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory), + VectorizationSteps.Partition => new PartitionHandler(parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory), + VectorizationSteps.Embed => new EmbeddingHandler(parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory), + VectorizationSteps.Index => new IndexingHandler(parameters, stepsConfiguration, contentSourceManagerService, stateService, loggerFactory), + _ => throw new VectorizationException($"There is no handler available for the vectorization pipeline step [{step}]."), + }; } } diff --git a/src/dotnet/Vectorization/Interfaces/IContentSourceManagerService.cs b/src/dotnet/Vectorization/Interfaces/IContentSourceManagerService.cs new file mode 100644 index 0000000000..0e5140017f --- /dev/null +++ b/src/dotnet/Vectorization/Interfaces/IContentSourceManagerService.cs @@ -0,0 +1,21 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Vectorization.Interfaces +{ + /// + /// Manages content sources registered for use by the vectorization pipelines. + /// + public interface IContentSourceManagerService + { + /// + /// Gets a content source specified by name. + /// + /// The name of the content source to retrieve. + /// The instance of the requested content source. + IContentSourceService GetContentSource(string contentSourceName); + } +} diff --git a/src/dotnet/Vectorization/Interfaces/IContentSourceService.cs b/src/dotnet/Vectorization/Interfaces/IContentSourceService.cs index e62a221e7a..8ae9133e5c 100644 --- a/src/dotnet/Vectorization/Interfaces/IContentSourceService.cs +++ b/src/dotnet/Vectorization/Interfaces/IContentSourceService.cs @@ -4,15 +4,16 @@ namespace FoundationaLLM.Vectorization.Interfaces { /// - /// Provides access to a content source. + /// Provides access to files from a content source. /// public interface IContentSourceService { /// - /// Reads the binary content of a specified document from the content source. + /// Reads the binary content of a specified file from the storage. /// - /// The path of the document to read. - /// The binary content of the document. - Task ReadFileAsync(string filePath); + /// The multipart unique identifier of the file being read. + /// The cancellation token that signals that operations should be cancelled. + /// The string content of the file. + Task ExtractTextFromFileAsync(List multipartId, CancellationToken cancellationToken); } } diff --git a/src/dotnet/Vectorization/Interfaces/IRequestSourceService.cs b/src/dotnet/Vectorization/Interfaces/IRequestSourceService.cs index 8b2027e3c8..20f692883b 100644 --- a/src/dotnet/Vectorization/Interfaces/IRequestSourceService.cs +++ b/src/dotnet/Vectorization/Interfaces/IRequestSourceService.cs @@ -22,7 +22,7 @@ public interface IRequestSourceService /// /// Receives the specified number of requests. /// The received requests will be invisible for other clients for a default timeout of 30 seconds. - /// They should be removed from the source by calling before the timeout expires. + /// They should be removed from the source by calling before the timeout expires. /// /// The number of requests to receive. /// A collection of tuples containg a object, a message id and a pop receipt. diff --git a/src/dotnet/Vectorization/Interfaces/IVectorizationStateService.cs b/src/dotnet/Vectorization/Interfaces/IVectorizationStateService.cs index a6679383b8..ba6388a41d 100644 --- a/src/dotnet/Vectorization/Interfaces/IVectorizationStateService.cs +++ b/src/dotnet/Vectorization/Interfaces/IVectorizationStateService.cs @@ -22,6 +22,14 @@ public interface IVectorizationStateService /// A item containe the requested vectorization state. Task ReadState(VectorizationRequest request); + /// + /// Loads into the state the specified type of artifact(s). + /// + /// The vectorization state in which the artifacts will be loaded. + /// The type of artifact(s) to load. + /// + Task LoadArtifacts(VectorizationState state, VectorizationArtifactType artifactType); + /// /// Saves a specified vectorization state. /// diff --git a/src/dotnet/Vectorization/Models/Configuration/ContentSourceManagerServiceSettings.cs b/src/dotnet/Vectorization/Models/Configuration/ContentSourceManagerServiceSettings.cs new file mode 100644 index 0000000000..e979d36f80 --- /dev/null +++ b/src/dotnet/Vectorization/Models/Configuration/ContentSourceManagerServiceSettings.cs @@ -0,0 +1,16 @@ +using FoundationaLLM.Vectorization.Services.ContentSources; +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Vectorization.Models.Configuration +{ + /// + /// Provides configuration settings for the service. + /// + public class ContentSourceManagerServiceSettings + { + /// + /// The list of all content sources that are registered for use by the vectorization pipelines. + /// + public required List ContentSources { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/Configuration/VectorizationStateServiceSettings.cs b/src/dotnet/Vectorization/Models/Configuration/VectorizationStateServiceSettings.cs index acb3b1b294..eab296a93b 100644 --- a/src/dotnet/Vectorization/Models/Configuration/VectorizationStateServiceSettings.cs +++ b/src/dotnet/Vectorization/Models/Configuration/VectorizationStateServiceSettings.cs @@ -1,4 +1,5 @@ -using System; +using FoundationaLLM.Common.Settings; +using System; using System.Collections.Generic; using System.Linq; using System.Text; @@ -12,13 +13,13 @@ namespace FoundationaLLM.Vectorization.Models.Configuration public class VectorizationStateServiceSettings { /// - /// The connection string to connect to the underlying persistence service. + /// The settings for connecting to the underlying blob storage. /// - public required string ConnectionString { get; set; } + public required BlobStorageServiceSettings Storage { get; set; } /// /// The name of the container where the underlying persistence service stores vectorization state. /// - public required string ContainerName { get; set; } + public required string StorageContainerName { get; set; } } } diff --git a/src/dotnet/Vectorization/Models/Configuration/VectorizationWorkerSettings.cs b/src/dotnet/Vectorization/Models/Configuration/VectorizationWorkerSettings.cs index eb7498107b..018f5d6552 100644 --- a/src/dotnet/Vectorization/Models/Configuration/VectorizationWorkerSettings.cs +++ b/src/dotnet/Vectorization/Models/Configuration/VectorizationWorkerSettings.cs @@ -2,14 +2,26 @@ namespace FoundationaLLM.Vectorization.Models.Configuration { + /// + /// Settings for the vectorization worker. + /// public class VectorizationWorkerSettings { + /// + /// Settings for the request managers. + /// [JsonPropertyOrder(0)] public List? RequestManagers { get; set; } + /// + /// Settings for the request sources. + /// [JsonPropertyOrder(1)] public List? RequestSources { get; set; } + /// + /// The type of queuing engine used to dispatch vectorization requests. + /// [JsonPropertyOrder(2)] [JsonConverter(typeof(JsonStringEnumConverter))] public VectorizationQueuing QueuingEngine { get; set; } diff --git a/src/dotnet/Vectorization/Models/ContentSource.cs b/src/dotnet/Vectorization/Models/ContentSource.cs new file mode 100644 index 0000000000..9d0342fc2c --- /dev/null +++ b/src/dotnet/Vectorization/Models/ContentSource.cs @@ -0,0 +1,21 @@ +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Vectorization.Models +{ + /// + /// Provides detials about a content source. + /// + public class ContentSource + { + /// + /// The name of the content source. + /// + public required string Name { get; set; } + + /// + /// The type of the content source. + /// + [JsonConverter(typeof(JsonStringEnumConverter))] + public required ContentSourceType Type { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/ContentSourceType.cs b/src/dotnet/Vectorization/Models/ContentSourceType.cs new file mode 100644 index 0000000000..177fad9976 --- /dev/null +++ b/src/dotnet/Vectorization/Models/ContentSourceType.cs @@ -0,0 +1,19 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Vectorization.Models +{ + /// + /// Types of content sources from which documents can be retrieved. + /// + public enum ContentSourceType + { + /// + /// Azure data lake storage account. + /// + AzureDataLake + } +} diff --git a/src/dotnet/Vectorization/Models/VectorizationArtifact.cs b/src/dotnet/Vectorization/Models/VectorizationArtifact.cs new file mode 100644 index 0000000000..7d911e9e41 --- /dev/null +++ b/src/dotnet/Vectorization/Models/VectorizationArtifact.cs @@ -0,0 +1,46 @@ +using System.Text.Json.Serialization; +using UglyToad.PdfPig.Outline; + +namespace FoundationaLLM.Vectorization.Models +{ + /// + /// Represents a vectorization artifact. + /// + public class VectorizationArtifact + { + /// + /// The type of the vectorization artifact. + /// + [JsonPropertyOrder(1)] + [JsonPropertyName("type")] + [JsonConverter(typeof(JsonStringEnumConverter))] + public VectorizationArtifactType Type { get; set; } + + /// + /// The canonical identifier of the vectorization artifact. + /// + [JsonPropertyOrder(2)] + [JsonPropertyName ("canonical_id")] + public string? CanonicalId { get; set; } + + /// + /// The position of the vectorization artifact. + /// This is relevant only for artifact types that can have multiple parts. + /// + [JsonPropertyOrder(3)] + [JsonPropertyName("position")] + public int Position { get; set; } + + /// + /// The content of the artifact. + /// + [JsonIgnore] + public string? Content { get; set; } + + /// + /// Indicates the need to persist the content of the artifact. + /// + [JsonIgnore] + public bool IsDirty { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/VectorizationArtifactType.cs b/src/dotnet/Vectorization/Models/VectorizationArtifactType.cs new file mode 100644 index 0000000000..c3404144f8 --- /dev/null +++ b/src/dotnet/Vectorization/Models/VectorizationArtifactType.cs @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Vectorization.Models +{ + /// + /// Deifines types of vectorization artifacts. + /// + public enum VectorizationArtifactType + { + /// + /// Text extracted from source content. + /// + ExtractedText, + /// + /// Text partition suitable for embedding. + /// + TextPartition, + /// + /// Vector embedding derived from a text partition. + /// + TextEmbeddingVector + } +} diff --git a/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs b/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs index 4a5e852513..f5b54bdbc2 100644 --- a/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs +++ b/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs @@ -13,11 +13,18 @@ namespace FoundationaLLM.Vectorization.Models public class VectorizationContentIdentifier { /// - /// The unique identifier of the content (i.e., document) being vectorized. + /// The multipart unique identifier of the the content (i.e. document) being vectorized. /// [JsonPropertyOrder(1)] - [JsonPropertyName("unique_id")] - public required string UniqueId { get; set; } + [JsonPropertyName("multipart_id")] + public required List MultipartId { get; set; } + + /// + /// The unique identifier of the content (i.e., document) being vectorized. + /// The identifier is determined by concatenating the parts from . + /// + [JsonIgnore] + public string UniqueId => string.Join("/", MultipartId); /// /// The canonical identifier of the content being vectorized. diff --git a/src/dotnet/Vectorization/Models/VectorizationLogEntry.cs b/src/dotnet/Vectorization/Models/VectorizationLogEntry.cs index 3e4bf705fa..58926e4331 100644 --- a/src/dotnet/Vectorization/Models/VectorizationLogEntry.cs +++ b/src/dotnet/Vectorization/Models/VectorizationLogEntry.cs @@ -2,29 +2,37 @@ namespace FoundationaLLM.Vectorization.Models { - public class VectorizationLogEntry + /// + /// Represents a log entry that contains information about vectorization operations. + /// + public class VectorizationLogEntry(string requestId, string source, string text) { + /// + /// The unique identifier of the vectorization request. + /// [JsonPropertyOrder(0)] [JsonPropertyName("rid")] - public string RequestId { get; set; } + public string RequestId { get; set; } = requestId; + /// + /// The time at which the log entry was created. + /// [JsonPropertyOrder(1)] [JsonPropertyName("t")] public DateTimeOffset Time { get; set; } = DateTimeOffset.UtcNow; + /// + /// The source of the log entry. This is usually the name of the vectorization step handler. + /// [JsonPropertyOrder(2)] [JsonPropertyName("src")] - public string Source { get; set; } + public string Source { get; set; } = source; + /// + /// The content of the log entry. + /// [JsonPropertyOrder(3)] [JsonPropertyName("txt")] - public string Text { get; set; } - - public VectorizationLogEntry(string requestId, string source, string text) - { - this.RequestId = requestId; - this.Source = source; - this.Text = text; - } + public string Text { get; set; } = text; } } diff --git a/src/dotnet/Vectorization/Models/VectorizationQueuing.cs b/src/dotnet/Vectorization/Models/VectorizationQueuing.cs index bf7e061fad..462bd8af2e 100644 --- a/src/dotnet/Vectorization/Models/VectorizationQueuing.cs +++ b/src/dotnet/Vectorization/Models/VectorizationQueuing.cs @@ -6,9 +6,18 @@ namespace FoundationaLLM.Vectorization.Models { + /// + /// Types of queuing engines used to dispatch vectorization requests. + /// public enum VectorizationQueuing { + /// + /// No persisted queuing. Results in using a simple, non-production grade, in-memory queuing mechanism. + /// None, + /// + /// Azure storage account queuing. + /// AzureStorageQueue } } diff --git a/src/dotnet/Vectorization/Models/VectorizationState.cs b/src/dotnet/Vectorization/Models/VectorizationState.cs index 8e4ec19e9f..a5b8aba852 100644 --- a/src/dotnet/Vectorization/Models/VectorizationState.cs +++ b/src/dotnet/Vectorization/Models/VectorizationState.cs @@ -30,10 +30,17 @@ public class VectorizationState [JsonPropertyName("content_identifier")] public required VectorizationContentIdentifier ContentIdentifier { get; set; } + /// + /// The vectorization artifacts associated with the vectorization state. + /// + [JsonPropertyOrder(2)] + [JsonPropertyName("artifacts")] + public List Artifacts { get; set; } = []; + /// /// The list of log entries associated with actions executed by the vectorization pipeline. /// - [JsonPropertyOrder(18)] + [JsonPropertyOrder(20)] [JsonPropertyName("log")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public List LogEntries { get; set; } = []; @@ -87,5 +94,19 @@ public static VectorizationState FromRequest(VectorizationRequest request) => CurrentRequestId = request.Id, ContentIdentifier = request.ContentIdentifier }; + + /// + /// Adds or replaces a vectorization artifact associated with the vectorization state. + /// + /// The to be added or replaced. + public void AddOrReplaceArtifact(VectorizationArtifact artifact) + { + var existingArtifact = Artifacts.SingleOrDefault(a => a.Type == artifact.Type & a.Position == artifact.Position); + if (existingArtifact != null) + Artifacts.Remove(existingArtifact); + + artifact.IsDirty = true; + Artifacts.Add(artifact); + } } } diff --git a/src/dotnet/Vectorization/Models/VectorizationStep.cs b/src/dotnet/Vectorization/Models/VectorizationStep.cs index 42c9354f62..dd62f8621c 100644 --- a/src/dotnet/Vectorization/Models/VectorizationStep.cs +++ b/src/dotnet/Vectorization/Models/VectorizationStep.cs @@ -7,12 +7,21 @@ namespace FoundationaLLM.Vectorization.Models { + /// + /// Represents a vectorization step in a vectorization request. + /// public class VectorizationStep { + /// + /// The identifier of the step. + /// [JsonPropertyOrder(0)] [JsonPropertyName("id")] public required string Id { get; set; } + /// + /// Dictionary-based configuration for the step. + /// [JsonPropertyOrder(1)] [JsonPropertyName("parameters")] public required Dictionary Parameters { get; set; } diff --git a/src/dotnet/Vectorization/Services/ContentSources/BlobStorageContentSourceService.cs b/src/dotnet/Vectorization/Services/ContentSources/BlobStorageContentSourceService.cs deleted file mode 100644 index 227872f93c..0000000000 --- a/src/dotnet/Vectorization/Services/ContentSources/BlobStorageContentSourceService.cs +++ /dev/null @@ -1,20 +0,0 @@ -using FoundationaLLM.Vectorization.Interfaces; -using System; -using System.Threading.Tasks; - -namespace FoundationaLLM.Vectorization.Services.ContentSources -{ - public class BlobStorageContentSourceService : ContentSourceServiceBase, IContentSourceService - { - public BlobStorageContentSourceService() - { - } - - /// - public async Task ReadFileAsync(string filePath) - { - await Task.CompletedTask; - throw new NotImplementedException(); - } - } -} diff --git a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceManagerService.cs b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceManagerService.cs new file mode 100644 index 0000000000..767777c416 --- /dev/null +++ b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceManagerService.cs @@ -0,0 +1,56 @@ +using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Settings; +using FoundationaLLM.Vectorization.Exceptions; +using FoundationaLLM.Vectorization.Interfaces; +using FoundationaLLM.Vectorization.Models; +using FoundationaLLM.Vectorization.Models.Configuration; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace FoundationaLLM.Vectorization.Services.ContentSources +{ + /// + /// Manages content sources registered for use in the vectorization pipelines. + /// + /// + /// Creates a new instance of the content source manager service. + /// + /// The configuration settings used to initialize the service. + /// The global configuration provider. + /// The logger factory used to create loggers. + public class ContentSourceManagerService( + IOptions options, + IConfiguration configuration, + ILoggerFactory loggerFactory) : IContentSourceManagerService + { + private readonly ContentSourceManagerServiceSettings _settings = options.Value; + private readonly IConfiguration _configuration = configuration; + private readonly ILoggerFactory _loggerFactory = loggerFactory; + + /// + public IContentSourceService GetContentSource(string contentSourceName) + { + var contentSource = _settings.ContentSources.SingleOrDefault(cs => cs.Name == contentSourceName) + ?? throw new VectorizationException($"The content source {contentSourceName} is not registered."); + + return contentSource.Type switch + { + ContentSourceType.AzureDataLake => CreateAzureDataLakeContentSourceService(contentSourceName), + _ => throw new VectorizationException($"The content source type {contentSource.Type} is not supported."), + }; + } + + private DataLakeContentSourceService CreateAzureDataLakeContentSourceService(string contentSourceName) + { + var blobStorageServiceSettings = new BlobStorageServiceSettings { AuthenticationType = BlobStorageAuthenticationTypes.Unknown }; + _configuration.Bind( + $"{AppConfigurationKeySections.FoundationaLLM_Vectorization_ContentSources}:{contentSourceName}", + blobStorageServiceSettings); + + return new DataLakeContentSourceService( + blobStorageServiceSettings, + _loggerFactory); + } + } +} diff --git a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceBase.cs b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceBase.cs index d379a8b3e5..3bf13bc33d 100644 --- a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceBase.cs +++ b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceBase.cs @@ -1,6 +1,24 @@ -namespace FoundationaLLM.Vectorization.Services.ContentSources +using FoundationaLLM.Vectorization.Exceptions; + +namespace FoundationaLLM.Vectorization.Services.ContentSources { + /// + /// Provides common functionalities for all content sources. + /// public class ContentSourceServiceBase { + /// + /// Validates a multipart unique content identifier. + /// + /// The multipart identifier to validate. + /// The required number of parts in the multipart identifier. + /// + public void ValidateMultipartId(List multipartId, int partsCount) + { + if (multipartId == null + || multipartId.Count != partsCount + || multipartId.Any(t => string.IsNullOrWhiteSpace(t))) + throw new VectorizationException("Invalid multipart identifier."); + } } } diff --git a/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs b/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs new file mode 100644 index 0000000000..218cf3aa55 --- /dev/null +++ b/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs @@ -0,0 +1,54 @@ +using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Services; +using FoundationaLLM.Common.Settings; +using FoundationaLLM.Vectorization.DataFormats.PDF; +using FoundationaLLM.Vectorization.Exceptions; +using FoundationaLLM.Vectorization.Interfaces; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace FoundationaLLM.Vectorization.Services.ContentSources +{ + /// + /// Implements a vectorization content source for content residing in blob storage. + /// + public class DataLakeContentSourceService : ContentSourceServiceBase, IContentSourceService + { + private readonly BlobStorageServiceSettings _storageSettings; + private readonly ILogger _logger; + private readonly DataLakeStorageService _dataLakeStorageService; + + /// + /// Creates a new instance of the vectorization content source. + /// + public DataLakeContentSourceService( + BlobStorageServiceSettings storageSettings, + ILoggerFactory loggerFactory) + { + _storageSettings = storageSettings; + _logger = loggerFactory.CreateLogger(); + _dataLakeStorageService = new DataLakeStorageService( + Options.Create(_storageSettings), + loggerFactory.CreateLogger()); + } + + /// + public async Task ExtractTextFromFileAsync(List multipartId, CancellationToken cancellationToken) + { + ValidateMultipartId(multipartId, 3); + + var binaryContent = await _dataLakeStorageService.ReadFileAsync( + multipartId[1], + multipartId[2], + cancellationToken); + + var fileExtension = Path.GetExtension(multipartId[2]); + + return fileExtension.ToLower() switch + { + FileExtensions.PDF => PDFTextExtractor.GetText(binaryContent), + _ => throw new VectorizationException($"The file type for {multipartId[2]} is not supported."), + }; + } + } +} diff --git a/src/dotnet/Vectorization/Services/RequestManagerService.cs b/src/dotnet/Vectorization/Services/RequestManagerService.cs index 6ddfe6c081..ccca58ba7c 100644 --- a/src/dotnet/Vectorization/Services/RequestManagerService.cs +++ b/src/dotnet/Vectorization/Services/RequestManagerService.cs @@ -4,6 +4,7 @@ using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; using FoundationaLLM.Vectorization.Models.Configuration; +using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging; namespace FoundationaLLM.Vectorization.Services @@ -17,7 +18,10 @@ public class RequestManagerService : IRequestManagerService private readonly Dictionary _requestSourceServices; private readonly IRequestSourceService _incomingRequestSourceService; private readonly IVectorizationStateService _vectorizationStateService; - private readonly ILogger _logger; + private readonly IConfigurationSection? _stepsConfiguration; + private readonly IContentSourceManagerService _contentSourceManagerService; + private readonly ILogger _logger; + private readonly ILoggerFactory _loggerFactory; private readonly CancellationToken _cancellationToken; private readonly TaskPool _taskPool; @@ -28,21 +32,28 @@ public class RequestManagerService : IRequestManagerService /// The configuration settings used to initialize the instance. /// The dictionary with all the request source services registered in the vectorization platform. /// The service providing vectorization state management. - /// The logger service. + /// The object providing access to the settings. + /// The that manages content sources. + /// The logger factory used to create loggers. /// The cancellation token that can be used to cancel the work. /// The exception thrown when the initialization of the instance fails. public RequestManagerService( RequestManagerServiceSettings settings, Dictionary requestSourceServices, IVectorizationStateService vectorizationStateService, - ILogger logger, + IConfigurationSection? stepsConfiguration, + IContentSourceManagerService contentSourceManagerService, + ILoggerFactory loggerFactory, CancellationToken cancellationToken) { _settings = settings; _requestSourceServices = requestSourceServices; _vectorizationStateService = vectorizationStateService; + _stepsConfiguration = stepsConfiguration; + _contentSourceManagerService = contentSourceManagerService; - _logger = logger; + _loggerFactory = loggerFactory; + _logger = _loggerFactory.CreateLogger(); _cancellationToken = cancellationToken; if (!_requestSourceServices.TryGetValue(_settings.RequestSourceName, out IRequestSourceService? value) || value == null) @@ -114,7 +125,13 @@ private async Task HandleRequest(VectorizationRequest request) ? await _vectorizationStateService.ReadState(request) : VectorizationState.FromRequest(request); - var stepHandler = VectorizationStepHandlerFactory.Create(_settings.RequestSourceName, request[_settings.RequestSourceName]!.Parameters); + var stepHandler = VectorizationStepHandlerFactory.Create( + _settings.RequestSourceName, + request[_settings.RequestSourceName]!.Parameters, + _stepsConfiguration, + _contentSourceManagerService, + _vectorizationStateService, + _loggerFactory); await stepHandler.Invoke(request, state, _cancellationToken); await _vectorizationStateService.SaveState(state); diff --git a/src/dotnet/Vectorization/Services/RequestSources/RequestSourcesCache.cs b/src/dotnet/Vectorization/Services/RequestSources/RequestSourcesCache.cs index 351691368c..6a838fe416 100644 --- a/src/dotnet/Vectorization/Services/RequestSources/RequestSourcesCache.cs +++ b/src/dotnet/Vectorization/Services/RequestSources/RequestSourcesCache.cs @@ -3,11 +3,6 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace FoundationaLLM.Vectorization.Services.RequestSources { @@ -17,7 +12,8 @@ namespace FoundationaLLM.Vectorization.Services.RequestSources /// /// Creates a new instance of the cache. /// - /// The instance containing the instance. + /// The instance containing the instance. + /// The containing settings for the queues. /// The used to create new loggers for child objects. public class RequestSourcesCache( IOptions vectorizationWorkerOptions, diff --git a/src/dotnet/Vectorization/Services/RequestSources/StorageQueueRequestSourceService.cs b/src/dotnet/Vectorization/Services/RequestSources/StorageQueueRequestSourceService.cs index 59daa08906..fa3abd53d5 100644 --- a/src/dotnet/Vectorization/Services/RequestSources/StorageQueueRequestSourceService.cs +++ b/src/dotnet/Vectorization/Services/RequestSources/StorageQueueRequestSourceService.cs @@ -10,6 +10,9 @@ namespace FoundationaLLM.Vectorization.Services.RequestSources { + /// + /// Implements a request source that uses Azure storage queues. + /// public class StorageQueueRequestSourceService : IRequestSourceService { private readonly RequestSourceServiceSettings _settings; @@ -20,6 +23,11 @@ public class StorageQueueRequestSourceService : IRequestSourceService /// public string SourceName => _settings.Name; + /// + /// Creates a new instance of the request source. + /// + /// The object providing the settings. + /// The logger used for logging. public StorageQueueRequestSourceService( RequestSourceServiceSettings settings, ILogger logger) diff --git a/src/dotnet/Vectorization/Services/VectorizationStates/BlobStorageVectorizationStateService.cs b/src/dotnet/Vectorization/Services/VectorizationStates/BlobStorageVectorizationStateService.cs index cb24edaaef..42029de941 100644 --- a/src/dotnet/Vectorization/Services/VectorizationStates/BlobStorageVectorizationStateService.cs +++ b/src/dotnet/Vectorization/Services/VectorizationStates/BlobStorageVectorizationStateService.cs @@ -1,63 +1,93 @@ using Azure.Core; using Azure.Storage.Blobs; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Services; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; using FoundationaLLM.Vectorization.Models.Configuration; +using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using System.Text; using System.Text.Json; namespace FoundationaLLM.Vectorization.Services.VectorizationStates { + /// + /// Provides vectorization state persistence services using Azure blob storage. + /// public class BlobStorageVectorizationStateService : VectorizationStateServiceBase, IVectorizationStateService { - private readonly BlobServiceClient _blobServiceClient; + private readonly BlobStorageService _storageService; private readonly BlobStorageVectorizationStateServiceSettings _settings; + private readonly ILoggerFactory _loggerFactory; - public BlobStorageVectorizationStateService(IOptions options) + /// + /// Creates a new vectorization state service instance. + /// + /// The options used to configure the new instance. + /// The logger factory used to create loggers. + public BlobStorageVectorizationStateService( + IOptions options, + ILoggerFactory loggerFactory) { _settings = options.Value; - _blobServiceClient = new BlobServiceClient(_settings.ConnectionString); + _loggerFactory = loggerFactory; + _storageService = new BlobStorageService( + Options.Create(_settings.Storage), + _loggerFactory.CreateLogger()); } /// - public async Task HasState(VectorizationRequest request) - { - var blobClient = GetBlobClient(request.ContentIdentifier); + public async Task HasState(VectorizationRequest request) => + await _storageService.FileExistsAsync( + _settings.StorageContainerName, + $"{GetPersistenceIdentifier(request.ContentIdentifier)}.json"); - return await blobClient.ExistsAsync(); - } /// public async Task ReadState(VectorizationRequest request) { - var blobClient = GetBlobClient(request.ContentIdentifier); + var content = await _storageService.ReadFileAsync( + _settings.StorageContainerName, + $"{GetPersistenceIdentifier(request.ContentIdentifier)}.json"); - var response = await blobClient.DownloadAsync(); - using var reader = new StreamReader(response.Value.Content); - var content = await reader.ReadToEndAsync(); return JsonSerializer.Deserialize(content)!; } + /// + public async Task LoadArtifacts(VectorizationState state, VectorizationArtifactType artifactType) + { + foreach (var artifact in state.Artifacts.Where(a => a.Type == artifactType)) + if (!string.IsNullOrWhiteSpace(artifact.CanonicalId)) + artifact.Content = Encoding.UTF8.GetString( + await _storageService.ReadFileAsync( + _settings.StorageContainerName, + artifact.CanonicalId)); + } + /// public async Task SaveState(VectorizationState state) { - var blobClient = GetBlobClient(state.ContentIdentifier); + var persistenceIdentifier = GetPersistenceIdentifier(state.ContentIdentifier); - var content = JsonSerializer.Serialize(state); - var stream = new MemoryStream(Encoding.UTF8.GetBytes(content)); + foreach (var artifact in state.Artifacts) + if (artifact.IsDirty) + { + var artifactPath = + $"{persistenceIdentifier}_{artifact.Type.ToString().ToLower()}_{artifact.Position:D6}.txt"; - await blobClient.UploadAsync(stream, true); - } + await _storageService.WriteFileAsync( + _settings.StorageContainerName, + artifactPath, + artifact.Content); + artifact.CanonicalId = artifactPath; + } - private BlobClient GetBlobClient(VectorizationContentIdentifier contentIdentifier) - { - var containerClient = _blobServiceClient.GetBlobContainerClient(_settings.ContainerName); - return containerClient.GetBlobClient( - GetPersistenceIdentifier(contentIdentifier)); + var content = JsonSerializer.Serialize(state); + await _storageService.WriteFileAsync( + _settings.StorageContainerName, + $"{persistenceIdentifier}.json", + content); } - - protected override string GetPersistenceIdentifier(VectorizationContentIdentifier contentIdentifier) => - $"{contentIdentifier.CanonicalId}_state_{HashContentIdentifier(contentIdentifier)}.json"; } } diff --git a/src/dotnet/Vectorization/Services/VectorizationStates/MemoryVectorizationStateService.cs b/src/dotnet/Vectorization/Services/VectorizationStates/MemoryVectorizationStateService.cs index fce71e2d32..4235d7e620 100644 --- a/src/dotnet/Vectorization/Services/VectorizationStates/MemoryVectorizationStateService.cs +++ b/src/dotnet/Vectorization/Services/VectorizationStates/MemoryVectorizationStateService.cs @@ -4,6 +4,9 @@ namespace FoundationaLLM.Vectorization.Services.VectorizationStates { + /// + /// Provides in-memory vectorization state persistence. + /// public class MemoryVectorizationStateService : VectorizationStateServiceBase, IVectorizationStateService { private readonly Dictionary _vectorizationStateDictionary = []; @@ -29,6 +32,9 @@ public async Task ReadState(VectorizationRequest request) return value; } + /// + public async Task LoadArtifacts(VectorizationState state, VectorizationArtifactType artifactType) => throw new NotImplementedException(); + /// public async Task SaveState(VectorizationState state) { @@ -40,8 +46,5 @@ public async Task SaveState(VectorizationState state) if (!_vectorizationStateDictionary.TryAdd(id, state)) _vectorizationStateDictionary[id] = state; } - - protected override string GetPersistenceIdentifier(VectorizationContentIdentifier contentIdentifier) => - $"{contentIdentifier.CanonicalId}_state_{HashContentIdentifier(contentIdentifier)}"; } } diff --git a/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs b/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs index 5b6f637916..b10df6398e 100644 --- a/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs +++ b/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs @@ -8,10 +8,24 @@ namespace FoundationaLLM.Vectorization.Services.VectorizationStates { + /// + /// Provides base services for the vectorization state services. + /// public abstract class VectorizationStateServiceBase { - protected abstract string GetPersistenceIdentifier(VectorizationContentIdentifier contentIdentifier); + /// + /// Gets the location of the vectorization state based on the content identifier. + /// + /// The holding the content identification information. + /// + protected string GetPersistenceIdentifier(VectorizationContentIdentifier contentIdentifier) => + $"{contentIdentifier.CanonicalId}_state_{HashContentIdentifier(contentIdentifier)}"; + /// + /// Computes the MD5 hash of the content identifier. + /// + /// The holding the content identification information. + /// protected static string HashContentIdentifier(VectorizationContentIdentifier contentIdentifier) { var byteHash = MD5.HashData( diff --git a/src/dotnet/Vectorization/Vectorization.csproj b/src/dotnet/Vectorization/Vectorization.csproj index b3ae29c555..31ba4d4272 100644 --- a/src/dotnet/Vectorization/Vectorization.csproj +++ b/src/dotnet/Vectorization/Vectorization.csproj @@ -6,12 +6,16 @@ enable FoundationaLLM.Vectorization FoundationaLLM.Vectorization + True + True + + diff --git a/src/dotnet/Vectorization/VectorizationWorker.cs b/src/dotnet/Vectorization/VectorizationWorker.cs index 6202108085..b22724f8f4 100644 --- a/src/dotnet/Vectorization/VectorizationWorker.cs +++ b/src/dotnet/Vectorization/VectorizationWorker.cs @@ -4,18 +4,19 @@ namespace FoundationaLLM.Vectorization { - public class VectorizationWorker + /// + /// Provides the core execution context for the vectorization worker. + /// + /// The collection of request managers to run. + public class VectorizationWorker( + IEnumerable requestManagerServices) { - private readonly IEnumerable _requestManagerServices; - - public VectorizationWorker( - IVectorizationStateService vectorizationStateService, - IDictionary requestSourceServices, - IEnumerable requestManagerServices, - ILogger logger, - CancellationToken cancellationToken) => - _requestManagerServices = requestManagerServices; + private readonly IEnumerable _requestManagerServices = requestManagerServices; + /// + /// Starts all the request managers and enters a holding pattern waiting on them to stop. + /// + /// public async Task Run() { var requestManagerTasks = _requestManagerServices diff --git a/src/dotnet/Vectorization/VectorizationWorkerBuilder.cs b/src/dotnet/Vectorization/VectorizationWorkerBuilder.cs index 48e165d3c4..f5c66040b6 100644 --- a/src/dotnet/Vectorization/VectorizationWorkerBuilder.cs +++ b/src/dotnet/Vectorization/VectorizationWorkerBuilder.cs @@ -22,6 +22,8 @@ public class VectorizationWorkerBuilder private IVectorizationStateService? _stateService; private CancellationToken _cancellationToken = default; private ILoggerFactory? _loggerFactory; + private IConfigurationSection? _stepsConfiguration; + private IContentSourceManagerService? _contentSourceManagerService; private readonly RequestSourcesBuilder _requestSourcesBuilder = new(); @@ -48,6 +50,9 @@ public VectorizationWorker Build() if (_loggerFactory == null) throw new VectorizationException("Cannot build a vectorization worker without a valid logger factory."); + if (_contentSourceManagerService == null) + throw new VectorizationException("Cannot build a vectorization worker without a valid content sources manager."); + var requestSourceServices = _requestSourcesBuilder.Build(); var requestManagerServices = _settings!.RequestManagers! @@ -55,16 +60,14 @@ public VectorizationWorker Build() rm, requestSourceServices, _stateService, - _loggerFactory!.CreateLogger(), + _stepsConfiguration, + _contentSourceManagerService, + _loggerFactory, _cancellationToken)) .ToList(); var vectorizationWorker = new VectorizationWorker( - _stateService, - requestSourceServices, - requestManagerServices, - _loggerFactory!.CreateLogger(), - _cancellationToken); + requestManagerServices); return vectorizationWorker; } @@ -131,6 +134,28 @@ public VectorizationWorkerBuilder WithQueuesConfiguration(IConfigurationSection return this; } + /// + /// Specifies the configuration section containing settings for vectorization pipeline steps. + /// + /// The object providing access to the settings. + /// The updated instance of the builder. + public VectorizationWorkerBuilder WithStepsConfiguration(IConfigurationSection stepsConfiguration) + { + _stepsConfiguration = stepsConfiguration; + return this; + } + + /// + /// Specifies the content source manager service. + /// + /// The that manages content sources. + /// The updated instance of the builder. + public VectorizationWorkerBuilder WithContentSourceManager(IContentSourceManagerService contentSourceManagerService) + { + _contentSourceManagerService = contentSourceManagerService; + return this; + } + private static void ValidateSettings(VectorizationWorkerSettings settings) { if ( diff --git a/src/dotnet/VectorizationAPI/Program.cs b/src/dotnet/VectorizationAPI/Program.cs index bf05e3c163..30ebac2d01 100644 --- a/src/dotnet/VectorizationAPI/Program.cs +++ b/src/dotnet/VectorizationAPI/Program.cs @@ -50,7 +50,7 @@ // Add configurations to the container builder.Services.AddOptions() - .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_WorkerSettings)); + .Bind(builder.Configuration.GetSection(AppConfigurationKeys.FoundationaLLM_Vectorization_VectorizationWorker)); builder.Services.AddSingleton( typeof(IConfigurationSection), diff --git a/src/dotnet/VectorizationWorker/Program.cs b/src/dotnet/VectorizationWorker/Program.cs index f531f2fd01..33e71d43dc 100644 --- a/src/dotnet/VectorizationWorker/Program.cs +++ b/src/dotnet/VectorizationWorker/Program.cs @@ -7,6 +7,7 @@ using FoundationaLLM.Common.Settings; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models.Configuration; +using FoundationaLLM.Vectorization.Services.ContentSources; using FoundationaLLM.Vectorization.Services.VectorizationStates; using FoundationaLLM.Vectorization.Worker; using Microsoft.ApplicationInsights.AspNetCore.Extensions; @@ -49,17 +50,23 @@ // Add configurations to the container builder.Services.AddOptions() - .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_WorkerSettings)); + .Bind(builder.Configuration.GetSection(AppConfigurationKeys.FoundationaLLM_Vectorization_VectorizationWorker)); builder.Services.AddOptions() - .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_StateServiceSettings)); + .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_StateService)); +builder.Services.AddOptions() + .Bind(builder.Configuration.GetSection(AppConfigurationKeys.FoundationaLLM_Vectorization_ContentSourceManagerService)); builder.Services.AddSingleton( - typeof(IConfigurationSection), - builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_Queues)); + typeof(IEnumerable), + new IConfigurationSection[] { + builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_Queues), + builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_Steps) + }); // Add services to the container. builder.Services.AddTransient(); builder.Services.AddSingleton(); +builder.Services.AddSingleton(); builder.Services.AddHostedService(); builder.Services.AddControllers(); diff --git a/src/dotnet/VectorizationWorker/Worker.cs b/src/dotnet/VectorizationWorker/Worker.cs index d2f162e6cb..ca05d5d3e8 100644 --- a/src/dotnet/VectorizationWorker/Worker.cs +++ b/src/dotnet/VectorizationWorker/Worker.cs @@ -1,4 +1,5 @@  +using FoundationaLLM.Common.Constants; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models.Configuration; using Microsoft.Extensions.Options; @@ -12,18 +13,21 @@ namespace FoundationaLLM.Vectorization.Worker /// Creates a new instance of the worker. /// /// The used to manage the vectorization state. + /// The used to manage content sources. /// The options holding the vectorization worker settings. - /// The containing the settings for the queues. + /// The list of configuration sections required by the vectorization worker builder. /// The used to create loggers in child objects. public class Worker( IVectorizationStateService stateService, + IContentSourceManagerService contentSourceManagerService, IOptions settings, - IConfigurationSection queuesConfiguration, + IEnumerable configurationSections, ILoggerFactory loggerFactory) : BackgroundService { private readonly IVectorizationStateService _stateService = stateService; + private readonly IContentSourceManagerService _contentSourceManagerService = contentSourceManagerService; private readonly VectorizationWorkerSettings _settings = settings.Value; - private readonly IConfigurationSection _queuesConfiguration = queuesConfiguration; + private readonly IEnumerable _configurationSections = configurationSections; private readonly ILoggerFactory _loggerFactory = loggerFactory; /// @@ -32,7 +36,9 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) var vectorizationWorker = new VectorizationWorkerBuilder() .WithStateService(_stateService) .WithSettings(_settings) - .WithQueuesConfiguration(_queuesConfiguration) + .WithQueuesConfiguration(_configurationSections.Single(cs => cs.Path == AppConfigurationKeySections.FoundationaLLM_Vectorization_Queues)) + .WithStepsConfiguration(_configurationSections.Single(cs => cs.Path == AppConfigurationKeySections.FoundationaLLM_Vectorization_Steps)) + .WithContentSourceManager(_contentSourceManagerService) .WithLoggerFactory(_loggerFactory) .WithCancellationToken(stoppingToken) .Build();