Docs/Added detailed AI Image Generation Model Documentation #244

Open · wants to merge 3 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,6 +7,8 @@ yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

backend/app/models/image-generation/*

node_modules
dist
dist-ssr
20 changes: 17 additions & 3 deletions README.md
@@ -55,17 +55,15 @@ Handles file system operations and provides a secure bridge between the frontend

## Setup

#### Prerequisites

- Node.js (LTS version recommended)
- npm (comes with Node.js)
- Rust (latest stable version)
- Tauri CLI

#### Installation

1. Clone the repository to your local system:
```bash
@@ -74,6 +72,13 @@
```bash
cd PictoPy
```
2. AI Image Generation Setup:
   - [Click Here](https://drive.google.com/uc?export=download&id=1-ufcBNxai2K64NCxPw8wBLKTzRXnZ5HA) to download the AI model.
   - Extract the downloaded zip file.
   - Copy all the folders from the extracted `saved_diffusion_pipeline/` folder into the `backend/app/models/image-generation/` directory of PictoPy (a shell sketch of these steps follows).
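A minimal shell sketch of these steps; the archive name and download location are assumptions, so adjust them to wherever the zip was saved, and run the commands from the PictoPy project root:

```bash
# Assumed download location and archive name -- adjust as needed.
unzip ~/Downloads/saved_diffusion_pipeline.zip -d /tmp/pictopy-model

# Copy every folder of the extracted pipeline into the backend models directory.
mkdir -p backend/app/models/image-generation
cp -r /tmp/pictopy-model/saved_diffusion_pipeline/* backend/app/models/image-generation/
```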

### Frontend Setup


1. Navigate to the frontend directory:
@@ -265,6 +270,12 @@ You can control the number of workers by setting the `WORKERS` environment variable
</br>
- For setting up the backend, follow the instructions in the [Backend Setup Guide](./docs/backend/docker-setup.md).

## AI Model Documentation

For more detailed information on the AI models used in this project, refer to:
- [AI Image Generation](docs/AI-Models/Image-Generation/stable_deffusion.md)


## Additional Resources

- [Tauri Documentation](https://tauri.app/v1/guides/)
@@ -274,3 +285,6 @@ You can control the number of workers by setting the `WORKERS` environment variable
## Troubleshooting

If you encounter any issues, please check the respective documentation for Tauri, React, and FastAPI. For persistent problems, feel free to open an issue in the project repository.



53 changes: 51 additions & 2 deletions backend/app/routes/images.py
@@ -1,9 +1,19 @@
import os
import shutil
import asyncio
import base64
from io import BytesIO

import torch
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from diffusers import DiffusionPipeline, LCMScheduler

from app.config.settings import IMAGES_PATH
@@ -19,6 +29,7 @@
extract_metadata,
)


router = APIRouter()


@@ -32,6 +43,44 @@ async def run_get_classes(img_path):
detect_faces(img_path)



# Load the saved diffusion pipeline once at import time so every request reuses it.
model_path = os.path.abspath("./app/models/image-generation")
pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)  # enable few-step LCM sampling
pipe.to("cuda")


class GenerateImageRequest(BaseModel):
    prompt: str

@router.post("/generate-image")
async def generate_image(prompt: str = Query(..., description="Prompt for image generation")):
    try:
        # Generate the image from the text prompt
        image = pipe(prompt, num_inference_steps=4, guidance_scale=10.0).images[0]

        # Convert the image to a Base64-encoded PNG string
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        buffer.seek(0)
        image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Clear GPU memory to prevent out-of-memory errors in subsequent requests
        del image
        torch.cuda.empty_cache()

        # Return the prompt together with the encoded image
        return JSONResponse(content={
            "prompt": prompt,
            "image": image_base64,
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
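# Example request (illustrative only -- the exact URL depends on how this router
# is mounted in main.py and on the backend port, e.g. an "/images" prefix on
# http://localhost:8000):
#
#   curl -X POST "http://localhost:8000/images/generate-image?prompt=a%20watercolor%20lighthouse"
#
# The JSON response echoes the prompt and carries the generated PNG as a
# Base64 string under the "image" key.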

@router.get("/all-images")
def get_images():
    try:
2 changes: 1 addition & 1 deletion backend/main.py
@@ -39,7 +39,7 @@ async def lifespan(app: FastAPI):
# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:1420"],  # Restrict CORS to the local Tauri frontend (previously "*")
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
82 changes: 82 additions & 0 deletions docs/AI-Models/Image-Generation/stable_deffusion.md
@@ -0,0 +1,82 @@
# Stable Diffusion with Latent Consistency Model (LCM) SSD-1B

## Overview

This document provides an overview of the `latent-consistency/lcm-ssd-1b` model, how it is used for efficient image generation in PictoPy, and practical notes from working with it.

### What is Latent Consistency Model (LCM)?

Latent Consistency Model (LCM) is a novel architecture that improves the efficiency of diffusion-based models. The LCM SSD-1B is a distilled version of the Segmind Stable Diffusion XL (SDXL) model, designed to reduce the inference steps required for generating high-resolution images. The model provides the flexibility to generate high-quality images in just 2 to 8 steps, significantly speeding up the process compared to traditional diffusion models.

The `latent-consistency/lcm-ssd-1b` model is a lighter version of SDXL, with a 60% speed improvement while maintaining the quality of the generated images. This model can be used for both text-to-image and image-to-image generation tasks.

## Features

- **Fast Inference**: The `lcm-ssd-1b` model can generate high-resolution images with fewer inference steps (2-8 steps).
- **Distilled for Efficiency**: A 50% smaller version of the SDXL model that offers a 60% speedup.
- **High-Quality Image Generation**: Capable of producing detailed and realistic images based on text prompts.
- **Versatile Use**: Works for a variety of tasks, including text-to-image, inpainting, image-to-image, and ControlNet.

## Installation

To use the model, ensure that the following dependencies are installed:

```bash
pip install --upgrade pip
pip install --upgrade diffusers transformers accelerate peft
```
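As a reference, here is a minimal generation script. It is a sketch rather than the project's exact code: it assumes the downloaded pipeline has been copied into `backend/app/models/image-generation/` as described in the README, that a CUDA-capable GPU is available, and it mirrors the step count and guidance scale used by the PictoPy backend route.

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

# Assumed location of the extracted pipeline folders (see the README setup).
model_path = "backend/app/models/image-generation"

pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)  # enable few-step LCM sampling
pipe.to("cuda")

# LCM-distilled models only need a handful of inference steps.
image = pipe(
    "a watercolor painting of a lighthouse at dusk",
    num_inference_steps=4,
    guidance_scale=10.0,
).images[0]
image.save("lighthouse.png")
```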

## Architecture

### Overview

The **Latent Consistency Model (LCM)**, specifically the **SSD-1B** variant, is based on the principles of generative diffusion models, particularly the *Stable Diffusion XL* (SDXL). It focuses on accelerating inference time while maintaining high-quality text-to-image generation capabilities. The architecture is built on a **UNet-based network** that conditions on text prompts and generates images with the help of a scheduler and guidance mechanism.

### Key Components

1. **UNet2DConditionModel**: The UNet model serves as the core neural network used for conditioning and generating images. It has been adapted from the original Stable Diffusion model, incorporating features like latent consistency to enhance the generation process.

2. **DiffusionPipeline**: This component manages the flow of data and model processing. It loads both the base model and the scheduler, facilitating inference and image generation.

3. **LCMScheduler**: The scheduler, crucial for the generation process, enables the model to perform fewer inference steps, thus reducing the time required to generate high-quality images.

4. **Text-to-Image Conditioning**: The model uses **text prompts** that guide the generation process. The input prompt influences the generated image, enabling applications like **image synthesis, inpainting,** and **style transfer**.

5. **Knowledge Distillation**: The SSD-1B is distilled from the larger SDXL model using a knowledge distillation strategy. This process transfers the learning from the larger model to a smaller model, offering speed improvements without a significant loss in quality.

6. **Guidance Mechanism**: A guidance scale is used to direct the model’s output toward the intended result, enhancing the relevance of the generated image to the text prompt.

### Workflow

1. **Prompt Input**: The user provides a textual prompt that describes the desired image.
2. **Latent Space Transformation**: The text prompt is processed, and the network uses a latent representation to start the image generation.
3. **Scheduler Interaction**: The LCMScheduler controls the number of inference steps, allowing the model to generate high-quality images in fewer steps (see the sketch after this list).
4. **Generation Output**: The model generates the image based on the prompt, utilizing guidance scales for refinement.
5. **Post-processing**: The generated image can be further processed or refined using image enhancement techniques.
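To make step 3 concrete, the sketch below reuses the `pipe` object from the Installation example and generates the same (illustrative) prompt at several step counts; the step count is the knob the LCMScheduler exposes.

```python
# Fewer steps is faster; more steps usually recovers finer detail.
for steps in (2, 4, 8):
    result = pipe(
        "an isometric pixel-art castle on a floating island",
        num_inference_steps=steps,
        guidance_scale=10.0,
    ).images[0]
    result.save(f"castle_{steps}_steps.png")
```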

### Architecture Diagram

```plaintext
+-----------------------------+
|         Text Prompt         |
+-----------------------------+
              |
              v
+-----------------------------+
| Text-to-Image Conditioning  |
+-----------------------------+
              |
              v
+-----------------------------+
| UNet2DConditionModel (UNet) |
+-----------------------------+
              |
              v
+-----------------------------+
|        LCMScheduler         |
+-----------------------------+
              |
              v
+-----------------------------+
|   Image Generation Output   |
+-----------------------------+
```