Commit c7d6d82

feat: make browser more lightweight

nherment committed Jan 21, 2025
1 parent 05cf2e5 commit c7d6d82
Showing 9 changed files with 117 additions and 216 deletions.
1 change: 0 additions & 1 deletion .github/workflows/build-and-test.yaml
@@ -37,7 +37,6 @@ jobs:
           curl -sSL https://install.python-poetry.org | python3 - --version 1.4.0
           poetry config virtualenvs.create false
           poetry install --no-root
-          poetry run python -m playwright install --with-deps firefox
           sudo apt-get install -y binutils
           pyinstaller holmes.py --add-data 'holmes/plugins/runbooks/*:holmes/plugins/runbooks' --add-data 'holmes/plugins/prompts/*:holmes/plugins/prompts' --add-data 'holmes/plugins/toolsets/*:holmes/plugins/toolsets' --hidden-import=tiktoken_ext.openai_public --hidden-import=tiktoken_ext --hiddenimport litellm.llms.tokenizers --collect-data litellm
1 change: 0 additions & 1 deletion .github/workflows/llm-evaluation.yaml
@@ -26,7 +26,6 @@ jobs:
           curl -sSL https://install.python-poetry.org | python3 - --version 1.4.0
           poetry config virtualenvs.create false
           poetry install --no-root
-          poetry run python -m playwright install --with-deps firefox
       - name: Run tests
         shell: bash
1 change: 0 additions & 1 deletion Dockerfile
@@ -75,7 +75,6 @@ ENV PYTHONPATH=$PYTHONPATH:.:/app/holmes
 WORKDIR /app
 
 COPY --from=builder /app/venv /venv
-RUN python -m playwright install firefox --with-deps
 
 # We install libexpat1 here to upgrade the package with fixes for three high-severity CVEs: CVE-2024-45491, CVE-2024-45490, CVE-2024-45492
 RUN apt-get update \
7 changes: 4 additions & 3 deletions README.md
@@ -19,7 +19,7 @@ To this 👇
 
 ### Key Features
 - **Automatic data collection:** HolmesGPT surfaces the observability data you need to investigate
-- **Secure:** *Read-only* access to your data - respects RBAC permissions 
+- **Secure:** *Read-only* access to your data - respects RBAC permissions
 - **Runbook automation and knowledge sharing:** Tell Holmes how you investigate today and it will automate it
 - **Extensible:** Add your own data sources (tools) and Holmes will use them to investigate
 - **Data Privacy:** Bring your own API key for any AI provider (OpenAI, Azure, AWS Bedrock, etc)
@@ -491,7 +491,7 @@ To use Vertex AI with Gemini models, set the following environment variables:
 
 ```bash
 export VERTEXAI_PROJECT="your-project-id"
-export VERTEXAI_LOCATION="us-central1" 
+export VERTEXAI_LOCATION="us-central1"
 export GOOGLE_APPLICATION_CREDENTIALS="path/to/your/service_account_key.json"
 ```

Expand Down Expand Up @@ -560,7 +560,8 @@ Fetching runbooks through URLs
</summary>

HolmesGPT can consult webpages containing runbooks or other relevant information.
HolmesGPT uses playwright to scrape webpages and requires playwright to be installed and working through `playwright install`.
This is done through a HTTP GET and the resulting HTML is then cleaned and parsed into markdown.
Any Javascript that is on the webpage is ignored.
</details>

<details>
Expand Down
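The two lines added to the README above describe the new scraping approach. A minimal sketch of that fetch flow, assuming the `requests`, `beautifulsoup4`, and `markdownify` packages; `fetch_runbook_as_markdown` is an illustrative name, not a Holmes API:

```python
# Minimal sketch of the fetch flow the README describes (illustrative,
# not the exact Holmes implementation): plain HTTP GET, strip non-content
# tags, convert the remaining HTML to markdown. JavaScript never executes.
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

def fetch_runbook_as_markdown(url: str, timeout: int = 60) -> str:
    response = requests.get(url, timeout=timeout)  # plain HTTP GET, no browser
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup(["script", "style", "nav", "footer"]):
        tag.decompose()                            # strip non-content elements
    return markdownify(str(soup))
```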
157 changes: 89 additions & 68 deletions holmes/plugins/toolsets/internet.py
@@ -1,76 +1,101 @@
 import re
+import os
 import logging
 
-from typing import Any
-from holmes.core.tools import Tool, ToolParameter, Toolset, ToolsetCommandPrerequisite
+from typing import Any, Optional, Tuple
+
+from requests import RequestException, Timeout
+from holmes.core.tools import Tool, ToolParameter, Toolset, ToolsetTag
 from markdownify import markdownify
-from playwright.sync_api import Error as PlaywrightError
-from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
-from playwright.sync_api import sync_playwright
 from bs4 import BeautifulSoup
 
-# TODO: change and make it holmes
-USER_AGENT_STR = (
-    "Mozilla/5.0 (X11; Linux x86_64; rv:128.0; holmesgpt;) Gecko/20100101 Firefox/128.0"
-)
-PAGE_LOAD_TIMEOUT_SECONDS = 60000
 
-
-def scrape_with_playwright(url):
+import requests
 
-    with sync_playwright() as p:
-        try:
-            browser = p.firefox.launch()
-        except Exception as e:
-            logging.error(str(e))
-            return None, None
+# TODO: change and make it holmes
+INTERNET_TOOLSET_USER_AGENT = os.environ.get("INTERNET_TOOLSET_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64; rv:128.0; holmesgpt;) Gecko/20100101 Firefox/128.0")
+INTERNET_TOOLSET_TIMEOUT_SECONDS = int(os.environ.get("INTERNET_TOOLSET_TIMEOUT_SECONDS", "60"))
+
+SELECTORS_TO_REMOVE = [
+    'script', 'style', 'meta', 'link', 'noscript',
+    'header', 'footer', 'nav',
+    'iframe', 'svg', 'img',
+    'button',
+    'menu', 'sidebar', 'aside',
+    '.header',
+    '.footer',
+    '.navigation',
+    '.nav',
+    '.menu',
+    '.sidebar',
+    '.ad',
+    '.advertisement',
+    '.social',
+    '.popup',
+    '.modal',
+    '.banner',
+    '.cookie-notice',
+    '.social-share',
+    '.related-articles',
+    '.recommended',
+    '#header',
+    '#footer',
+    '#navigation',
+    '#nav',
+    '#menu',
+    '#sidebar',
+    '#ad',
+    '#advertisement',
+    '#social',
+    '#popup',
+    '#modal',
+    '#banner',
+    '#cookie-notice',
+    '#social-share',
+    '#related-articles',
+    '#recommended'
+]
+
+def scrape(url) -> Tuple[Optional[str], Optional[str]]:
+    response = None
+    content = None
+    mime_type = None
+    try:
+        response = requests.get(
+            url,
+            headers={
+                'User-Agent': INTERNET_TOOLSET_USER_AGENT
+            },
+            timeout=INTERNET_TOOLSET_TIMEOUT_SECONDS
+        )
+        response.raise_for_status()
+    except Timeout:
+        logging.error(
+            f"Failed to load {url}. Timeout after {INTERNET_TOOLSET_TIMEOUT_SECONDS} seconds",
+            exc_info=True
+        )
+    except RequestException as e:
+        logging.error(f"Failed to load {url}: {str(e)}", exc_info=True)
+        return None, None
 
+    if response:
+        content = response.text
         try:
-            context = browser.new_context(ignore_https_errors=False)
-            page = context.new_page()
+            content_type = response.headers['content-type']
+            if content_type:
+                mime_type = content_type.split(";")[0]
+        except Exception:
+            logging.info(f"Failed to parse content type from headers {response.headers}")
 
-            page.set_extra_http_headers({"User-Agent": USER_AGENT_STR})
+    return (content, mime_type)
 
-            response = None
-            try:
-                response = page.goto(
-                    url, wait_until="networkidle", timeout=PAGE_LOAD_TIMEOUT_SECONDS
-                )
-                context.cookies()  # Reading cookies lets pages that check whether cookies are enabled still load
-            except PlaywrightTimeoutError:
-                logging.error(
-                    f"Failed to load {url}. Timeout after {PAGE_LOAD_TIMEOUT_SECONDS} seconds"
-                )
-            except PlaywrightError as e:
-                logging.error(f"Failed to load {url}: {str(e)}")
-                return None, None
-
-            try:
-                content = page.content()
-                mime_type = None
-                if response:
-                    content_type = response.header_value("content-type")
-                    if content_type:
-                        mime_type = content_type.split(";")[0]
-            except PlaywrightError as e:
-                logging.error(f"Error retrieving page content: {str(e)}")
-                content = None
-                mime_type = None
-            finally:
-                browser.close()
-
-    return content, mime_type
 
 
-def cleanup(soup):
+def cleanup(soup: BeautifulSoup):
     """Remove all elements that are irrelevant to the textual representation of a web page.
     This includes images, extra data, even links, as there is no intention to navigate from that page.
     """
-    for svg in soup.find_all("svg"):
-        svg.decompose()
-
-    if soup.img:
-        soup.img.decompose()
+    for selector in SELECTORS_TO_REMOVE:
+        for element in soup.select(selector):
+            element.decompose()
 
     for tag in soup.find_all("a"):
         tag.unwrap()
@@ -83,7 +108,8 @@ def cleanup(soup):
     return soup
 
 
-def html_to_markdown(page_source):
+
+def html_to_markdown(page_source: str):
 
     soup = BeautifulSoup(page_source, "html.parser")
     soup = cleanup(soup)
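A quick illustration of the new cleanup path introduced above: the standalone sketch below (with a trimmed-down selector list standing in for `SELECTORS_TO_REMOVE`; it is not code from this commit) applies the same select-and-decompose logic, then converts what remains to markdown:

```python
# Standalone illustration of the selector-based cleanup plus markdownify.
from bs4 import BeautifulSoup
from markdownify import markdownify

SELECTORS = ["script", "nav", ".ad", "#footer"]  # stand-in for SELECTORS_TO_REMOVE

html = """
<html><body>
  <nav>Site menu</nav>
  <div class="ad">Buy now!</div>
  <h1>Restart the pod</h1>
  <p>Run <a href="/docs">kubectl delete pod my-pod</a> and wait for the replacement.</p>
  <div id="footer">(c) 2025</div>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")
for selector in SELECTORS:
    for element in soup.select(selector):
        element.decompose()      # remove the element and everything inside it
for tag in soup.find_all("a"):
    tag.unwrap()                 # keep the link text, drop the hyperlink

print(markdownify(str(soup)))    # only the heading and paragraph text survive
```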
@@ -122,7 +148,7 @@ class FetchWebpage(Tool):
     def __init__(self):
         super().__init__(
             name="fetch_webpage",
-            description="Fetch a webpage with w3m. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)",
+            description="Fetch a webpage. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)",
             parameters={
                 "url": ToolParameter(
                     description="The URL to fetch",
@@ -135,7 +161,7 @@ def __init__(self):
     def invoke(self, params: Any) -> str:
 
         url: str = params["url"]
-        content, mime_type = scrape_with_playwright(url)
+        content, mime_type = scrape(url)
 
         if not content:
             logging.error(f"Failed to retrieve content from {url}")
@@ -160,12 +186,7 @@ def __init__(self):
             name="internet",
             description="Fetch webpages",
             icon_url="https://platform.robusta.dev/demos/internet-access.svg",
-            prerequisites=[
-                # Taking a successful screenshot ensures playwright is correctly installed
-                ToolsetCommandPrerequisite(
-                    command="python -m playwright screenshot --browser firefox https://www.google.com playwright.png"
-                ),
-            ],
+            prerequisites=[],
             tools=[FetchWebpage()],
-            tags=["core",]
+            tags=[ToolsetTag.CORE,]
         )
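Since the rewrite trades Playwright for plain HTTP, behaviour is now tunable entirely through environment variables. A hypothetical driver for the new code path (the function names come from the diff above; the URL and the mime-type branching are illustrative, since the full `invoke` body is truncated here):

```python
# Hypothetical usage of the new requests-based path shown in this diff.
# The two INTERNET_TOOLSET_* variables may be set before import to
# override the defaults read at module load time.
import os

os.environ.setdefault("INTERNET_TOOLSET_TIMEOUT_SECONDS", "30")

from holmes.plugins.toolsets.internet import scrape, html_to_markdown

content, mime_type = scrape("https://example.com/runbooks/pod-crashloop.html")
if content and mime_type == "text/html":
    print(html_to_markdown(content))   # markdown suitable for an LLM prompt
elif content:
    print(content)                     # non-HTML content used as-is
```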