Commit c7d6d82

feat: make browser more lightweight

nherment committed Jan 21, 2025
1 parent 05cf2e5 commit c7d6d82
Showing 9 changed files with 117 additions and 216 deletions.
1 change: 0 additions & 1 deletion .github/workflows/build-and-test.yaml
@@ -37,7 +37,6 @@ jobs:
           curl -sSL https://install.python-poetry.org | python3 - --version 1.4.0
           poetry config virtualenvs.create false
           poetry install --no-root
-          poetry run python -m playwright install --with-deps firefox
           sudo apt-get install -y binutils
           pyinstaller holmes.py --add-data 'holmes/plugins/runbooks/*:holmes/plugins/runbooks' --add-data 'holmes/plugins/prompts/*:holmes/plugins/prompts' --add-data 'holmes/plugins/toolsets/*:holmes/plugins/toolsets' --hidden-import=tiktoken_ext.openai_public --hidden-import=tiktoken_ext --hiddenimport litellm.llms.tokenizers --collect-data litellm
1 change: 0 additions & 1 deletion .github/workflows/llm-evaluation.yaml
@@ -26,7 +26,6 @@ jobs:
           curl -sSL https://install.python-poetry.org | python3 - --version 1.4.0
           poetry config virtualenvs.create false
           poetry install --no-root
-          poetry run python -m playwright install --with-deps firefox
       - name: Run tests
         shell: bash
1 change: 0 additions & 1 deletion Dockerfile
@@ -75,7 +75,6 @@ ENV PYTHONPATH=$PYTHONPATH:.:/app/holmes
 WORKDIR /app
 
 COPY --from=builder /app/venv /venv
-RUN python -m playwright install firefox --with-deps
 
 # We install libexpat1 here to upgrade the package with fixes for three high-severity CVEs: CVE-2024-45491, CVE-2024-45490, CVE-2024-45492
 RUN apt-get update \
7 changes: 4 additions & 3 deletions README.md
@@ -19,7 +19,7 @@ To this 👇
 
 ### Key Features
 - **Automatic data collection:** HolmesGPT surfaces the observability data you need to investigate
-- **Secure:** *Read-only* access to your data - respects RBAC permissions 
+- **Secure:** *Read-only* access to your data - respects RBAC permissions
 - **Runbook automation and knowledge sharing:** Tell Holmes how you investigate today and it will automate it
 - **Extensible:** Add your own data sources (tools) and Holmes will use them to investigate
 - **Data Privacy:** Bring your own API key for any AI provider (OpenAI, Azure, AWS Bedrock, etc)
@@ -491,7 +491,7 @@ To use Vertex AI with Gemini models, set the following environment variables:
 
 ```bash
 export VERTEXAI_PROJECT="your-project-id"
-export VERTEXAI_LOCATION="us-central1" 
+export VERTEXAI_LOCATION="us-central1"
 export GOOGLE_APPLICATION_CREDENTIALS="path/to/your/service_account_key.json"
 ```

Expand Down Expand Up @@ -560,7 +560,8 @@ Fetching runbooks through URLs
</summary>

HolmesGPT can consult webpages containing runbooks or other relevant information.
HolmesGPT uses playwright to scrape webpages and requires playwright to be installed and working through `playwright install`.
This is done through a HTTP GET and the resulting HTML is then cleaned and parsed into markdown.
Any Javascript that is on the webpage is ignored.
</details>

<details>
Expand Down
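The two lines added to the README above describe the new scraping approach. A minimal sketch of that fetch flow, assuming the `requests`, `beautifulsoup4`, and `markdownify` packages; `fetch_runbook_as_markdown` is an illustrative name, not a Holmes API:

```python
# Minimal sketch of the fetch flow the README describes (illustrative,
# not the exact Holmes implementation): plain HTTP GET, strip non-content
# tags, convert the remaining HTML to markdown. JavaScript never executes.
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

def fetch_runbook_as_markdown(url: str, timeout: int = 60) -> str:
    response = requests.get(url, timeout=timeout)  # plain HTTP GET, no browser
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup(["script", "style", "nav", "footer"]):
        tag.decompose()                            # strip non-content elements
    return markdownify(str(soup))
```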
157 changes: 89 additions & 68 deletions holmes/plugins/toolsets/internet.py
@@ -1,76 +1,101 @@
 import re
+import os
 import logging
 
-from typing import Any
-from holmes.core.tools import Tool, ToolParameter, Toolset, ToolsetCommandPrerequisite
+from typing import Any, Optional, Tuple
+
+from requests import RequestException, Timeout
+from holmes.core.tools import Tool, ToolParameter, Toolset, ToolsetTag
 from markdownify import markdownify
-from playwright.sync_api import Error as PlaywrightError
-from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
-from playwright.sync_api import sync_playwright
 from bs4 import BeautifulSoup
 
-# TODO: change and make it holmes
-USER_AGENT_STR = (
-    "Mozilla/5.0 (X11; Linux x86_64; rv:128.0; holmesgpt;) Gecko/20100101 Firefox/128.0"
-)
-PAGE_LOAD_TIMEOUT_SECONDS = 60000
 
-
-def scrape_with_playwright(url):
+import requests
 
-    with sync_playwright() as p:
-        try:
-            browser = p.firefox.launch()
-        except Exception as e:
-            logging.error(str(e))
-            return None, None
+# TODO: change and make it holmes
+INTERNET_TOOLSET_USER_AGENT = os.environ.get("INTERNET_TOOLSET_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64; rv:128.0; holmesgpt;) Gecko/20100101 Firefox/128.0")
+INTERNET_TOOLSET_TIMEOUT_SECONDS = int(os.environ.get("INTERNET_TOOLSET_TIMEOUT_SECONDS", "60"))
+
+SELECTORS_TO_REMOVE = [
+    'script', 'style', 'meta', 'link', 'noscript',
+    'header', 'footer', 'nav',
+    'iframe', 'svg', 'img',
+    'button',
+    'menu', 'sidebar', 'aside',
+    '.header',
+    '.footer',
+    '.navigation',
+    '.nav',
+    '.menu',
+    '.sidebar',
+    '.ad',
+    '.advertisement',
+    '.social',
+    '.popup',
+    '.modal',
+    '.banner',
+    '.cookie-notice',
+    '.social-share',
+    '.related-articles',
+    '.recommended',
+    '#header',
+    '#footer',
+    '#navigation',
+    '#nav',
+    '#menu',
+    '#sidebar',
+    '#ad',
+    '#advertisement',
+    '#social',
+    '#popup',
+    '#modal',
+    '#banner',
+    '#cookie-notice',
+    '#social-share',
+    '#related-articles',
+    '#recommended'
+]
+
+def scrape(url) -> Tuple[Optional[str], Optional[str]]:
+    response = None
+    content = None
+    mime_type = None
+    try:
+        response = requests.get(
+            url,
+            headers={
+                'User-Agent': INTERNET_TOOLSET_USER_AGENT
+            },
+            timeout=INTERNET_TOOLSET_TIMEOUT_SECONDS
+        )
+        response.raise_for_status()
+    except Timeout:
+        logging.error(
+            f"Failed to load {url}. Timeout after {INTERNET_TOOLSET_TIMEOUT_SECONDS} seconds",
+            exc_info=True
+        )
+    except RequestException as e:
+        logging.error(f"Failed to load {url}: {str(e)}", exc_info=True)
+        return None, None
 
+    if response:
+        content = response.text
         try:
-            context = browser.new_context(ignore_https_errors=False)
-            page = context.new_page()
+            content_type = response.headers['content-type']
+            if content_type:
+                mime_type = content_type.split(";")[0]
+        except Exception:
+            logging.info(f"Failed to parse content type from headers {response.headers}")
 
-            page.set_extra_http_headers({"User-Agent": USER_AGENT_STR})
+    return (content, mime_type)
 
-            response = None
-            try:
-                response = page.goto(
-                    url, wait_until="networkidle", timeout=PAGE_LOAD_TIMEOUT_SECONDS
-                )
-                context.cookies()  # Reading cookies lets pages that check whether cookies are enabled still load
-            except PlaywrightTimeoutError:
-                logging.error(
-                    f"Failed to load {url}. Timeout after {PAGE_LOAD_TIMEOUT_SECONDS} seconds"
-                )
-            except PlaywrightError as e:
-                logging.error(f"Failed to load {url}: {str(e)}")
-                return None, None
-
-            try:
-                content = page.content()
-                mime_type = None
-                if response:
-                    content_type = response.header_value("content-type")
-                    if content_type:
-                        mime_type = content_type.split(";")[0]
-            except PlaywrightError as e:
-                logging.error(f"Error retrieving page content: {str(e)}")
-                content = None
-                mime_type = None
-            finally:
-                browser.close()
-
-    return content, mime_type
 
 
-def cleanup(soup):
+def cleanup(soup: BeautifulSoup):
     """Remove all elements that are irrelevant to the textual representation of a web page.
     This includes images, extra data, even links, as there is no intention to navigate from that page.
     """
-    for svg in soup.find_all("svg"):
-        svg.decompose()
-
-    if soup.img:
-        soup.img.decompose()
+    for selector in SELECTORS_TO_REMOVE:
+        for element in soup.select(selector):
+            element.decompose()
 
     for tag in soup.find_all("a"):
         tag.unwrap()
@@ -83,7 +108,8 @@ def cleanup(soup):
     return soup
 
 
-def html_to_markdown(page_source):
+
+def html_to_markdown(page_source: str):
 
     soup = BeautifulSoup(page_source, "html.parser")
     soup = cleanup(soup)
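A quick illustration of the new cleanup path introduced above: the standalone sketch below (with a trimmed-down selector list standing in for `SELECTORS_TO_REMOVE`; it is not code from this commit) applies the same select-and-decompose logic, then converts what remains to markdown:

```python
# Standalone illustration of the selector-based cleanup plus markdownify.
from bs4 import BeautifulSoup
from markdownify import markdownify

SELECTORS = ["script", "nav", ".ad", "#footer"]  # stand-in for SELECTORS_TO_REMOVE

html = """
<html><body>
  <nav>Site menu</nav>
  <div class="ad">Buy now!</div>
  <h1>Restart the pod</h1>
  <p>Run <a href="/docs">kubectl delete pod my-pod</a> and wait for the replacement.</p>
  <div id="footer">(c) 2025</div>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")
for selector in SELECTORS:
    for element in soup.select(selector):
        element.decompose()      # remove the element and everything inside it
for tag in soup.find_all("a"):
    tag.unwrap()                 # keep the link text, drop the hyperlink

print(markdownify(str(soup)))    # only the heading and paragraph text survive
```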
@@ -122,7 +148,7 @@ class FetchWebpage(Tool):
     def __init__(self):
         super().__init__(
             name="fetch_webpage",
-            description="Fetch a webpage with w3m. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)",
+            description="Fetch a webpage. Use this to fetch runbooks if they are present before starting your investigation (if no other tool like confluence is more appropriate)",
             parameters={
                 "url": ToolParameter(
                     description="The URL to fetch",
@@ -135,7 +161,7 @@ def __init__(self):
     def invoke(self, params: Any) -> str:
 
         url: str = params["url"]
-        content, mime_type = scrape_with_playwright(url)
+        content, mime_type = scrape(url)
 
         if not content:
             logging.error(f"Failed to retrieve content from {url}")
@@ -160,12 +186,7 @@ def __init__(self):
             name="internet",
             description="Fetch webpages",
             icon_url="https://platform.robusta.dev/demos/internet-access.svg",
-            prerequisites=[
-                # Taking a successful screenshot ensures playwright is correctly installed
-                ToolsetCommandPrerequisite(
-                    command="python -m playwright screenshot --browser firefox https://www.google.com playwright.png"
-                ),
-            ],
+            prerequisites=[],
             tools=[FetchWebpage()],
-            tags=["core",]
+            tags=[ToolsetTag.CORE,]
         )
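Since the rewrite trades Playwright for plain HTTP, behaviour is now tunable entirely through environment variables. A hypothetical driver for the new code path (the function names come from the diff above; the URL and the mime-type branching are illustrative, since the full `invoke` body is truncated here):

```python
# Hypothetical usage of the new requests-based path shown in this diff.
# The two INTERNET_TOOLSET_* variables may be set before import to
# override the defaults read at module load time.
import os

os.environ.setdefault("INTERNET_TOOLSET_TIMEOUT_SECONDS", "30")

from holmes.plugins.toolsets.internet import scrape, html_to_markdown

content, mime_type = scrape("https://example.com/runbooks/pod-crashloop.html")
if content and mime_type == "text/html":
    print(html_to_markdown(content))   # markdown suitable for an LLM prompt
elif content:
    print(content)                     # non-HTML content used as-is
```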