Skip to content

Commit

Permalink
Crawler for AWS controls for #18.
Browse files Browse the repository at this point in the history
  • Loading branch information
xee5ch committed Oct 14, 2023
1 parent f4807ba commit 11a0b62
Show file tree
Hide file tree
Showing 5 changed files with 348 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/com/amazon/aws/securityhub/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# An example AWS Security Hub standard controls documentation page.
example.html
Empty file.
161 changes: 161 additions & 0 deletions src/com/amazon/aws/securityhub/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions src/com/amazon/aws/securityhub/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[tool.poetry]
name = "securityhub"
version = "0.1.0"
description = "Scripting to convert AWS Security Hub controls for AWS services into OSCAL format."
authors = ["Al S <[email protected]>"]
license = "GPL-3.0"

[tool.poetry.dependencies]
python = "^3.10"
Jinja2 = "3.1.2"
requests = "2.31.0"
parsel = "1.8.1"

[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
167 changes: 167 additions & 0 deletions src/com/amazon/aws/securityhub/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
import abc
import argparse
import datetime
from jinja2 import Environment, FileSystemLoader
import logging
from os.path import dirname
from pathlib import Path
from parsel import Selector
import re
import requests
from typing import Dict, List
import uuid
from urllib.parse import urljoin

class Renderer(metaclass=abc.ABCMeta):
    """Structural interface for objects that expose a ``render`` callable.

    Any class with a callable ``render`` attribute is treated as a virtual
    subclass, so ``issubclass`` and ``isinstance`` checks against ``Renderer``
    succeed without explicit inheritance.
    """

    @classmethod
    def __subclasshook__(cls, subclass):
        """Return True when ``subclass`` has a callable ``render`` attribute."""
        # This hook is only consulted by ABCMeta's subclass check; on a plain
        # class it is silently ignored, which is why the metaclass is required.
        return (hasattr(subclass, 'render')
                and callable(subclass.render))

class JinjaTemplateRender:
    """Render a single Jinja2 template loaded from the filesystem.

    The template is looked up by name under ``searchpath`` when the object is
    constructed. Two helpers are exposed to the template as globals:
    ``strftime`` (``datetime.datetime.strftime``) and ``uuid4``
    (``uuid.uuid4``).

    If the template cannot be loaded the error is logged and ``self.template``
    is set to ``None``; the object must then be re-created before rendering.
    """

    # NOTE(review): the default template name mentions "ssdf" although this
    # module handles Security Hub controls -- confirm it is the intended file.
    def __init__(self, searchpath=dirname(__file__), template='ssdf_catalog.json.j2'):
        """Create the Jinja2 environment and try to load ``template``.

        Args:
            searchpath: Directory searched for templates. Defaults to the
                directory containing this module.
            template: File name of the template inside ``searchpath``.
        """
        self.loader = FileSystemLoader(searchpath)
        # NOTE(review): autoescape=True turns on Jinja2's HTML/XML escaping.
        # The default template name suggests JSON output, where HTML entities
        # in values are probably unwanted -- confirm before relying on it.
        self.environment = Environment(loader=self.loader, autoescape=True)
        try:
            self.template = self.environment.get_template(template)
            self.template.globals.update({
                'strftime': datetime.datetime.strftime,
                'uuid4': uuid.uuid4
            })
        except Exception:
            # logging.exception already appends the active exception and its
            # traceback, so one call is enough.
            logging.exception("Template at '%s' failed to load, must reinit!", template)
            self.template = None

    def render(self, *args, **kwargs):
        """Render the loaded template, forwarding all arguments to Jinja2.

        Raises:
            AttributeError: If the template failed to load during
                construction (``self.template`` is ``None``).
        """
        return self.template.render(*args, **kwargs)

class Transformer(metaclass=abc.ABCMeta):
    """Structural interface for load / transform / save pipelines.

    A class counts as a ``Transformer`` when it exposes callable ``load``,
    ``transform`` and ``save`` attributes; no explicit inheritance is needed.
    """

    @classmethod
    def __subclasshook__(cls, subclass):
        """Return True when ``subclass`` provides every required callable."""
        required_methods = ('load', 'transform', 'save')
        # A missing attribute falls back to None, which is not callable, so
        # "absent" and "present but not callable" are both rejected.
        return all(
            callable(getattr(subclass, method_name, None))
            for method_name in required_methods
        )

class AWSSecurityHubControlTransformer:
    """Build a dictionary of controls from scraped page elements.

    ``raw_data`` is walked in order. Every ``h2`` element starts a new
    control, keyed by the heading's ``id`` attribute and holding the label and
    title parsed from the heading text. ``p``, ``div`` and ``h3`` elements are
    handed to their ``process_*`` hook together with the id of the most
    recently seen control; those hooks do not record anything yet.
    """

    # Heading text has the form "[LABEL] Title": the label is three or more
    # characters drawn from upper-case letters, digits and dots.
    _HEADING_PATTERN = re.compile(r"^\[([.A-Z0-9]{3,})\] (.*)$")

    def __init__(self, raw_data: "List[Selector]"):
        """Store the elements to transform; a falsy value becomes ``[]``."""
        self.raw_data = raw_data if raw_data else []
        self.controls: Dict[str, Dict] = {}

    def transform(self):
        """Process every element and return the collected controls.

        An element that fails to process is logged and skipped, so one
        malformed element does not abort the run.

        Returns:
            Mapping of control id to ``{'label': ..., 'title': ...}``.
        """
        current_control_id: str = ''
        for d in self.raw_data:
            try:
                tag = d.root.tag
                if tag == 'h2':
                    control_id, control_label, control_title = self.process_h2(d)
                    self.controls[control_id] = {
                        'label': control_label,
                        'title': control_title
                    }
                    # Elements that follow belong to this control until the
                    # next heading is seen.
                    current_control_id = control_id
                elif tag == 'p':
                    self.process_p(d, current_control_id)
                elif tag == 'div':
                    self.process_div(d, current_control_id)
                elif tag == 'h3':
                    self.process_h3(d, current_control_id)
            except Exception as err:
                logging.exception(err)
        return self.controls

    def process_p(self, data: "Selector", control_id: str):
        """Print a paragraph element; nothing is stored for it yet."""
        tag = 'p'
        print(f"{tag}: {data.extract()}")

    def process_div(self, data: "Selector", control_id: str):
        """Print a ``div`` element and its ``class`` attribute.

        A ``div`` without a ``class`` attribute is reported with ``None``
        instead of raising ``IndexError``.
        """
        tag = 'div'
        print(f"{tag}: {data.extract()}")
        class_values = data.xpath("./@class").extract()
        div_class = class_values[0] if class_values else None
        print(f"{tag}: {div_class}")
        # Placeholders for the class-specific handling that is still to come.
        if div_class == 'itemizedList':
            pass
        if div_class == 'other':
            pass

    def process_h3(self, data: "Selector", control_id: str):
        """Hook for sub-headings; currently does nothing."""
        return

    def process_h2(self, data: "Selector") -> tuple[str, str, str]:
        """Extract the id, label and title from a control heading.

        Returns:
            ``(control_id, control_label, control_title)`` where the id comes
            from the element's ``id`` attribute and the label/title from its
            "[LABEL] Title" text.

        Raises:
            ValueError: If the heading has no ``id`` attribute, no text, or
                text that does not match the "[LABEL] Title" form.
        """
        id_values = data.xpath("./@id").extract()
        if not id_values:
            raise ValueError("control heading has no id attribute")
        text_values = data.xpath("./text()").extract()
        if not text_values:
            raise ValueError(f"control heading '{id_values[0]}' has no text")
        matches = self._HEADING_PATTERN.match(text_values[0])
        if matches is None:
            raise ValueError(
                f"control heading '{id_values[0]}' is not of the form "
                f"'[LABEL] Title': {text_values[0]!r}"
            )
        control_label, control_title = matches.groups()
        return id_values[0], control_label, control_title

class ControlIndexSpider:
    """Collect the links found on a controls index page.

    The page at ``url`` is downloaded and every link matched by
    ``//div[@class='highlights']/ul/li/a/@href`` is resolved against the final
    response URL.
    """

    def __init__(self, url='', timeout=30):
        """Remember which page to crawl.

        Args:
            url: Address of the index page. An empty value makes ``crawl``
                return an empty list without touching the network.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.
        """
        self.url = url
        self.timeout = timeout
        self.raw_html = ''
        self.target_urls = []

    def crawl(self):
        """Download the index page and return the absolute target URLs.

        Returns:
            List of absolute URLs. The list is empty when no URL is
            configured or when the download or parsing fails; failures are
            logged rather than raised.
        """
        if not self.url:
            logging.error("No index URL configured, nothing to crawl.")
            return []
        try:
            response = requests.get(self.url, timeout=self.timeout)
            # Fail on 4xx/5xx instead of parsing an error page for links.
            response.raise_for_status()
            self.raw_html = response.content.decode('utf-8')
            selector = Selector(text=self.raw_html)
            links = selector.xpath("//div[@class='highlights']/ul/li/a/@href")
            # Resolve against response.url so relative links stay correct
            # even after a redirect.
            self.target_urls = [urljoin(response.url, link.extract()) for link in links]
            return self.target_urls
        except Exception as err:
            logging.exception(err)
            return []

class ControlDetailSpider:
    """Extract the content elements of a control detail page.

    NOTE(review): fetching is currently stubbed out -- ``crawl`` reads the
    local file ``example.html`` for every URL instead of downloading it.
    """

    # Children of the main column that are kept: any div whose class is not
    # the page-header container, every h2 and h3, and every p that has a
    # preceding sibling div other than the page-header container.
    _CONTENT_XPATH = """
        //div[@id='main-col-body']/*
        [
        self::div[@class != 'awsdocs-page-header-container'] or
        self::h2 or self::h3 or
        self::p/preceding-sibling::div[not(@class ='awsdocs-page-header-container')]
        ]
        """

    def __init__(self, urls: List[str]):
        """Store the pages to crawl; a falsy value becomes ``[]``."""
        self.urls = urls if urls else []
        self.raw_data: List[Selector] = []
        self.raw_html = ''

    def crawl(self):
        """Parse the page content and return the selected elements.

        Returns:
            The elements selected from the first page that parses. An empty
            list is returned when there are no URLs or every attempt fails;
            failures are logged rather than raised.
        """
        for url in self.urls:
            try:
                # TODO: download ``url`` with requests and decode the body as
                # UTF-8 once the local example file is no longer needed.
                # The explicit encoding keeps the result independent of the
                # platform's default text encoding.
                with open('example.html', encoding='utf-8') as fd:
                    self.raw_html = fd.read()
                selector = Selector(text=self.raw_html)
                self.raw_data = selector.xpath(self._CONTENT_XPATH)
                # NOTE(review): returning here means later URLs are never
                # visited -- confirm whether results should be accumulated
                # once real fetching is enabled.
                return self.raw_data
            except Exception as err:
                logging.exception(err)
        return self.raw_data

def main(argv=None):
    """Run the crawl-and-transform pipeline from the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        The controls dictionary produced by the transformer.
    """
    parser = argparse.ArgumentParser(description='Convert the AWS Security Hub controls website to an OSCAL JSON catalog')
    parser.add_argument(
        '-o',
        '--output-file',
        help='Path intended to save OSCAL JSON catalog file. If not provided print to standard out.',
        type=argparse.FileType('w'),
        dest='output_file',
        required=False
    )
    # Parse the arguments so that --help works and unknown options are
    # rejected instead of being silently ignored.
    args = parser.parse_args(argv)

    # TODO: discover the per-service pages with ControlIndexSpider, starting
    # from securityhub-controls-reference.html, instead of a fixed URL.
    control_spider = ControlDetailSpider(urls=['https://docs.aws.amazon.com/securityhub/latest/userguide/s3-controls.html'])
    raw_data = control_spider.crawl()
    transformer = AWSSecurityHubControlTransformer(raw_data)
    controls = transformer.transform()

    if args.output_file is not None:
        # TODO: render the controls into the catalog and write it here.
        logging.warning("Writing the catalog is not implemented yet; '%s' is left empty.", args.output_file.name)
        # argparse opened the file while parsing; release the handle.
        args.output_file.close()
    return controls


if __name__ == '__main__':
    main()

0 comments on commit 11a0b62

Please sign in to comment.