diff --git a/src/com/amazon/aws/securityhub/.gitignore b/src/com/amazon/aws/securityhub/.gitignore new file mode 100644 index 0000000..b582d60 --- /dev/null +++ b/src/com/amazon/aws/securityhub/.gitignore @@ -0,0 +1,2 @@ +# An example AWS SecurityHub standard controls documentation pages. +example.html diff --git a/src/com/amazon/aws/securityhub/__init__.py b/src/com/amazon/aws/securityhub/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/com/amazon/aws/securityhub/poetry.lock b/src/com/amazon/aws/securityhub/poetry.lock new file mode 100644 index 0000000..cb753b9 --- /dev/null +++ b/src/com/amazon/aws/securityhub/poetry.lock @@ -0,0 +1,161 @@ +[[package]] +name = "certifi" +version = "2023.7.22" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "charset-normalizer" +version = "3.3.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.7.0" + +[[package]] +name = "cssselect" +version = "1.2.0" +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." 
+category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "lxml" +version = "4.9.3" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.35)"] + +[[package]] +name = "markupsafe" +version = "2.1.3" +description = "Safely add untrusted strings to HTML/XML markup." +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "packaging" +version = "23.2" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "parsel" +version = "1.8.1" +description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +cssselect = ">=0.9" +jmespath = "*" +lxml = "*" +packaging = "*" +w3lib = ">=1.19.0" + +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "urllib3" +version = "2.0.6" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "w3lib" +version = "2.1.2" +description = "Library of web-related functions" +category = "main" +optional = false +python-versions = ">=3.7" + +[metadata] +lock-version = "1.1" +python-versions = "^3.10" +content-hash = "4b891f85e60dfa2d18004d651a105d326d815119bb15aec17bbf659eb0647059" + +[metadata.files] +certifi = [] +charset-normalizer = [] +cssselect = [] +idna = [] +jinja2 = [ + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, +] +jmespath = [] +lxml = [] +markupsafe = [] +packaging = [] +parsel = [] +requests = [] +urllib3 = [] +w3lib = [] diff --git a/src/com/amazon/aws/securityhub/pyproject.toml b/src/com/amazon/aws/securityhub/pyproject.toml new file mode 100644 index 0000000..90b6689 --- /dev/null +++ b/src/com/amazon/aws/securityhub/pyproject.toml @@ -0,0 +1,18 @@ +[tool.poetry] +name = "securityhub" +version = "0.1.0" +description = "Scripting to convert AWS Security Hub controls for AWS services into OSCAL format."
class Renderer(metaclass=abc.ABCMeta):
    """Structural interface for objects that can render output.

    Any class exposing a callable ``render`` attribute is treated as a
    virtual subclass, so ``issubclass``/``isinstance`` checks succeed
    without explicit inheritance.
    """

    @classmethod
    def __subclasshook__(cls, subclass):
        # Answer only for Renderer itself so classes deriving from it do not
        # inherit the duck-typed check.
        if cls is not Renderer:
            return NotImplemented
        if callable(getattr(subclass, 'render', None)):
            return True
        # Defer to normal inheritance/registration instead of vetoing it.
        return NotImplemented


class JinjaTemplateRender:
    """Render a single Jinja2 template loaded from the file system."""

    def __init__(self, searchpath=f"{dirname(__file__)}", template='ssdf_catalog.json.j2'):
        """Load ``template`` from ``searchpath``.

        Args:
            searchpath: Directory handed to ``FileSystemLoader``; defaults to
                the directory containing this module.
            template: Template file name to load from ``searchpath``.

        A template that cannot be loaded is logged and leaves
        ``self.template`` set to ``None``; ``render`` then raises.
        """
        self.loader = FileSystemLoader(searchpath)
        # NOTE(review): autoescape=True applies HTML escaping to rendered
        # values; confirm that is intended for a ``.json.j2`` template.
        self.environment = Environment(loader=self.loader, autoescape=True)
        try:
            self.template = self.environment.get_template(template)
            # Helpers made callable from inside the template.
            self.template.globals.update({
                'strftime': datetime.datetime.strftime,
                'uuid4': uuid.uuid4
            })
        except Exception:
            # One log record is enough: logging.exception already attaches
            # the active traceback.
            logging.exception("Template at '%s' failed to load, must reinit!", template)
            self.template = None

    def render(self, *args, **kwargs):
        """Render the loaded template, forwarding all arguments to it.

        Returns:
            Whatever the template's ``render`` returns.

        Raises:
            RuntimeError: If the template failed to load in ``__init__``.
        """
        if self.template is None:
            raise RuntimeError(
                "No template is loaded; create a new JinjaTemplateRender "
                "with a valid searchpath/template before rendering."
            )
        return self.template.render(*args, **kwargs)


class Transformer(metaclass=abc.ABCMeta):
    """Structural interface for load/transform/save pipelines.

    A class counts as a virtual subclass when it exposes callable ``load``,
    ``transform`` and ``save`` attributes.
    """

    @classmethod
    def __subclasshook__(cls, subclass):
        # Answer only for Transformer itself; see Python's ``abc`` docs.
        if cls is not Transformer:
            return NotImplemented
        required = ('load', 'transform', 'save')
        if all(callable(getattr(subclass, name, None)) for name in required):
            return True
        # Defer to normal inheritance/registration instead of vetoing it.
        return NotImplemented


class AWSSecurityHubControlTransformer:
    """Build a mapping of controls from the elements of a controls page.

    The elements are walked in order. Each ``h2`` heading opens a new control
    entry keyed by the heading's ``id`` attribute; ``p``, ``div`` and ``h3``
    elements are passed to their handlers together with the id of the most
    recently opened control.
    """

    # Heading text has the form "[S3.1] Title". Lower-case letters are
    # accepted so mixed-case ids such as "CloudTrail.1" also match.
    _HEADING_PATTERN = re.compile(r"^\[([.A-Za-z0-9]{3,})\] (.*)$")

    def __init__(self, raw_data: List["Selector"]):
        """Store the elements to transform.

        Args:
            raw_data: Selectors to process; a falsy value becomes ``[]``.
        """
        self.raw_data = raw_data if raw_data else []
        self.controls: Dict[str, Dict] = {}

    def transform(self):
        """Process every element and return the accumulated controls.

        Elements that cannot be processed are logged and skipped so that one
        malformed element does not abort the whole run.

        Returns:
            ``self.controls``: control id -> ``{'label': ..., 'title': ...}``.
        """
        handlers = {
            'p': self.process_p,
            'div': self.process_div,
            'h3': self.process_h3,
        }
        current_control_id: str = ''
        for element in self.raw_data:
            try:
                tag = element.root.tag
                if tag == 'h2':
                    control_id, control_label, control_title = self.process_h2(element)
                    self.controls[control_id] = {
                        'label': control_label,
                        'title': control_title
                    }
                    current_control_id = control_id
                elif tag in handlers:
                    handlers[tag](element, current_control_id)
            except ValueError as err:
                # Raised by process_h2 for headings that are not controls.
                logging.warning("Skipping element: %s", err)
            except Exception:
                logging.exception("Failed to process element")
        return self.controls

    def process_p(self, data: "Selector", control_id: str):
        """Handle a ``p`` element; currently it is only logged at debug level."""
        logging.debug("p: %s", data.extract())

    def process_div(self, data: "Selector", control_id: str):
        """Handle a ``div`` element; currently it is only logged at debug level.

        A ``div`` without a ``class`` attribute is ignored.
        """
        logging.debug("div: %s", data.extract())
        classes = data.xpath("./@class").extract()
        if not classes:
            return
        logging.debug("div: %s", classes[0])
        # TODO: handle the 'itemizedList' and 'other' div classes.

    def process_h3(self, data: "Selector", control_id: str):
        """Handle an ``h3`` element; placeholder that does nothing yet."""
        return

    def process_h2(self, data: "Selector") -> tuple[str, str, str]:
        """Extract the id, label and title from a control heading.

        Args:
            data: Selector for an ``h2`` whose text looks like
                ``"[S3.1] Some title"``.

        Returns:
            ``(control_id, control_label, control_title)`` where
            ``control_id`` is the element's ``id`` attribute.

        Raises:
            ValueError: If the heading has no ``id``, no text, or its text
                does not match the ``[LABEL] title`` form.
        """
        ids = data.xpath("./@id").extract()
        if not ids:
            raise ValueError("h2 heading has no id attribute")
        texts = data.xpath("./text()").extract()
        if not texts:
            raise ValueError(f"h2 heading '{ids[0]}' has no text")
        matches = self._HEADING_PATTERN.match(texts[0])
        if matches is None:
            raise ValueError(
                f"h2 heading '{ids[0]}' is not a control heading: {texts[0]!r}"
            )
        control_label, control_title = matches.groups()
        return ids[0], control_label, control_title


class ControlIndexSpider:
    """Collect the links to the per-service control pages from an index page."""

    def __init__(self, url: str = '', timeout: float = 30.0):
        """Configure the spider.

        Args:
            url: Address of the index page. Defaults to an empty string
                rather than a shared mutable list.
            timeout: Seconds to wait for the HTTP response.
        """
        self.url = url
        self.timeout = timeout
        self.raw_html = ''
        self.target_urls = []

    def crawl(self):
        """Fetch the index page and resolve the links it highlights.

        Returns:
            Absolute URLs taken from the ``highlights`` list of the page.
            When no URL is configured or the request fails, the failure is
            logged and the current ``target_urls`` (initially ``[]``) is
            returned.
        """
        if not self.url:
            logging.warning("No index URL configured; nothing to crawl")
            return self.target_urls
        try:
            # Without a timeout a stalled server would block forever.
            response = requests.get(self.url, timeout=self.timeout)
            # Do not parse an HTTP error page as if it were the index.
            response.raise_for_status()
            self.raw_html = response.content.decode('utf-8')
            selector = Selector(text=self.raw_html)
            hrefs = selector.xpath("//div[@class='highlights']/ul/li/a/@href")
            # Links may be relative; resolve against the final response URL.
            self.target_urls = [urljoin(response.url, href.extract()) for href in hrefs]
        except Exception:
            logging.exception("Failed to crawl control index at '%s'", self.url)
        return self.target_urls
class ControlDetailSpider:
    """Collect the content elements of control detail pages.

    By default the HTML is read from the local ``example.html`` fixture for
    every URL (the behaviour this class has always had). Pass
    ``html_path=None`` to download each URL instead.
    """

    # Children of the main column: non-header divs, h2/h3 headings and
    # paragraphs. NOTE(review): the last clause keeps a ``p`` only when a
    # non-header ``div`` sibling precedes it -- confirm that is intended.
    _CONTENT_XPATH = """
        //div[@id='main-col-body']/*
        [
            self::div[@class != 'awsdocs-page-header-container'] or
            self::h2 or self::h3 or
            self::p/preceding-sibling::div[not(@class ='awsdocs-page-header-container')]
        ]
        """

    def __init__(self, urls: List[str], html_path: "str | None" = 'example.html',
                 timeout: float = 30.0):
        """Configure the spider.

        Args:
            urls: Pages to crawl; a falsy value becomes ``[]``.
            html_path: Local HTML file used in place of a download, or
                ``None`` to fetch each URL over the network.
            timeout: Seconds to wait for each HTTP response when downloading.
        """
        self.urls = urls if urls else []
        self.html_path = html_path
        self.timeout = timeout
        self.raw_data: List["Selector"] = []
        self.raw_html = ''

    def _load_html(self, url: str) -> str:
        """Return the HTML for ``url`` from the fixture file or the network."""
        if self.html_path is not None:
            with open(self.html_path, encoding='utf-8') as fd:
                return fd.read()
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        return response.content.decode('utf-8')

    def crawl(self):
        """Parse every configured page and gather its content elements.

        A page that cannot be loaded or parsed is logged and skipped, so the
        remaining URLs are still processed.

        Returns:
            The selected elements of all pages, in URL order (``[]`` when
            nothing could be collected). Also stored in ``self.raw_data``.
        """
        collected: List["Selector"] = []
        for url in self.urls:
            try:
                self.raw_html = self._load_html(url)
                selector = Selector(text=self.raw_html)
                # Accumulate rather than return, so every URL is visited.
                collected.extend(selector.xpath(self._CONTENT_XPATH))
            except Exception:
                logging.exception("Failed to crawl control page '%s'", url)
        self.raw_data = collected
        return self.raw_data


def main(argv=None):
    """Command-line entry point.

    Parses the arguments, crawls the S3 controls page, transforms it, and
    writes the resulting control mapping as JSON to ``--output-file`` or to
    standard output.

    NOTE: the output is the parsed control mapping, not yet an OSCAL catalog.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    # Local imports: these modules are only needed by the CLI.
    import json
    import sys

    parser = argparse.ArgumentParser(description='Convert the AWS Security Hub controls website to an OSCAL JSON catalog')
    parser.add_argument(
        '-o',
        '--output-file',
        help='Path intended to save OSCAL JSON catalog file. If not provided print to standard out.',
        type=argparse.FileType('w'),
        dest='output_file',
        required=False
    )
    args = parser.parse_args(argv)

    # TODO: discover the per-service pages with ControlIndexSpider instead of
    # hard-coding a single one.
    control_spider = ControlDetailSpider(urls=['https://docs.aws.amazon.com/securityhub/latest/userguide/s3-controls.html'])
    raw_data = control_spider.crawl()
    transformer = AWSSecurityHubControlTransformer(raw_data)
    controls = transformer.transform()

    destination = args.output_file or sys.stdout
    try:
        json.dump(controls, destination, indent=2, ensure_ascii=False)
        destination.write('\n')
    finally:
        # argparse opened the file; close only what it opened, never stdout.
        if args.output_file is not None:
            args.output_file.close()


if __name__ == '__main__':
    main()