From 6ba9f589bbedc9ad5de4190a64116a5d85eb8579 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 24 Nov 2023 14:09:39 +0000 Subject: [PATCH 1/2] fix: request index mappings in batches to avoid elasticsearch URL length error Closes: #139 --- .../lib/downloads/download.py | 2 +- .../lib/downloads/utils.py | 21 +++++++++---------- .../lib/downloads/test_downloads_utils.py | 12 ++++++----- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/ckanext/versioned_datastore/lib/downloads/download.py b/ckanext/versioned_datastore/lib/downloads/download.py index b3b14b38..ea7e319d 100644 --- a/ckanext/versioned_datastore/lib/downloads/download.py +++ b/ckanext/versioned_datastore/lib/downloads/download.py @@ -310,7 +310,7 @@ def generate_core(self): timeout=30, ) - schemas = get_schemas(self.query, es_client) + schemas = get_schemas(self.query) for resource_id, version in resources_to_generate.items(): self.request.update_status(DownloadRequest.state_core_gen, resource_id) diff --git a/ckanext/versioned_datastore/lib/downloads/utils.py b/ckanext/versioned_datastore/lib/downloads/utils.py index c2995fad..cf56ca76 100644 --- a/ckanext/versioned_datastore/lib/downloads/utils.py +++ b/ckanext/versioned_datastore/lib/downloads/utils.py @@ -1,4 +1,3 @@ -from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, A from fastavro import parse_schema from splitgill.search import create_version_query @@ -11,24 +10,24 @@ iter_data_fields, unprefix_index, ) +from ..query.fields import get_mappings -def get_schemas(query: Query, es_client: Elasticsearch): +def get_schemas(query: Query): """ Creates an avro schema from the elasticsearch index metadata. :param query: the Query object for this request - :param es_client: a connected elasticsearch client :return: a parsed avro schema """ - resource_mapping = es_client.indices.get_mapping( - index=','.join( - [ - prefix_resource(r) - for r, v in query.resource_ids_and_versions.items() - if v != common.NON_DATASTORE_VERSION - ] - ) + # get the mappings for the resources which would have a mapping (i.e. exclude + # non-datastore resources) + resource_mapping = get_mappings( + [ + resource_id + for resource_id, version in query.resource_ids_and_versions.items() + if version != common.NON_DATASTORE_VERSION + ] ) basic_avro_types = [ diff --git a/tests/unit/lib/downloads/test_downloads_utils.py b/tests/unit/lib/downloads/test_downloads_utils.py index e131076f..b15c34c2 100644 --- a/tests/unit/lib/downloads/test_downloads_utils.py +++ b/tests/unit/lib/downloads/test_downloads_utils.py @@ -37,7 +37,7 @@ def test_get_schema(self): # this is a _very_ stripped down version of the return value from # indices.get_mapping() - indices_mock = MagicMock( + get_mapping_mock = MagicMock( return_value={ index_name: { 'mappings': { @@ -71,10 +71,12 @@ def test_get_schema(self): } } ) - parsed_schemas = utils.get_schemas( - q, MagicMock(**{'indices.get_mapping': indices_mock}) - ) - parsed_schema = parsed_schemas[resource_dict['id']] + with patch( + "ckanext.versioned_datastore.lib.downloads.utils.get_mappings", + get_mapping_mock, + ): + parsed_schemas = utils.get_schemas(q) + parsed_schema = parsed_schemas[resource_dict["id"]] assert isinstance(parsed_schema, dict) assert parsed_schema['type'] == 'record' assert parsed_schema['name'] == 'ResourceRecord' From 3cf7b0717c96a6e9ffb1c229b89fa992412fdbf9 Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Fri, 24 Nov 2023 16:02:33 +0000 Subject: [PATCH 2/2] fix: ensure the workbook is closed if something goes wrong during the write call I'm not 100% sure this catches the problem as I couldn't recreate it locally (it must be an error at a very specific point or something?) but this has a decent chance of fixing the problem of a temporary workbook being left around in /tmp after we've finished/errored the download. Closes: #118 --- .../versioned_datastore/lib/downloads/derivatives/xlsx.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ckanext/versioned_datastore/lib/downloads/derivatives/xlsx.py b/ckanext/versioned_datastore/lib/downloads/derivatives/xlsx.py index d67ae081..84d650df 100644 --- a/ckanext/versioned_datastore/lib/downloads/derivatives/xlsx.py +++ b/ckanext/versioned_datastore/lib/downloads/derivatives/xlsx.py @@ -29,8 +29,12 @@ def setup(self): super(XlsxDerivativeGenerator, self).setup() def finalise(self): - self.workbook.save(self.file_paths['main']) - self.workbook.close() + try: + self.workbook.save(self.file_paths['main']) + finally: + # if something goes wrong when trying to save the workbook, make sure to + # close the workbook before raising the error + self.workbook.close() super(XlsxDerivativeGenerator, self).finalise() def _write(self, record):