From aea1912244036033434c8b0da9700074f5e2f03a Mon Sep 17 00:00:00 2001 From: ChrisC28 Date: Tue, 29 Oct 2024 17:16:46 +1100 Subject: [PATCH] Basic working example of WOD->CODA parquet --- notebooks/WOD_to_CODA_parquet.ipynb | 1583 +++++++++++++++++++++++++++ 1 file changed, 1583 insertions(+) create mode 100644 notebooks/WOD_to_CODA_parquet.ipynb diff --git a/notebooks/WOD_to_CODA_parquet.ipynb b/notebooks/WOD_to_CODA_parquet.ipynb new file mode 100644 index 0000000..43827ed --- /dev/null +++ b/notebooks/WOD_to_CODA_parquet.ipynb @@ -0,0 +1,1583 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "id": "d4624cff", + "metadata": {}, + "outputs": [], + "source": [ + "#import xarray\n", + "#import fsspec\n", + "import numpy as np\n", + "#import s3fs\n", + "import proplot\n", + "import os\n", + "\n", + "from matplotlib import pyplot as plt\n", + "import pandas\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e6d4e1fc-5d40-41e8-a125-79b5a1a4ae43", + "metadata": {}, + "outputs": [], + "source": [ + "def set_metadata(tbl, col_meta={}, tbl_meta={}):\n", + " \"\"\"Store table- and column-level metadata as json-encoded byte strings.\n", + "\n", + " Table-level metadata is stored in the table's schema.\n", + " Column-level metadata is stored in the table columns' fields.\n", + "\n", + " To update the metadata, first new fields are created for all columns.\n", + " Next a schema is created using the new fields and updated table metadata.\n", + " Finally a new table is created by replacing the old one's schema, but\n", + " without copying any data.\n", + "\n", + " Args:\n", + " tbl (pyarrow.Table): The table to store metadata in\n", + " col_meta: A json-serializable dictionary with column metadata in the form\n", + " {\n", + " 'column_1': {'some': 'data', 'value': 1},\n", + " 'column_2': {'more': 'stuff', 'values': [1,2,3]}\n", + " }\n", + " tbl_meta: A json-serializable dictionary with table-level metadata.\n", + " 
\"\"\"\n", + " # Create updated column fields with new metadata\n", + " if col_meta or tbl_meta:\n", + " fields = []\n", + " for col in tbl.schema.names:\n", + " if col in col_meta:\n", + " # Get updated column metadata\n", + " metadata = tbl.field(col).metadata or {}\n", + " for k, v in col_meta[col].items():\n", + " metadata[k] = json.dumps(v).encode('utf-8')\n", + " # Update field with updated metadata\n", + " fields.append(tbl.field(col).with_metadata(metadata))\n", + " else:\n", + " fields.append(tbl.field(col))\n", + " \n", + " # Get updated table metadata\n", + " tbl_metadata = tbl.schema.metadata or {}\n", + " for k, v in tbl_meta.items():\n", + " if type(v)==bytes:\n", + " tbl_metadata[k] = v\n", + " else:\n", + " tbl_metadata[k] = json.dumps(v).encode('utf-8')\n", + "\n", + " # Create new schema with updated field metadata and updated table metadata\n", + " schema = pa.schema(fields, metadata=tbl_metadata)\n", + "\n", + " # With updated schema build new table (shouldn't copy data)\n", + " # tbl = pa.Table.from_batches(tbl.to_batches(), schema)\n", + " tbl = tbl.cast(schema)\n", + "\n", + " return tbl" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "903fd7a3", + "metadata": {}, + "outputs": [], + "source": [ + "def Get_CODA_id(dates,source_data,obs_platform,year,profile_count):\n", + " \n", + " CODA_id_length = 20\n", + " CODA_ID = np.array(['' for _ in range(dates.size)], dtype='S{}'.format(CODA_id_length))\n", + " \n", + " unique_dates = np.unique(dates)\n", + "\n", + " \n", + " for i_date in unique_dates:\n", + " \n", + " if not pandas.isnull(i_date):\n", + " \n", + " idx_for_date = np.nonzero(dates == i_date)[0]\n", + " count_for_date = idx_for_date.size\n", + " \n", + " for i_index in idx_for_date:\n", + " #print(i_index)\n", + " #CODA_id_current_profiles = source_data + obs_platform + i_date.strftime('%Y%m%d') + f'{i_profile:04}' \n", + " CODA_id_current_profiles = source_data + obs_platform + np.datetime_as_string(i_date, 
unit='D').replace('-','') + f'{profile_count:04}'\n", + " CODA_ID[i_index] = CODA_id_current_profiles\n", + " \n", + " #END for i_index\n", + " #if not isnull\n", + " #END for i_date\n", + "\n", + " bad_dates_idx = np.nonzero(pandas.isnull(dates))[0]\n", + " \n", + " bad_profile_counter = 1\n", + " \n", + " for i_bad_idx in bad_dates_idx:\n", + " CODA_id_current_profiles = source_data + obs_platform + str(year) + 'XXXX' + f'{bad_profile_counter:04}'\n", + " CODA_ID[i_bad_idx] = CODA_id_current_profiles\n", + " bad_profile_counter = bad_profile_counter+1\n", + "\n", + " return CODA_ID" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fa3bab3f", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('/tube1/cha674/CARS_2022/cars-v2/wodpy/wodpy')\n", + "import wodnc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "53988383", + "metadata": {}, + "outputs": [], + "source": [ + "START_YEAR = 2018\n", + "END_YEAR = 2019" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0223dbe4", + "metadata": {}, + "outputs": [], + "source": [ + "obs_type = ['ctd','xbt','pfl','mrb','gld'] #,'drb'] #,'gld']\n", + "#obs_type = ['ctd']\n", + "\n", + "base_s3_url = 's3://noaa-wod-pds'\n", + "\n", + "data_container = {}\n", + "\n", + "output_directory = '/tube1/cha674/CARS_2022_ancillary_data/WOD_2018'\n", + "\n", + "for i_year in range(START_YEAR,END_YEAR+1):\n", + "\n", + " data_container_for_year = {}\n", + " for i_obs_type in obs_type:\n", + " url = base_s3_url + '/' + str(i_year) + '/wod_' + i_obs_type + '_' + str(i_year) + '.nc' #_pfl_2018.nc'\n", + " print(url)\n", + " fs = s3fs.S3FileSystem(anon=True)\n", + " file_obj = fs.open(url)\n", + " ds = xarray.open_dataset(file_obj,engine='h5netcdf')\n", + " data_container_for_year[i_obs_type] = ds\n", + " \n", + " #Write data out\n", + " if not os.path.isdir(os.path.join(output_directory,str(i_year))):\n", + " 
os.mkdir(os.path.join(output_directory,str(i_year)))\n", + " print(os.path.join(output_directory,str(i_year),'wod_' + i_obs_type + '_' + str(i_year) + '.nc'))\n", + " ds.to_netcdf(os.path.join(output_directory,str(i_year),'wod_' + i_obs_type + '_' + str(i_year) + '.nc'))\n", + " \n", + " data_container[i_year] = data_container_for_year" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4c034d54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Working on year: 2018\n", + "Working on platform: ctd\n", + "Profile: 0 of 1000\n", + "Building dataframe\n", + "Writing file: WOD2018_CODA_2018_ctd.parquet\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'sdaa' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 167\u001b[0m\n\u001b[1;32m 164\u001b[0m os\u001b[38;5;241m.\u001b[39mmkdir(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(output_path,\u001b[38;5;28mstr\u001b[39m(i_year)))\n\u001b[1;32m 165\u001b[0m wod_dataframe\u001b[38;5;241m.\u001b[39mto_parquet(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(output_path,\u001b[38;5;28mstr\u001b[39m(i_year),output_file_name),engine\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpyarrow\u001b[39m\u001b[38;5;124m'\u001b[39m,compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msnappy\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 167\u001b[0m \u001b[43msdaa\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'sdaa' is not defined" + ] + } + ], + "source": [ + "\n", + "WOD_directory = '//oa-decadal-climate/work/observations/WOD2018/'\n", + "platforms_to_get = ['ctd','osd','pfl','xbt','mrb','gld']\n", + "WOD_file_name_stem = 'wod_'\n", + "\n", + 
"profile_vars_to_get = ['Temperature','Salinity','Oxygen','Chlorophyll','Silicate','Phosphate','Nitrate','pH','Alkalinity','tCO2','z']\n", + "point_vars_to_get = ['lat','lon','wod_unique_cast']\n", + "ancillary_vars_to_get = ['origflagset','country','dataset','Access_no','Recorder',\n", + " 'dbase_orig','Platform','Project','WOD_cruise_identifier',\n", + " 'Institute','needs_z_fix','Ocean_Vehicle','Temperature_Instrument']\n", + "\n", + "output_path = '/oa-decadal-climate/work/observations/CARSv2_ancillary/CODA/CODAv1/parquet'\n", + "\n", + "FILL_VALUE = -10000000000.0\n", + "\n", + "\n", + "column_names = point_vars_to_get.copy()\n", + "column_names = column_names.append(profile_vars_to_get)\n", + "\n", + "\n", + "\n", + "for i_year in range(START_YEAR,END_YEAR+1):\n", + " print('Working on year: ', i_year)\n", + " for i_platform in platforms_to_get:\n", + " print('Working on platform: ', i_platform)\n", + "\n", + " file_name = WOD_file_name_stem + i_platform + '_' + str(i_year) + '.nc'\n", + " WOD_ragged_object = wodnc.Ragged(os.path.join(WOD_directory,str(i_year),file_name))\n", + " \n", + " vars_in_dataset = list(WOD_ragged_object.variables().keys())\n", + " \n", + " \n", + " n_profiles = WOD_ragged_object.ncasts()\n", + " \n", + " \n", + " profile_data_container = {}\n", + " ancillary_data_container = {}\n", + " attributes_data_container = {}\n", + " \n", + " data_vars_to_get = profile_vars_to_get + point_vars_to_get\n", + " \n", + " #data_vars_to_get.append(point_vars_to_get)\n", + " \n", + " \n", + " for i_var in data_vars_to_get:\n", + " if i_var in vars_in_dataset:\n", + " \n", + " profile_data_container[i_var] = []\n", + " \n", + " if i_var in profile_vars_to_get:\n", + " profile_data_container[i_var + '_WODflag'] = []\n", + " profile_data_container[i_var + '_origflag'] = []\n", + " \n", + " \n", + " \n", + " attributes_data_container[i_var] = {}\n", + " attributes_data_container[i_var + '_WODflag'] = {}\n", + " attributes_data_container[i_var + 
'_origflag'] = {}\n", + " \n", + " for i_ancillary_var in ancillary_vars_to_get:\n", + " if i_ancillary_var in vars_in_dataset:\n", + " \n", + " ancillary_data_container[i_ancillary_var] = []\n", + " \n", + " attributes_data_container[i_ancillary_var] = {}\n", + " \n", + " \n", + " \n", + " vars_to_CODA = list(profile_data_container.keys())\n", + " \n", + " profile_data_container['time'] = []\n", + " profile_data_container['CODA_id'] = []\n", + "\n", + " \n", + " \n", + " # = list(WOD_ragged_object.variables().keys())\n", + "\n", + " number_of_levels_by_profile = []\n", + " \n", + " #DEBUG\n", + " n_profiles = 1000\n", + " for i_profile in range(0,n_profiles):\n", + " \n", + " #= pandas.DataFrame(columns=column_names)\n", + " if (i_profile % 1000) ==0:\n", + " print(\"Profile: \", i_profile, ' of ', n_profiles)\n", + " \n", + " WOD_profile_object = wodnc.ncProfile(WOD_ragged_object,i_profile) \n", + " n_levels = WOD_profile_object.n_levels()\n", + " \n", + " number_of_levels_by_profile.append(n_levels)\n", + " \n", + " for i_var in vars_to_CODA:\n", + " if i_var in vars_in_dataset:\n", + " if WOD_profile_object.is_level_data(i_var):\n", + " \n", + " current_variable = WOD_profile_object.level_unpack(i_var)\n", + " \n", + " #Test for missing data on profile\n", + " if current_variable.size != 0:\n", + " profile_data_container[i_var].append(current_variable)\n", + " else:\n", + " profile_data_container[i_var].append(np.repeat(np.nan,n_levels))\n", + " #END if current_variable.size != 0:\n", + " \n", + " elif WOD_profile_object.is_metadata(i_var):\n", + " \n", + " current_point_data = WOD_profile_object.metadata(i_var)\n", + " \n", + " profile_data_container[i_var].append( np.repeat(current_point_data,n_levels) )\n", + " \n", + " #END for i_var\n", + " \n", + " #Extract datetimes\n", + " current_time = WOD_profile_object.datetime()\n", + " current_datetime = np.datetime64(current_time).astype('datetime64[s]')\n", + " profile_data_container['time'].append( 
np.repeat(current_datetime,n_levels) )\n", + "\n", + " for i_ancillary_var in ancillary_data_container.keys():\n", + " current_ancillary_var = WOD_profile_object.metadata(i_ancillary_var)\n", + " \n", + " if isinstance(current_ancillary_var,str):\n", + " ancillary_data_container[i_ancillary_var].append(f\"{current_ancillary_var:<100}\")\n", + " else:\n", + " ancillary_data_container[i_ancillary_var].append(current_ancillary_var)\n", + " #END if isinstance()\n", + " \n", + " #END for i_ancillary_variable \n", + "\n", + " \n", + " source_data = 'WOD'\n", + "\n", + " \n", + " #Set the CODA Identifier\n", + " CODA_id = Get_CODA_id( profile_data_container['time'][-1][0],source_data,i_platform,i_year,i_profile)\n", + " profile_data_container['CODA_id'].append(np.repeat(CODA_id,n_levels))\n", + " \n", + " \n", + " #END for i_profile\n", + " #/oa-decadal-climate/work/observations/CARSv2_ancillary/CODA/CODAv1/\n", + " #Build Dataframe\n", + " print('Building dataframe')\n", + " wod_dataframe = pandas.DataFrame(columns=profile_data_container.keys())\n", + "\n", + " for i_var in profile_data_container:\n", + " wod_dataframe[i_var] = np.concatenate(profile_data_container[i_var])\n", + " #END for i_var\n", + " \n", + " for i_ancillary_var in ancillary_data_container.keys():\n", + " #print(i_ancillary_var)\n", + " current_ancillary_var = []\n", + " \n", + " for i_profile in range(0,n_profiles):\n", + " current_ancillary_var.append( np.repeat(ancillary_data_container[i_ancillary_var][i_profile], number_of_levels_by_profile[i_profile]) )\n", + " #END for i_profile \n", + " n_columns = len(wod_dataframe.columns)\n", + "\n", + " wod_dataframe.insert(n_columns,i_ancillary_var,np.concatenate(current_ancillary_var) )\n", + " \n", + " #END for i_ancillary\n", + " \n", + " output_file_name = 'WOD2018_CODA_' + str(i_year) + '_' + str(i_platform) + '.parquet'\n", + " print('Writing file:', output_file_name)\n", + " if not os.path.isdir(os.path.join(output_path,str(i_year))):\n", + " 
os.mkdir(os.path.join(output_path,str(i_year)))\n", + " wod_dataframe.to_parquet(os.path.join(output_path,str(i_year),output_file_name),engine='pyarrow',compression='snappy')\n", + "\n", + " sdaa\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "6a94ba74-28b5-41e2-9458-81d16e65b5d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "origflagset\n", + "country\n", + "dataset\n", + "Access_no\n", + "Recorder\n", + "dbase_orig\n", + "Platform\n", + "Project\n", + "WOD_cruise_identifier\n", + "Institute\n", + "needs_z_fix\n", + "Temperature_Instrument\n" + ] + } + ], + "source": [ + "for i_ancillary_var in ancillary_data_container.keys():\n", + " print(i_ancillary_var)\n", + " current_ancillary_var = []\n", + " \n", + " for i_profile in range(0,n_profiles):\n", + " current_ancillary_var.append( np.repeat(ancillary_data_container[i_ancillary_var][i_profile], number_of_levels_by_profile[i_profile]) )\n", + " \n", + " wod_dataframe = wod_dataframe.assign(i_ancillary_var= np.concatenate(current_ancillary_var) )\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "06b53914-86fe-4186-9d3b-d6ff03574ca5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([' ',\n", + " ' ',\n", + " ' ',\n", + " ...,\n", + " ' ',\n", + " ' ',\n", + " ' '],\n", + " dtype=']" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "image/png": { + "height": 400, + "width": 400 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(profile['Temperature'].values,-profile['z'].values)\n", + "plt.plot(profile['Salinity'].values,-profile['z'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "58136614-eeac-4d73-88ca-5c025b0f5e27", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 4.0\n", + "1 26.0\n", + "2 80.0\n", + "3 90.0\n", + "4 113.0\n", + "5 116.0\n", + "6 123.0\n", + "7 135.0\n", + "8 141.0\n", + "9 154.0\n", + "10 164.0\n", + "11 175.0\n", + "12 191.0\n", + "13 219.0\n", + "14 241.0\n", + "15 270.0\n", + "16 294.0\n", + "17 310.0\n", + "18 391.0\n", + "19 441.0\n", + "20 618.0\n", + "21 746.0\n", + "22 834.0\n", + "23 861.0\n", + "24 989.0\n", + "Name: z, dtype: float32" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile['z']" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "457fdbd9", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "image/png": { + "height": 480, + "width": 640 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "CODA_id = wod_dataframe['CODA_id'].unique()\n", + "\n", + "#first_profile = wod_dataframe.loc[wod_dataframe['CODA_id']==CODA_id[0]]\n", + "for i_ancillary_var in ancillary_vars_to_get:\n", + " if i_ancillary_var in vars_in_dataset:\n", + " \n", + " ancillary_data_container[i_ancillary_var] = 0\n", + " attributes_data_container[i_ancillary_var] = {}\n", + "for i_profile in range(0,100):\n", + " #print()\n", + " profile_to_plot = wod_dataframe.loc[wod_dataframe['CODA_id']==CODA_id[i_profile]]\n", + "\n", + " plt.plot(profile_to_plot['Nitrate'],-profile_to_plot['z'])\n", + "#plt.xlim([0,40])" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "47e0a3d1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Oxygen 300\n", + "Oxygen_WODflag 300\n", + "Oxygen_origflag 300\n", + "Temperature 300\n", + "Temperature_WODflag 300\n", + "Temperature_origflag 300\n", + "Salinity 300\n", + "Salinity_WODflag 300\n", + "Salinity_origflag 300\n", + "Chlorophyll 300\n", + "Chlorophyll_WODflag 300\n", + "Chlorophyll_origflag 300\n", + "Nitrate 300\n", + "Nitrate_WODflag 300\n", + "Nitrate_origflag 300\n", + "z 300\n", + "z_WODflag 300\n", + "z_origflag 300\n", + "lat 300\n", + "lon 300\n", + "wod_unique_cast 300\n", + "time 300\n" + ] + } + ], + "source": [ + "for i_var in profile_data_container:\n", + " print(i_var,len(profile_data_container[i_var]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d44de7e", + "metadata": {}, + "outputs": [], + "source": [ + "time_stuff = profile_data_container['time']\n", + "\n", + "for i_profile in range(0,len(time_stuff)):\n", + " print(i_profile)\n", + " print(time_stuff[i_profile])\n", + " print(\"===================\")" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "ee3dd6cb", + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5c4a56e", + "metadata": {}, + "outputs": [], + "source": [ + "time_stuff = profile_data_container['time']\n", + "\n", + "for i_profile in range(0,len(time_stuff)):\n", + " \n", + " print(i_profile)\n", + " print(time_stuff[i_profile])\n", + " print('=========================')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be40c8bf", + "metadata": {}, + "outputs": [], + "source": [ + "first_pass = True\n", + " for i_var in point_vars_to_get:\n", + " \n", + " if i_var in vars_in_dataset:\n", + " \n", + " current_point_var = WOD_profile_object.metadata(i_var)\n", + " \n", + " if first_pass == True:\n", + " \n", + " current_profile_dataframe = pandas.DataFrame({i_var:np.repeat(current_point_var,n_levels)})\n", + " \n", + " first_pass = False\n", + " \n", + " else:\n", + " current_profile_dataframe[i_var] = np.repeat(current_point_var,n_levels)\n", + " #END if first_passtemperature = np.concatenate(profile_data_container['Temperature'])\n", + "\n", + "\n", + " else:\n", + " current_profile_dataframe[i_var] = np.repeat(np.nan,n_levels)\n", + "\n", + " \n", + " #END for i_var in point_vars_to_get\n", + " \n", + " current_profile_dataframe['time'] = np.datetime64(WOD_profile_object.datetime())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0febf1cf", + "metadata": {}, + "outputs": [], + "source": [ + "T = WOD_profile_object.level_unpack('Temperature')\n", + "S = WOD_profile_object.level_unpack('Salinity')\n", + "O2 = WOD_profile_object.level_unpack('Oxygen')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e2757996", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "image/png": { + "height": 480, + "width": 640 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(S,T)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "fdeaf99c", + "metadata": {}, + "outputs": [], + "source": [ + "lat = pandas.unique(wod_dataframe.loc[wod_dataframe['wod_unique_cast']==21349376]['lat'])\n", + "lon = pandas.unique(wod_dataframe.loc[wod_dataframe['wod_unique_cast']==21349376]['lon'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "014ce4fe", + "metadata": {}, + "outputs": [], + "source": [ + "lat = wod_dataframe['lat']\n", + "lon = wod_dataframe['lon']" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "74f03d5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "image/png": { + "height": 480, + "width": 640 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(lon,lat)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98717fdd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Working on year: 2005\n", + "Working on platform: osd\n", + "Profile: 0 of 14845\n", + "Profile: 1000 of 14845\n", + "Profile: 2000 of 14845\n", + "Profile: 3000 of 14845\n", + "Profile: 4000 of 14845\n", + "Profile: 5000 of 14845\n", + "Profile: 6000 of 14845\n", + "Profile: 7000 of 14845\n", + "Profile: 8000 of 14845\n", + "Profile: 9000 of 14845\n", + "Profile: 10000 of 14845\n", + "Profile: 11000 of 14845\n", + "Profile: 12000 of 14845\n", + "Profile: 13000 of 14845\n", + "Profile: 14000 of 14845\n", + "Writing file: WOD2018_CODA_2005_osd.nc\n", + "Working on platform: ctd\n", + "Profile: 0 of 26194\n", + "Profile: 1000 of 26194\n", + "Profile: 2000 of 26194\n", + "Profile: 3000 of 26194\n", + "Profile: 4000 of 26194\n", + "Profile: 5000 of 26194\n", + "Profile: 6000 of 26194\n", + "Profile: 7000 of 26194\n", + "Profile: 8000 of 26194\n", + "Profile: 9000 of 26194\n", + "Profile: 10000 of 26194\n", + "Profile: 11000 of 26194\n", + "Profile: 12000 of 26194\n", + "Profile: 13000 of 26194\n", + "Profile: 14000 of 26194\n", + "Profile: 15000 of 26194\n", + "Profile: 16000 of 26194\n", + "Profile: 17000 of 26194\n", + "Profile: 18000 of 26194\n", + "Profile: 19000 of 26194\n", + "Profile: 20000 of 26194\n", + "Profile: 21000 of 26194\n", + "Profile: 22000 of 26194\n", + "Profile: 23000 of 26194\n", + "Profile: 24000 of 26194\n", + "Profile: 25000 of 26194\n", + "Profile: 26000 of 26194\n", + "Writing file: WOD2018_CODA_2005_ctd.nc\n", + "Working on platform: xbt\n", + "Profile: 0 of 29618\n", + "Profile: 1000 of 29618\n", + "Profile: 2000 of 29618\n", + "Profile: 3000 of 29618\n", + "Profile: 4000 of 29618\n", + "Profile: 
5000 of 29618\n", + "Profile: 6000 of 29618\n", + "Profile: 7000 of 29618\n", + "Profile: 8000 of 29618\n", + "Profile: 9000 of 29618\n", + "Profile: 10000 of 29618\n", + "Profile: 11000 of 29618\n", + "Profile: 12000 of 29618\n", + "Profile: 13000 of 29618\n", + "Profile: 14000 of 29618\n", + "Profile: 15000 of 29618\n", + "Profile: 16000 of 29618\n", + "Profile: 17000 of 29618\n", + "Profile: 18000 of 29618\n", + "Profile: 19000 of 29618\n", + "Profile: 20000 of 29618\n", + "Profile: 21000 of 29618\n", + "Profile: 22000 of 29618\n", + "Profile: 23000 of 29618\n", + "Profile: 24000 of 29618\n", + "Profile: 25000 of 29618\n", + "Profile: 26000 of 29618\n", + "Profile: 27000 of 29618\n", + "Profile: 28000 of 29618\n", + "Profile: 29000 of 29618\n", + "Writing file: WOD2018_CODA_2005_xbt.nc\n", + "Working on platform: pfl\n", + "Profile: 0 of 64100\n", + "Profile: 1000 of 64100\n", + "Profile: 2000 of 64100\n", + "Profile: 3000 of 64100\n", + "Profile: 4000 of 64100\n", + "Profile: 5000 of 64100\n", + "Profile: 6000 of 64100\n", + "Profile: 7000 of 64100\n", + "Profile: 8000 of 64100\n", + "Profile: 9000 of 64100\n", + "Profile: 10000 of 64100\n", + "Profile: 11000 of 64100\n", + "Profile: 12000 of 64100\n", + "Profile: 13000 of 64100\n", + "Profile: 14000 of 64100\n", + "Profile: 15000 of 64100\n", + "Profile: 16000 of 64100\n", + "Profile: 17000 of 64100\n", + "Profile: 18000 of 64100\n", + "Profile: 19000 of 64100\n", + "Profile: 20000 of 64100\n", + "Profile: 21000 of 64100\n", + "Profile: 22000 of 64100\n", + "Profile: 23000 of 64100\n", + "Profile: 24000 of 64100\n", + "Profile: 25000 of 64100\n", + "Profile: 26000 of 64100\n", + "Profile: 27000 of 64100\n", + "Profile: 28000 of 64100\n", + "Profile: 29000 of 64100\n", + "Profile: 30000 of 64100\n", + "Profile: 31000 of 64100\n", + "Profile: 32000 of 64100\n", + "Profile: 33000 of 64100\n", + "Profile: 34000 of 64100\n", + "Profile: 35000 of 64100\n", + "Profile: 36000 of 64100\n", + "Profile: 37000 of 
64100\n", + "Profile: 38000 of 64100\n", + "Profile: 39000 of 64100\n", + "Profile: 40000 of 64100\n", + "Profile: 41000 of 64100\n", + "Profile: 42000 of 64100\n", + "Profile: 43000 of 64100\n", + "Profile: 44000 of 64100\n", + "Profile: 45000 of 64100\n", + "Profile: 46000 of 64100\n", + "Profile: 47000 of 64100\n", + "Profile: 48000 of 64100\n", + "Profile: 49000 of 64100\n", + "Profile: 50000 of 64100\n", + "Profile: 51000 of 64100\n", + "Profile: 52000 of 64100\n", + "Profile: 53000 of 64100\n", + "Profile: 54000 of 64100\n", + "Profile: 55000 of 64100\n", + "Profile: 56000 of 64100\n", + "Profile: 57000 of 64100\n", + "Profile: 58000 of 64100\n", + "Profile: 59000 of 64100\n", + "Profile: 60000 of 64100\n", + "Profile: 61000 of 64100\n", + "Profile: 62000 of 64100\n", + "Profile: 63000 of 64100\n", + "Profile: 64000 of 64100\n", + "Writing file: WOD2018_CODA_2005_pfl.nc\n", + "Working on platform: mrb\n", + "Profile: 0 of 45977\n", + "Profile: 1000 of 45977\n", + "Profile: 2000 of 45977\n", + "Profile: 3000 of 45977\n", + "Profile: 4000 of 45977\n", + "Profile: 5000 of 45977\n", + "Profile: 6000 of 45977\n", + "Profile: 7000 of 45977\n", + "Profile: 8000 of 45977\n", + "Profile: 9000 of 45977\n", + "Profile: 10000 of 45977\n", + "Profile: 11000 of 45977\n", + "Profile: 12000 of 45977\n", + "Profile: 13000 of 45977\n", + "Profile: 14000 of 45977\n", + "Profile: 15000 of 45977\n", + "Profile: 16000 of 45977\n", + "Profile: 17000 of 45977\n", + "Profile: 18000 of 45977\n", + "Profile: 19000 of 45977\n", + "Profile: 20000 of 45977\n", + "Profile: 21000 of 45977\n", + "Profile: 22000 of 45977\n", + "Profile: 23000 of 45977\n", + "Profile: 24000 of 45977\n", + "Profile: 25000 of 45977\n", + "Profile: 26000 of 45977\n", + "Profile: 27000 of 45977\n", + "Profile: 28000 of 45977\n", + "Profile: 29000 of 45977\n", + "Profile: 30000 of 45977\n", + "Profile: 31000 of 45977\n", + "Profile: 32000 of 45977\n", + "Profile: 33000 of 45977\n", + "Profile: 34000 of 45977\n", + 
"Profile: 35000 of 45977\n", + "Profile: 36000 of 45977\n", + "Profile: 37000 of 45977\n", + "Profile: 38000 of 45977\n", + "Profile: 39000 of 45977\n", + "Profile: 40000 of 45977\n", + "Profile: 41000 of 45977\n", + "Profile: 42000 of 45977\n", + "Profile: 43000 of 45977\n", + "Profile: 44000 of 45977\n", + "Profile: 45000 of 45977\n", + "Writing file: WOD2018_CODA_2005_mrb.nc\n", + "Working on platform: gld\n", + "Profile: 0 of 9161\n", + "Profile: 1000 of 9161\n", + "Profile: 2000 of 9161\n", + "Profile: 3000 of 9161\n", + "Profile: 4000 of 9161\n", + "Profile: 5000 of 9161\n", + "Profile: 6000 of 9161\n", + "Profile: 7000 of 9161\n", + "Profile: 8000 of 9161\n", + "Profile: 9000 of 9161\n", + "Writing file: WOD2018_CODA_2005_gld.nc\n", + "Working on year: 2006\n", + "Working on platform: osd\n", + "Profile: 0 of 15066\n", + "Profile: 1000 of 15066\n", + "Profile: 2000 of 15066\n", + "Profile: 3000 of 15066\n", + "Profile: 4000 of 15066\n", + "Profile: 5000 of 15066\n", + "Profile: 6000 of 15066\n", + "Profile: 7000 of 15066\n", + "Profile: 8000 of 15066\n", + "Profile: 9000 of 15066\n", + "Profile: 10000 of 15066\n", + "Profile: 11000 of 15066\n", + "Profile: 12000 of 15066\n", + "Profile: 13000 of 15066\n", + "Profile: 14000 of 15066\n", + "Profile: 15000 of 15066\n", + "Writing file: WOD2018_CODA_2006_osd.nc\n", + "Working on platform: ctd\n", + "Profile: 0 of 25074\n", + "Profile: 1000 of 25074\n", + "Profile: 2000 of 25074\n", + "Profile: 3000 of 25074\n", + "Profile: 4000 of 25074\n", + "Profile: 5000 of 25074\n", + "Profile: 6000 of 25074\n", + "Profile: 7000 of 25074\n", + "Profile: 8000 of 25074\n", + "Profile: 9000 of 25074\n", + "Profile: 10000 of 25074\n", + "Profile: 11000 of 25074\n", + "Profile: 12000 of 25074\n", + "Profile: 13000 of 25074\n", + "Profile: 14000 of 25074\n", + "Profile: 15000 of 25074\n", + "Profile: 16000 of 25074\n", + "Profile: 17000 of 25074\n", + "Profile: 18000 of 25074\n", + "Profile: 19000 of 25074\n", + "Profile: 20000 of 
# Convert the yearly, per-platform WOD2018 ragged-array netCDF files into CODA
# netCDF files in which every profile variable is a padded (cast, z_index) array.
#
# Names this cell expects to already exist in the notebook namespace:
#   START_YEAR, END_YEAR, wodnc, xarray and Get_CODA_id.
#
# Input  : <WOD_directory>/<year>/wod_<platform>_<year>.nc
# Output : <output_path>/<year>/WOD2018_CODA_<year>_<platform>.nc

WOD_directory = '//oa-decadal-climate/work/observations/WOD2018/'
platforms_to_get = ['osd','ctd','xbt','pfl','mrb','gld']
WOD_file_name_stem = 'wod_'

profile_vars_to_get = ['Oxygen','Temperature','Salinity','Chlorophyll','Silicate','Phosphate','Nitrate','pH','Alkalinity','tCO2','z']
point_vars_to_get = ['lat','lon']
ancillary_vars_to_get = ['origflagset','country','dataset','Access_no','Recorder',
                         'dbase_orig','Platform','Project','WOD_cruise_identifier',
                         'Institute','needs_z_fix','Ocean_Vehicle','Temperature_Instrument']

output_path = '/oa-decadal-climate/work/observations/CARSv2_ancillary/CODA/CODAv1/'

# Value written wherever a float variable has no data (NaN padding is replaced by it).
FILL_VALUE = -10000000000.0


def _read_variable_attrs(profile, var_name):
    """Return {attribute name: value} for `var_name` as reported by `profile`."""
    return {attr_name: profile.get_variable_attr(var_name, attr_name)
            for attr_name in profile.show_variable_attr(var_name)}


for i_year in range(START_YEAR,END_YEAR+1):
    print('Working on year: ', i_year)
    for i_platform in platforms_to_get:
        print('Working on platform: ', i_platform)

        file_name = WOD_file_name_stem + i_platform + '_' + str(i_year) + '.nc'
        WOD_ragged_object = wodnc.Ragged(os.path.join(WOD_directory,str(i_year),file_name))
        n_profiles = WOD_ragged_object.ncasts()

        if n_profiles == 0:
            # The crs variable and every attribute dictionary are read from the
            # first profile, so without one there is nothing valid to write.
            print('No profiles in file:', file_name, '- skipping')
            continue

        # First pass: find the deepest profile so the padded arrays can be sized.
        max_depth_levels = 0
        for i_profile in range(0,n_profiles):
            WOD_profile_object = wodnc.ncProfile(WOD_ragged_object,i_profile)
            n_depth_levels = WOD_profile_object.n_levels()
            if n_depth_levels>max_depth_levels:
                max_depth_levels = n_depth_levels
        #END for i_profile

        vars_in_dataset = list(WOD_ragged_object.variables().keys())

        profile_data_container = {}
        ancillary_data_container = {}
        attributes_data_container = {}

        profile_shape = [n_profiles,max_depth_levels]
        for i_var in profile_vars_to_get:
            if i_var in vars_in_dataset:
                profile_data_container[i_var] = np.full(profile_shape,np.nan,dtype='float32')
                # NaN cannot be stored in an integer array, so the WOD flags are
                # held as float64 (which is what nan*int8 evaluated to before).
                profile_data_container[i_var + '_WODflag'] = np.full(profile_shape,np.nan,dtype='float64')
                profile_data_container[i_var + '_origflag'] = np.full(profile_shape,np.nan,dtype='float32')

                attributes_data_container[i_var] = {}
                attributes_data_container[i_var + '_WODflag'] = {}
                attributes_data_container[i_var + '_origflag'] = {}
            #END if i_var
        #END for i_var

        point_data_container = {}
        for i_var in point_vars_to_get:
            point_data_container[i_var] = np.full([n_profiles],np.nan,dtype='float32')
            attributes_data_container[i_var] = {}

        point_data_container['time'] = np.zeros([n_profiles],dtype='datetime64[ns]')
        point_data_container['wod_unique_cast'] = np.zeros([n_profiles],dtype='int32')

        attributes_data_container['wod_unique_cast'] = {}
        attributes_data_container['time'] = {}

        for i_ancillary_var in ancillary_vars_to_get:
            if i_ancillary_var in vars_in_dataset:
                # Placeholder only: the real array is allocated on the first
                # profile, once the Python type of the value is known.
                ancillary_data_container[i_ancillary_var] = 0
                attributes_data_container[i_ancillary_var] = {}

        # Second pass: copy every profile into the padded containers.
        for i_profile in range(0,n_profiles):

            if (i_profile % 1000) ==0:
                print("Profile: ", i_profile, ' of ', n_profiles)

            WOD_profile_object = wodnc.ncProfile(WOD_ragged_object,i_profile)
            # Variable attributes are taken from the first profile only.
            is_first_profile = (i_profile == 0)

            for i_var in profile_data_container:
                current_variable = WOD_profile_object.level_unpack(i_var)

                if current_variable.size != 0:
                    # Profiles with no data for this variable keep their NaN padding.
                    profile_data_container[i_var][i_profile,0:current_variable.size] = current_variable
            #END for i_var

            if is_first_profile:
                for i_var in profile_data_container:
                    attributes_data_container[i_var].update(_read_variable_attrs(WOD_profile_object,i_var))

            point_data_container['wod_unique_cast'][i_profile] = WOD_profile_object.metadata('wod_unique_cast')
            if is_first_profile:
                attributes_data_container['wod_unique_cast'].update(_read_variable_attrs(WOD_profile_object,'wod_unique_cast'))

            point_data_container['time'][i_profile] = np.datetime64(WOD_profile_object.datetime())
            if is_first_profile:
                attributes_data_container['time'].update(_read_variable_attrs(WOD_profile_object,'time'))

            for i_var in point_vars_to_get:
                if i_var in vars_in_dataset:
                    point_data_container[i_var][i_profile] = WOD_profile_object.metadata(i_var)

                    if is_first_profile:
                        attributes_data_container[i_var].update(_read_variable_attrs(WOD_profile_object,i_var))
                #END if i_var
            #END for i_var

            for i_ancillary_var in ancillary_data_container:
                current_ancillary_var = WOD_profile_object.metadata(i_ancillary_var)

                if is_first_profile:
                    var_type = type(current_ancillary_var)
                    if var_type is str:
                        # Strings go into fixed-width 100-byte fields.
                        ancillary_data_container[i_ancillary_var] = np.zeros([n_profiles],dtype='S{}'.format(100))
                    else:
                        ancillary_data_container[i_ancillary_var] = np.zeros([n_profiles],dtype=var_type)

                    attributes_data_container[i_ancillary_var].update(_read_variable_attrs(WOD_profile_object,i_ancillary_var))
                #END if is_first_profile

                ancillary_data_container[i_ancillary_var][i_profile] = current_ancillary_var
            #END for i_ancillary_var

            if is_first_profile:
                crs = WOD_profile_object.r.variables()['crs'][:]
                crs_attribs = _read_variable_attrs(WOD_profile_object,'crs')

        #END for i_profile

        # Casts are numbered from 1 in the output file.
        cast_index = np.arange(1,n_profiles+1)

        CODA_output_dataset = xarray.DataArray(point_data_container['wod_unique_cast'],dims=['cast'],coords={'cast':cast_index},attrs = attributes_data_container['wod_unique_cast'])
        CODA_output_dataset = CODA_output_dataset.to_dataset(name='WOD_id')

        # The collected 'time' attributes are not attached; units and calendar
        # are supplied through the encoding instead.
        CODA_output_dataset['time'] = xarray.DataArray(point_data_container['time'],dims=['cast'],coords={'cast':cast_index})
        CODA_output_dataset['time'].encoding['units'] = 'days since 1770-01-01T00:00:00+00:00'
        CODA_output_dataset['time'].encoding['calendar'] = 'proleptic_gregorian'
        CODA_output_dataset['time'].encoding['_FillValue'] = FILL_VALUE

        #Add the point or cast information (lat and lon)
        for i_var in point_vars_to_get:
            CODA_output_dataset[i_var] = xarray.DataArray(point_data_container[i_var],dims=['cast'],coords={'cast':cast_index},attrs = attributes_data_container[i_var])
            CODA_output_dataset[i_var] = CODA_output_dataset[i_var].fillna(FILL_VALUE)
            CODA_output_dataset[i_var].attrs['_FillValue'] = FILL_VALUE
        #END for i_var

        #Add the ancillary data
        for i_ancillary_var in ancillary_data_container:
            CODA_output_dataset[i_ancillary_var] = xarray.DataArray(ancillary_data_container[i_ancillary_var],dims=['cast'],coords={'cast':cast_index},
                                                                    attrs = attributes_data_container[i_ancillary_var])
            # Only variables that already declare a fill value are re-filled.
            if '_FillValue' in CODA_output_dataset[i_ancillary_var].attrs:
                CODA_output_dataset[i_ancillary_var].attrs['_FillValue'] = FILL_VALUE
                CODA_output_dataset[i_ancillary_var] = CODA_output_dataset[i_ancillary_var].fillna(FILL_VALUE)
            #END if
        #END for i_ancillary_var

        #Add the profile data
        z_index = np.arange(0,max_depth_levels,1)
        for i_var in profile_data_container:
            CODA_output_dataset[i_var] = xarray.DataArray(profile_data_container[i_var],dims=['cast','z_index'],
                                                          coords= {'cast':cast_index,'z_index':z_index}, attrs = attributes_data_container[i_var])

            CODA_output_dataset[i_var] = CODA_output_dataset[i_var].fillna(FILL_VALUE)
            CODA_output_dataset[i_var].attrs['_FillValue'] = FILL_VALUE
        #END for i_var

        CODA_output_dataset['crs'] = xarray.DataArray(crs,dims=[],attrs = crs_attribs)

        source_data = 'WOD'
        CODA_dates = CODA_output_dataset['time'].dt.date.values

        CODA_ID = Get_CODA_id(CODA_dates,source_data,i_platform,i_year)

        CODA_output_dataset['CODA_id'] = xarray.DataArray(CODA_ID,dims=['cast'],coords={'cast':cast_index},
                                                          attrs={'Comment':'Unique CODA identifier with format '})
        CODA_output_dataset.attrs = {'Parent ragged array file':file_name}

        output_file_name = 'WOD2018_CODA_' + str(i_year) + '_' + str(i_platform) + '.nc'
        print('Writing file:', output_file_name)

        # makedirs also creates missing parents and tolerates an existing directory.
        year_output_directory = os.path.join(output_path,str(i_year))
        os.makedirs(year_output_directory,exist_ok=True)

        CODA_output_dataset.to_netcdf(os.path.join(year_output_directory,output_file_name))
    #END for i_platform
#END for i_year
def Get_CODA_id(dates,source_data,obs_platform,year):
    """Build one CODA identifier per profile.

    Profiles with a valid date get
        <source_data><obs_platform><YYYYMMDD><NNNN>
    where NNNN counts, from 0001 and in input order, the profiles that fall on
    that day. Profiles whose date is null (None/NaN/NaT) get
        <source_data><obs_platform><year>XXXX<NNNN>
    where NNNN counts the null-dated profiles, again from 0001.

    Args:
        dates: 1-D array-like of profile dates. Elements may be objects with a
            ``strftime`` method (e.g. ``datetime.date``) or values accepted by
            ``pandas.Timestamp`` (e.g. ``numpy.datetime64``).
        source_data: Prefix naming the source database, e.g. ``'WOD'``.
        obs_platform: Platform code appended to the prefix, e.g. ``'ctd'``.
        year: Year written into the identifier of null-dated profiles.

    Returns:
        numpy.ndarray of fixed-width byte strings (dtype ``S20``) with one
        identifier per element of ``dates``.
    """
    # Identifiers longer than this are silently truncated by NumPy on assignment.
    CODA_id_length = 20

    dates = np.asarray(dates)
    CODA_ID = np.zeros(dates.size, dtype='S{}'.format(CODA_id_length))

    # Flag null dates up front so valid and null values are never compared or sorted together.
    is_bad_date = pandas.isnull(dates)

    id_prefix = source_data + obs_platform
    profiles_seen_on_day = {}
    bad_profile_counter = 0

    for i_index, i_date in enumerate(dates):
        if is_bad_date[i_index]:
            bad_profile_counter += 1
            CODA_ID[i_index] = id_prefix + str(year) + 'XXXX' + f'{bad_profile_counter:04}'
            continue

        if hasattr(i_date, 'strftime'):
            day_string = i_date.strftime('%Y%m%d')
        else:
            day_string = pandas.Timestamp(i_date).strftime('%Y%m%d')

        # Counting per day string keeps identifiers unique within a day.
        i_profile = profiles_seen_on_day.get(day_string, 0) + 1
        profiles_seen_on_day[day_string] = i_profile
        CODA_ID[i_index] = id_prefix + day_string + f'{i_profile:04}'
    #END for i_index

    return CODA_ID