diff --git a/notebooks/CDAWeb_masters.ipynb b/notebooks/CDAWeb_masters.ipynb new file mode 100644 index 0000000..6706cb6 --- /dev/null +++ b/notebooks/CDAWeb_masters.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"PYISTP_CDFLIB\"] = 'pycdfpp'\n", + "#os.environ[\"PYISTP_CDFLIB\"]='spacepy'\n", + "import pyistp\n", + "from pyistp import _impl as pyistp_impl\n", + "import requests\n", + "from glob import glob\n", + "import zstd\n", + "import tempfile\n", + "import tarfile\n", + "import io\n", + "import logging\n", + "from tqdm import tqdm\n", + "\n", + "def clean_logs():\n", + " lines=open('output/result.log').readlines()\n", + " index = 1\n", + " while index < len(lines):\n", + " if lines[index].startswith('Loading') and lines[index-1].startswith('Loading'):\n", + " lines.pop(index-1)\n", + " else:\n", + " index+=1\n", + " open('output/result_clean.log','w').writelines(lines)\n", + "\n", + "def compare_master_and_sample(master, sample_url_part, variable_name):\n", + " pyistp_impl.log.handlers.clear()\n", + " pyistp_impl.log.addHandler(logging.StreamHandler())\n", + " master = pyistp.load(buffer=requests.get(f\"https://cdaweb.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/{master}\").content)\n", + " sample_file = pyistp.load(buffer=requests.get(f\"https://cdaweb.gsfc.nasa.gov/pub/data/{sample_url_part}\").content)\n", + " print(\"master:\", flush=True)\n", + " v=master.data_variable(variable_name)\n", + " print(\"sample_file:\", flush=True)\n", + " v=sample_file.data_variable(variable_name)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Prepare output folder" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "if not os.path.exists('output'):\n", + " os.mkdir('output')\n", + "\n", + "if os.path.exists('output/result.log'):\n", + " os.remove('output/result.log')\n", + "\n", + "pyistp_impl.log.handlers.clear()\n", + "pyistp_impl.log.addHandler(logging.FileHandler(\"output/result.log\"))\n", + "pyistp_impl.log.setLevel(logging.DEBUG)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# Download and a load all masters as ISTP files" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3324/3324 [00:21<00:00, 152.98it/s]\n" + ] + } + ], + "source": [ + "\n", + "with tempfile.TemporaryDirectory() as tmp_dir:\n", + " tar = tarfile.open(fileobj=io.BytesIO(\n", + " zstd.decompress(requests.get('https://hephaistos.lpp.polytechnique.fr/data/jeandet/master.tar.zst').content)))\n", + " tar.extractall(tmp_dir)\n", + " del tar\n", + " cdf_masters = glob(f\"{tmp_dir}/*.cdf\")\n", + " for master in tqdm(cdf_masters):\n", + " istp_file = pyistp.load(master)\n", + " for vname in istp_file.data_variables():\n", + " v = istp_file.data_variable(vname)\n", + "\n", + "clean_logs()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# let's manually check few files" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## c3_cp_efw_l3_e3d_inert\n", + "Both master and sample file have the same issue" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "master:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Non compliant ISTP file: variable delta_Ez_ISR2__C3_CP_EFW_L3_E3D_INERT has LABL_PTR_1 attribute which points to variable dEz which does not exist\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample_file:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Non compliant ISTP file: variable delta_Ez_ISR2__C3_CP_EFW_L3_E3D_INERT has LABL_PTR_1 attribute which points to variable dEz which does not exist\n" + ] + } + ], + "source": [ + "compare_master_and_sample(master=\"c3_cp_efw_l3_e3d_inert_00000000_v01.cdf\",\n", + " sample_url_part=\"cluster/c3/efw/efield_3dvect_spinreso_l3_inertialframe/2003/c3_cp_efw_l3_e3d_inert_20030120_v20130803.cdf\",\n", + " variable_name=\"delta_Ez_ISR2__C3_CP_EFW_L3_E3D_INERT\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## mms4_fpi_fast_l2_dis-momsaux\n", + "Both master and sample file have the same issue" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "master:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Non compliant ISTP file: mms4_dis_compressionloss_fast was marked as data variable but it has 0 support variable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample_file:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Non compliant ISTP file: mms4_dis_compressionloss_fast was marked as data variable but it has 0 support variable\n" + ] + } + ], + "source": [ + "compare_master_and_sample(master=\"mms4_fpi_fast_l2_dis-momsaux_00000000_v01.cdf\",\n", + " sample_url_part=\"mms/mms4/fpi/fast/l2/dis-momsaux/2021/10/mms4_fpi_fast_l2_dis-momsaux_20211003140000_v3.4.0.cdf\",\n", + " variable_name=\"mms4_dis_compressionloss_fast\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## mvn_swi_l2_finearc3d\n", + "Both master and sample file have the same issue" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "master:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Non compliant ISTP file: eindex was marked as data variable but it has 0 support variable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample_file:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Non compliant ISTP file: eindex was marked as data variable but it has 0 support variable\n" + ] + } + ], + "source": [ + "compare_master_and_sample(master=\"mvn_swi_l2_finearc3d_00000000_v01.cdf\",\n", + " sample_url_part=\"maven/swia/l2/finearc3d/2020/11/mvn_swi_l2_finearc3d_20201125_v02_r00.cdf\",\n", + " variable_name=\"eindex\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/pyistp/_impl.py b/pyistp/_impl.py index 0564b6c..8c47871 100644 --- a/pyistp/_impl.py +++ b/pyistp/_impl.py @@ -26,35 +26,38 @@ def _get_attributes(cdf: object, var: str): attrs[attr] = value else: log.warning( - f"{ISTP_NOT_COMPLIANT_W}: variable {var} has {attr} attribute which points to a variable which does not exist") + f"{ISTP_NOT_COMPLIANT_W}: variable {var} has {attr} attribute which points to variable {value} which does not exist") attrs[attr] = value else: attrs[attr] = value return attrs -def _get_axis(cdf: object, var: str): - if cdf.has_variable(var): - if cdf.is_char(var): - if 'sig_digits' in cdf.variable_attributes(var): # cluster CSA trick :/ - return SupportDataVariable(name=var, values=np.asarray(cdf.values(var), dtype=float), - attributes=_get_attributes(cdf, var)) - return SupportDataVariable(name=var, values=cdf.values(var), attributes=_get_attributes(cdf, var)) - log.warning(f"{ISTP_NOT_COMPLIANT_W}: trying to load {var} as support data but it is absent from the file") +def _get_axis(cdf: object, axis_var: str, data_var: str): + if cdf.has_variable(axis_var): + if cdf.is_char(axis_var): + if 'sig_digits' in cdf.variable_attributes(axis_var): # cluster CSA trick :/ + return SupportDataVariable(name=axis_var, values=np.asarray(cdf.values(axis_var), dtype=float), + attributes=_get_attributes(cdf, axis_var)) + return SupportDataVariable(name=axis_var, values=cdf.values(axis_var), + attributes=_get_attributes(cdf, axis_var)) + else: + log.warning( + f"{ISTP_NOT_COMPLIANT_W}: trying to load {axis_var} as support data for {data_var} but it is absent from the file") return None def _get_axes(cdf: object, var: str, data_shape): attrs = sorted(filter(lambda attr: DEPEND_REGEX.match(attr), cdf.variable_attributes(var))) unix_time_name = cdf.variable_attribute_value(var, "DEPEND_TIME") - axes = list(map(lambda attr: _get_axis(cdf, cdf.variable_attribute_value(var, attr)), attrs)) + axes = list(map(lambda attr: _get_axis(cdf, cdf.variable_attribute_value(var, attr), var), attrs)) if unix_time_name is not None and unix_time_name in cdf.variables(): - unix_time = _get_axis(cdf, unix_time_name) - if len(unix_time) == data_shape[0]: + unix_time = _get_axis(cdf, unix_time_name, var) + if len(unix_time) == data_shape[0] and len(axes[0].values) != data_shape[0]: unix_time.values = (unix_time.values * 1e9).astype('