[tide] documented, corrected detide() and added resampling in tide_analysis() + remove unused function
seareport · Mar 9, 2024 · 524854b · 524854b
1 parent 0f3e54c
commit 524854b
Show file tree

Hide file tree

Showing 24 changed files with 288,685 additions and 6,040 deletions.
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -0,0 +1,60 @@
+name: test
+
+on:
+  push:
+    branches:
+      - "main"
+      - "master"
+      - "dev"
+    paths:
+      - "**.py"
+      - ".github/workflows/*test*.yml"
+      - "pyproject.toml"
+      - "poetry.lock"
+      - "requirements/requirements*.txt"
+  pull_request:
+    paths:
+      - "**.py"
+      - ".github/workflows/*test*.yml"
+      - "pyproject.toml"
+      - "poetry.lock"
+      - "requirements/requirements*.txt"
+
+jobs:
+  test:
+    name: "test Python ${{ matrix.python }} on ${{ matrix.os }}"
+    runs-on: "${{ matrix.os }}"
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-latest"]
+        python: ["3.9", "3.10", "3.11"]
+        include:
+          - os: "macos-latest"
+            python: "3.10"  # must use the `python` axis name that matrix.python reads below
+    defaults:
+      run:
+        shell: "bash -eo pipefail {0}"
+
+    steps:
+      - uses: "actions/checkout@main"
+      - uses: "actions/setup-python@main"
+        with:
+          python-version: "${{ matrix.python }}"
+      - uses: "actions/cache@main"
+        id: "cache"
+        with:
+          path: "${{ env.pythonLocation }}"
+          key: "test-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml', 'requirements/*') }}"
+      - run: "python --version"
+      - run: "python -mpip install -U pip"
+      - run: "python -mpip --version"
+      - name: "Install requirements"
+        run: "python -mpip install -r requirements/requirements.txt"
+      - run: "python -mpip install ./"
+      - run: "python -mpip cache info"
+      - run: "python -mpip freeze"
+      - name: "Run tests"
+        run: "pytest tests/"
+        env:
+          PYTHONPATH: ${{ github.workspace }}/tests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -80,18 +80,6 @@ repos:
             "-o",
             "requirements/requirements.txt",
           ]
-      - id: "poetry-export"
-        name: "poetry export dev"
-        args:
-          [
-            "--with",
-            "dev",
-            "-f",
-            "requirements.txt",
-            "-o",
-            "requirements/requirements-dev.txt",
-          ]
-
   - repo: "local"
     hooks:
       - id: "mypy"

diff --git a/analysea/steps.py b/analysea/steps.py
@@ -8,8 +8,6 @@
 import numpy.typing as npt
 import pandas as pd
 import ruptures as rpt
-from scipy import interpolate
-from skimage.restoration import denoise_tv_chambolle
 
 from analysea.utils import detect_time_step
 
@@ -69,18 +67,3 @@ def remove_steps_simple(df: pd.DataFrame, threshold: float) -> Tuple[pd.DataFram
         step = df.interpolate().iloc[stepx : steps_ix[i + 1]].mean()
         step_function.iloc[stepx : steps_ix[i + 1]] = step
     return step_function, steps_ix
-
-
-def step_function_TV(df: pd.DataFrame, weight: float = 1) -> Tuple[pd.DataFrame, npt.NDArray[Any]]:
-    idx = range(0, len(df), 200)
-    signal = np.array(df.interpolate())[idx]
-    # adjust the parameters
-    signal_denoise = denoise_tv_chambolle(signal, weight=weight)
-    # x_step = -2*np.cumsum(signal_denoise)
-    # step_indicator = x_step == x_step.max()
-    f = interpolate.interp1d(
-        idx, signal_denoise, fill_value="extrapolate"
-    )  # for extrapolation in last scipy version
-    res = pd.Series(data=f(range(len(df))), index=df.index)
-    #
-    return remove_steps_simple(res, res.std())
diff --git a/analysea/tide.py b/analysea/tide.py
@@ -107,6 +107,26 @@ def calc_constituents(
     lat: float = 0.0,
     **kwargs: Dict[str, Any] | None,
 ) -> Any:
+    """
+    Calculate the tide constituents for a time series.
+
+    Parameters
+    ----------
+    ts : pd.Series
+        The time series to analyze.
+    resample_time : int, optional
+        The resample time in minutes, by default 30.
+    lat : float, optional
+        The latitude of the station, by default 0.0.
+    **kwargs : Dict[str, Any], optional
+        Additional keyword arguments to pass to the utide.solve function.
+
+    Returns
+    -------
+    np.ndarray
+        The tide constituents.
+
+    """
     # resample to 30 min for a MUCH faster analysis
     # https://github.com/wesleybowman/UTide/issues/103
     h_rsmp = ts.resample(f"{resample_time}min").mean()
@@ -134,9 +154,10 @@ def detide(
     @param lat: latitude of the station
     @param resample_time: resample time in minutes
     @param split_period: period in days to split the time series into (default 365)
-    @param kwargs: keyword arguments to be passed to utide.reconstruct
+    @param kwargs: keyword arguments to be passed to calc_constituents (`verbose` is also forwarded to utide.reconstruct)
     @return: reconstructed time series
     """
+    verbose = kwargs.get("verbose", False)
     result_series = []
     if constituents is None:
         constituents = calc_constituents(ts=ts, resample_time=resample_time, lat=lat, **kwargs)
@@ -145,36 +166,99 @@ def detide(
     df = h_rsmp.shift(freq=f"{resample_time / 2}min")
 
     for start in range(0, len(df), chunk):
-        end = min(start + chunk, len(ts))
-        ts_chunk = ts.iloc[start:end]
+        end = min(start + chunk, len(df))
+        ts_chunk = df.iloc[start:end]
 
         if not ts_chunk.empty:
-            tidal = utide.reconstruct(ts_chunk.index, nd_format(constituents), verbose=kwargs["verbose"])
+            tidal = utide.reconstruct(ts_chunk.index, nd_format(constituents), verbose=verbose)
             storm_surge = ts_chunk - tidal.h
             result_series.append(storm_surge)
     return pd.concat(result_series)
 
 
 def tide_analysis(
     ts: pd.Series[float],
-    resample_time: int = 10,
+    resample_time: int = 30,
     lat: float = 0.0,
     **kwargs: Dict[str, Any] | None,
-) -> Tuple[pd.DataFrame, pd.DataFrame, npt.NDArray[Any]]:
+) -> Tuple[pd.Series, pd.Series, npt.NDArray[Any]]:
+    """
+    Perform a tide analysis on a time series.
+
+    Parameters
+    ----------
+    ts : pd.Series
+        The time series to analyze.
+    resample_time : int, optional
+        The resample time in minutes, by default 30.
+    lat : float, optional
+        The latitude of the station, by default 0.0.
+    **kwargs : Dict[str, Any], optional
+        Additional keyword arguments to pass to the utide.solve function.
+
+    Returns
+    -------
+    Tuple[pd.Series, pd.Series, npt.NDArray[Any]]
+        A tuple containing the following elements:
+
+        * tide : pd.Series
+            The reconstructed tide, indexed on the resampled (bin-centred) timestamps.
+        * surge : pd.Series
+            The resampled signal minus the reconstructed tide, on the same index.
+        * constituents : npt.NDArray[Any]
+            The constituents used in the analysis.
+
+    """
+    verbose = kwargs.get("verbose", False)
     constituents = calc_constituents(ts=ts, lat=lat, resample_time=resample_time, **kwargs)
-    tidal = utide.reconstruct(ts.index, constituents, **kwargs)
-    tide = pd.Series(data=tidal.h, index=ts.index)
-    surge = pd.Series(data=ts.iloc[0, :].values - tidal.h, index=ts.index)
+
+    h_rsmp = ts.resample(f"{resample_time}min").apply(np.nanmean)
+    df = h_rsmp.shift(freq=f"{resample_time / 2}min")
+
+    tidal = utide.reconstruct(df.index, constituents, verbose=verbose)
+    tide = pd.Series(data=tidal.h, index=df.index)
+    surge = pd.Series(data=df.values - tidal.h, index=df.index)
     return tide, surge, constituents
 
 
 def yearly_tide_analysis(
     h: pd.Series[float],
-    resample_time: int = 10,
+    resample_time: int = 30,
     split_period: int = 365,
     lat: int = 0,
     **kwargs: Dict[str, Any] | None,
 ) -> Tuple[pd.DataFrame, pd.DataFrame, List[npt.NDArray[Any]], List[int]]:
+    """
+    Perform a tide analysis on a time series, split into yearly intervals.
+
+    Parameters
+    ----------
+    h : pd.Series
+        The time series to analyze.
+    resample_time : int, optional
+        The resample time in minutes, by default 30.
+    split_period : int, optional
+        The period in days to split the time series into, by default 365.
+    lat : int, optional
+        The latitude of the station, by default 0.
+    **kwargs : Dict[str, Any], optional
+        Additional keyword arguments to pass to the utide.solve function.
+
+    Returns
+    -------
+    Tuple[pd.DataFrame, pd.DataFrame, List[npt.NDArray[Any]], List[int]]
+        A tuple containing the following elements:
+
+        * tide : pd.DataFrame
+            A dataframe containing the tide values.
+        * surge : pd.DataFrame
+            A dataframe containing the surge values.
+        * coefs : List[npt.NDArray[Any]]
+            The constituents used in the analysis for each year.
+        * years : List[int]
+            The years analyzed.
+
+    """
     log = kwargs.get("verbose", False)
 
     min_time = pd.Timestamp(h.index.min())

diff --git a/analysea/utils.py b/analysea/utils.py
@@ -4,6 +4,7 @@
 from typing import Any
 from typing import cast
 from typing import Dict
+from typing import Iterator
 from typing import Optional
 from typing import Tuple
 from typing import Union
@@ -20,6 +21,41 @@
 # ===================
 # TIME SERIES
 # ===================
+def resample(df: pd.DataFrame, t_rsp: int = 30) -> pd.DataFrame:
+    """
+    Average a dataframe over `t_rsp`-minute bins, labelled at the bin centre.
+
+    @param df (pd.DataFrame): The input dataframe.
+    @param t_rsp (int): optional, The target resample period in minutes,
+    by default 30.
+
+    @returns (pd.DataFrame): The bin means, with the index shifted by t_rsp / 2.
+    """
+    # t_rsp / 2 (not int(t_rsp / 2)) keeps odd periods centred, as tide.py does
+    return df.resample(f"{t_rsp}min").mean().shift(freq=f"{t_rsp / 2}min")
+
+
+def interpolate(df: pd.DataFrame, t_rsp: int = 30) -> pd.DataFrame:
+    """
+    Fill missing values in a pandas dataframe using linear interpolation.
+
+    The index is left unchanged (no resampling is performed). analysea's
+    detect_time_step gives the sampling step, and at most
+    int(t_rsp * 60 / step_in_seconds) consecutive missing values are filled,
+    so that holes are bridged without filling long gaps entirely.
+
+    @param df (pd.DataFrame): The input dataframe.
+    @param t_rsp (int): optional, The period in minutes that bounds how many
+    consecutive missing values are filled, by default 30.
+
+    @returns (pd.DataFrame): The interpolated dataframe.
+    """
+    time_step = detect_time_step(df)
+    n_interp = int(t_rsp * 60 / time_step.total_seconds())
+    ts = df.interpolate(method="linear", limit=n_interp)
+    return ts
+
+
 def detect_splits(sr: pd.Series, max_gap: pd.Timedelta) -> pd.DatetimeIndex:
     split_points = pd.DatetimeIndex([sr.index[0], sr.index[-1]])
     condition = sr.index.to_series().diff() > max_gap
@@ -28,7 +64,15 @@ def detect_splits(sr: pd.Series, max_gap: pd.Timedelta) -> pd.DatetimeIndex:
     return split_points
 
 
-def split_series(sr: pd.Series, max_gap: pd.Timedelta = pd.Timedelta(hours=24)) -> pd.Series:
+def split_series(sr: pd.Series, max_gap: pd.Timedelta = pd.Timedelta(hours=24)) -> Iterator[pd.Series]:
+    """
+    Split a pandas series into segments wherever the index gap exceeds max_gap.
+
+    @param sr (pd.Series): The input series.
+    @param max_gap (pd.Timedelta): The largest index gap allowed inside a segment.
+
+    @returns: Iterator[pd.Series]: An iterator of segments.
+    """
     for start, stop in itertools.pairwise(detect_splits(sr=sr, max_gap=max_gap)):
         segment = sr[start:stop]
         yield segment[:-1]
@@ -69,6 +113,26 @@ def cleanup(
     despike: bool = True,
     demean: bool = True,
 ) -> pd.DataFrame:
+    """
+    This function cleans up a time series by removing outliers,
+    detecting and removing flat areas, and removing steps.
+
+    @param ts (pd.Series): The input time series.
+    @param clip_limits tuple[float, float]: Optional, The lower and upper
+      bounds for outlier detection. If None, outlier detection is not performed.
+    @param kurtosis (float): The threshold for detecting outliers. If the absolute
+      value of the kurtosis of a segment is less than this value, the segment is
+      considered clean.
+    @param remove_flats (bool): Whether to remove flat areas from the time series.
+      If True, flat areas are detected by comparing the difference in consecutive values.
+    @param despike (bool): Whether to remove outliers using the provided clip_limits.
+      If True and clip_limits is not None, outlier detection is performed.
+    @param demean (bool): Whether to demean the time series.
+      If True, the mean of the time series is subtracted from each value.
+
+    @returns (pd.DataFrame): The cleaned up time series.
+
+    """
     # Check if the input is empty
     if ts.empty:
         return pd.DataFrame()