941664810387 — Arnaud Campeas[arnaud.campeas@pythonian.fr] 1 year, 1 month ago
api: add inferred_freq flag to .edited point

An inferred frequency is computed from the requested points.
A regular index is built from this frequency and either the series
index (if no bound is provided) or the bounds (if present).
Special care must be taken to keep tz-awareness consistent between
the bounds and the series index.
This regular index will hold NaNs where the series has no
associated value. The edition markers of such "holes" are set
to False.
M test/test_api.py +205 -0
@@ 188,3 188,208 @@ 2020-01-01    False
 2020-01-02    False
 2020-01-03    False
 """, marker)
+
+
+def test_infer_freq(tsx):
+    """Check `.edited(..., inferred_freq=True)` on a naive daily series
+    with one missing day (2024-01-05), with and without value-date
+    bounds."""
+    ts = pd.Series(
+        [1, 2, 3, 4, 6],
+        index=[
+            pd.Timestamp('2024-01-01'),
+            pd.Timestamp('2024-01-02'),
+            pd.Timestamp('2024-01-03'),
+            pd.Timestamp('2024-01-04'),
+            pd.Timestamp('2024-01-06'),
+        ]
+    )
+    tsx.update('series_with_holes', ts, 'test')
+
+    # without the flag, only the five stored points come back
+    ts, markers = tsx.edited('series_with_holes')
+    assert len(ts) == 5
+
+    # with the flag, the missing day shows up as a NaN point
+    ts, markers = tsx.edited('series_with_holes', inferred_freq=True)
+
+    assert_df("""
+2024-01-01    1.0
+2024-01-02    2.0
+2024-01-03    3.0
+2024-01-04    4.0
+2024-01-05    NaN
+2024-01-06    6.0
+""", ts)
+
+    # the marker of the hole is False, like the other ones
+    assert_df("""
+2024-01-01    False
+2024-01-02    False
+2024-01-03    False
+2024-01-04    False
+2024-01-05    False
+2024-01-06    False
+""", markers)
+
+    # upper bound after the last point: NaN points are appended
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        to_value_date=pd.Timestamp('2024-01-08 12:00:00')
+    )
+    assert_df("""
+2024-01-01    1.0
+2024-01-02    2.0
+2024-01-03    3.0
+2024-01-04    4.0
+2024-01-05    NaN
+2024-01-06    6.0
+2024-01-07    NaN
+2024-01-08    NaN
+""", ts)
+    assert len(markers) == 8
+
+    # upper bound inside the series: the output ends on the hole
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        to_value_date=pd.Timestamp('2024-01-05 12:00:00')
+    )
+    assert_df("""
+2024-01-01    1.0
+2024-01-02    2.0
+2024-01-03    3.0
+2024-01-04    4.0
+2024-01-05    NaN
+""", ts)
+    assert len(markers) == 5
+
+    # lower bound before the first point: a NaN point is prepended
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        from_value_date=pd.Timestamp('2023-12-30 18:00:00'),
+    )
+    assert_df("""
+2023-12-31    NaN
+2024-01-01    1.0
+2024-01-02    2.0
+2024-01-03    3.0
+2024-01-04    4.0
+2024-01-05    NaN
+2024-01-06    6.0
+""", ts)
+    assert len(markers) == 7
+
+    # lower bound inside the series: the first point is left out
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        from_value_date=pd.Timestamp('2024-01-01 18:00:00'),
+    )
+    assert_df("""
+2024-01-02    2.0
+2024-01-03    3.0
+2024-01-04    4.0
+2024-01-05    NaN
+2024-01-06    6.0
+""", ts)
+    assert len(markers) == 5
+
+    # both bounds outside the series: NaN points on each side
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        from_value_date=pd.Timestamp('2023-12-30 18:00:00'),
+        to_value_date=pd.Timestamp('2024-01-07 12:00:00')
+    )
+    assert_df("""
+2023-12-31    NaN
+2024-01-01    1.0
+2024-01-02    2.0
+2024-01-03    3.0
+2024-01-04    4.0
+2024-01-05    NaN
+2024-01-06    6.0
+2024-01-07    NaN
+""", ts)
+
+    assert len(markers) == 8
+
+    # only two points in the window: no NaN point is expected for
+    # 2024-01-05 (`extended` leaves series shorter than three points as is)
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        from_value_date=pd.Timestamp('2024-01-02 12:00:00'),
+        to_value_date=pd.Timestamp('2024-01-05 12:00:00')
+    )
+    assert_df("""
+2024-01-03    3.0
+2024-01-04    4.0
+""", ts)
+
+    assert len(markers) == 2
+
+    # a single point in the window: it comes back alone
+    ts, markers = tsx.edited(
+        'series_with_holes',
+        inferred_freq=True,
+        from_value_date=pd.Timestamp('2024-01-04 12:00:00'),
+        to_value_date=pd.Timestamp('2024-01-07 12:00:00')
+    )
+    assert_df("""
+2024-01-06    6.0
+""", ts)
+
+    assert len(markers) == 1
+
+
+def test_infer_freq_tz(tsx):
+    """Since the regular index is built from both the request bounds
+    and the series index, make sure that mixed tz-awareness between
+    them is handled correctly."""
+
+    # same values and dates stored twice: once naive, once UTC-aware
+    ts = pd.Series(
+        [1, 2, 3, 4, 6],
+        index=[
+            pd.Timestamp('2024-01-01'),
+            pd.Timestamp('2024-01-02'),
+            pd.Timestamp('2024-01-03'),
+            pd.Timestamp('2024-01-04'),
+            pd.Timestamp('2024-01-06'),
+        ]
+    )
+    tsx.update('series_with_holes_naive', ts, 'test')
+
+    ts = pd.Series(
+        [1, 2, 3, 4, 6],
+        index=[
+            pd.Timestamp('2024-01-01', tz='UTC'),
+            pd.Timestamp('2024-01-02', tz='UTC'),
+            pd.Timestamp('2024-01-03', tz='UTC'),
+            pd.Timestamp('2024-01-04', tz='UTC'),
+            pd.Timestamp('2024-01-06', tz='UTC'),
+        ]
+    )
+    tsx.update('series_with_holes_tz_aware', ts, 'test')
+
+    from_naive = pd.Timestamp('2024-01-01 12:00:00')
+    to_naive = pd.Timestamp('2024-01-07 12:00:00')
+
+    from_tz_aware = pd.Timestamp('2024-01-01 12:00:00', tz='CET')
+    to_tz_aware = pd.Timestamp('2024-01-07 12:00:00', tz='CET')
+
+    # naive series, tz-aware bounds
+    assert len(
+        tsx.edited(
+            'series_with_holes_naive',
+            from_value_date=from_tz_aware,
+            to_value_date=to_tz_aware,
+            inferred_freq=True,
+        )[0]
+    ) == 6
+
+    # tz-aware (UTC) series, tz-aware (CET) bounds
+    assert len(
+        tsx.edited(
+            'series_with_holes_tz_aware',
+            from_value_date=from_tz_aware,
+            to_value_date=to_tz_aware,
+            inferred_freq=True,
+        )[0]
+    ) == 6
+
+    # tz-aware series, naive bounds
+    assert len(
+        tsx.edited(
+            'series_with_holes_tz_aware',
+            from_value_date=from_naive,
+            to_value_date=to_naive,
+            inferred_freq=True,
+        )[0]
+    ) == 6

          
M tshistory_supervision/api.py +4 -0
@@ 14,6 14,7 @@ def edited(self, name: str,
            revision_date: Optional[pd.Timestamp]=None,
            from_value_date: Optional[pd.Timestamp]=None,
            to_value_date: Optional[pd.Timestamp]=None,
+           inferred_freq: Optional[bool]=False,
            _keep_nans: bool=False) -> Tuple[pd.Series, pd.Series]:
     """
     Returns the base series and a second boolean series whose entries

          
@@ 28,6 29,7 @@ def edited(self, name: str,
                 revision_date=revision_date,
                 from_value_date=from_value_date,
                 to_value_date=to_value_date,
+                inferred_freq=inferred_freq,
                 _keep_nans=_keep_nans
             )
 

          
@@ 46,6 48,7 @@ def edited(self,  # noqa: F811
            revision_date=None,
            from_value_date=None,
            to_value_date=None,
+           inferred_freq=False,
            _keep_nans=False):
 
     source = self._findsourcefor(name)

          
@@ 56,6 59,7 @@ def edited(self,  # noqa: F811
         revision_date,
         from_value_date,
         to_value_date,
+        inferred_freq,
         _keep_nans
     )
 

          
M tshistory_supervision/http.py +8 -0
@@ 54,6 54,10 @@ edited.add_argument(
     help='override from/to_value_date'
 )
 edited.add_argument(
+    # `inputs.boolean` (as for `_keep_nans`) parses "false"/"0" as False;
+    # a plain `bool` type would turn any non-empty string into True
+    'inferred_freq', type=inputs.boolean, default=False,
+    help='re-index series on a inferred frequency'
+)
+edited.add_argument(
     '_keep_nans', type=inputs.boolean, default=False,
     help='keep erasure information'
 )

          
@@ 100,6 104,7 @@ class supervision_httpapi(httpapi):
                     revision_date=args.insertion_date,
                     from_value_date=fvd,
                     to_value_date=tvd,
+                    inferred_freq=args.get('inferred_freq'),
                     _keep_nans=args._keep_nans
                 )
                 metadata = tsa.metadata(args.name, all=True)

          
@@ 146,6 151,7 @@ class supervision_httpclient(httpclient)
                revision_date=None,
                from_value_date=None,
                to_value_date=None,
+               inferred_freq=False,
                _keep_nans=False):
         args = {
             'name': name,

          
@@ 158,6 164,8 @@ class supervision_httpclient(httpclient)
             args['from_value_date'] = strft(from_value_date)
         if to_value_date:
             args['to_value_date'] = strft(to_value_date)
+        if inferred_freq:
+            args['inferred_freq'] = inferred_freq
         res = self.session.get(
             f'{self.uri}/series/supervision', params=args
         )

          
M tshistory_supervision/tsio.py +108 -6
@@ 1,10 1,15 @@ 
 import pandas as pd
 import numpy as np
 
-from tshistory.util import tx, diff
+from tshistory.util import (
+    compatible_date,
+    infer_freq,
+    diff,
+    tx
+)
 from tshistory.tsio import timeseries as basets
 
-from tshistory_supervision import api  # trigger registration  # noqa: F401
+from tshistory_supervision import api  # noqa
 
 
 def join_index(ts1, ts2):

          
@@ 17,6 22,66 @@ def join_index(ts1, ts2):
     return ts1.index.union(ts2.index)
 
 
+def extended(inferred_freq, ts, from_value_date, to_value_date):
+    """Reindex `ts` on a regular grid when `inferred_freq` is set.
+
+    The step is the first item returned by `infer_freq(ts)`. The grid
+    is anchored on the points of the series and reaches out to the
+    value-date bounds when they are given (after passing them through
+    `compatible_date` with the tz of the series index); without
+    bounds it spans from the first to the last point.
+
+    The series is returned untouched when `inferred_freq` is falsy or
+    when it holds fewer than three points.
+    """
+    if not (inferred_freq and len(ts) >= 3):
+        return ts
+
+    head, tail = ts.index[0], ts.index[-1]
+    step = infer_freq(ts)[0]
+    upper = compatible_date(head.tz, to_value_date)
+    lower = compatible_date(head.tz, from_value_date)
+
+    if lower is not None and upper is None:
+        # only a lower bound: walk backward from the last point
+        grid = pd.date_range(
+            start=tail,
+            end=lower,
+            freq=-step
+        ).sort_values()
+        return ts.reindex(grid)
+
+    # walk forward from the first point, up to the upper bound if any
+    grid = pd.date_range(
+        start=head,
+        end=tail if upper is None else upper,
+        freq=step
+    )
+    if lower is not None:
+        # both bounds: add the part going backward from the first point
+        backward = pd.date_range(
+            start=head,
+            end=lower,
+            freq=-step
+        )
+        grid = grid.union(backward).sort_values()
+    return ts.reindex(grid)
+
+
+def fill_markers(markers):
+    """Return `markers` as a strictly boolean series.
+
+    Points created by the inferred-frequency reindexing carry no
+    marker (NaN): they are set to False, since they are not manual
+    editions.
+    """
+    # a reindexed bool series with holes is an object series and
+    # `fillna` alone does not guarantee a bool dtype, hence the cast
+    return markers.fillna(False).astype(bool)
+
+
 class timeseries(basets):
     """This class refines the base `tshistory.timeseries` by adding a
     specific workflow on top of it.

          
@@ 223,6 288,7 @@ class timeseries(basets):
     @tx
     def get_ts_marker(self, cn, name, revision_date=None,
                       from_value_date=None, to_value_date=None,
+                      inferred_freq=False,
                       _keep_nans=False):
         table = self._series_to_tablename(cn, name)
         if table is None:

          
@@ 239,6 305,12 @@ class timeseries(basets):
             # because of a revision_date
             return None, None
 
+        def finish(edited):
+            keep_nans = _keep_nans or inferred_freq
+            if not keep_nans:
+                return edited.dropna()
+            return edited
+
         supervision = self.supervision_status(cn, name)
         if supervision in ('unsupervised', 'handcrafted'):
             flags = pd.Series(

          
@@ 247,7 319,23 @@ class timeseries(basets):
                 dtype=np.dtype('bool')
             )
             flags.name = name
-            return edited.dropna(), flags
+            edited = finish(edited)
+            return (
+                extended(
+                    inferred_freq,
+                    edited,
+                    from_value_date,
+                    to_value_date
+                ),
+                fill_markers(
+                    extended(
+                        inferred_freq,
+                        flags,
+                        from_value_date,
+                        to_value_date
+                    )
+                )
+            )
 
         upstreamtsh = self.upstream
         upstream = upstreamtsh.get(

          
@@ 272,6 360,20 @@ class timeseries(basets):
             mask_manual[manual.index] = True
             mask_manual.name = name
 
-        if not _keep_nans:
-            edited = edited.dropna()
-        return edited, mask_manual
+        edited = finish(edited)
+        return (
+            extended(
+                inferred_freq,
+                edited,
+                from_value_date,
+                to_value_date
+            ),
+            fill_markers(
+                extended(
+                    inferred_freq,
+                    mask_manual,
+                    from_value_date,
+                    to_value_date
+                )
+            )
+        )