util/diff and tsio/create: regularize the nan values handling

NaN values were special-cased in `diff` and tsio/create, in wrong ways.

There's nothing special about a nan: it's a floating point value.
The erase semantics do not change anything.

At creation time, nans seem to be useless. But what if we create a
series whose purpose is to patch another one? The formula system
certainly encourages that kind of thing. So we take the nans as they
are and we don't touch them.
4 files changed, 73 insertions(+), 36 deletions(-)

M test/test_tsio.py
M test/test_util.py
M tshistory/tsio.py
M tshistory/util.py
M test/test_tsio.py +66 -24
@@ 482,8 482,12 @@ def test_update_na_vs_hole(engine, tsh):
     ts = ts.drop(hole)  # punch hole
 
     tsh.update(engine, ts, 'na-in-hole', 'Babar')
-
-    ts = pd.Series(
+    assert_df("""
+2024-01-01 00:00:00+00:00    1
+2024-01-01 02:00:00+00:00    3
+""", ts)
+
+    ts2 = pd.Series(
         [1, np.nan, 3],  # hole will be 'erased'
         index=pd.date_range(
             pd.Timestamp('2024-1-1', tz='utc'),

          
@@ 491,12 495,20 @@ def test_update_na_vs_hole(engine, tsh):
             freq='h'
         )
     )
-    diff = tsh.update(engine, ts, 'na-in-hole', 'Babar')
+    assert_df("""
+2024-01-01 00:00:00+00:00    1.0
+2024-01-01 01:00:00+00:00    NaN
+2024-01-01 02:00:00+00:00    3.0
+""", ts2)
+
+    diff = tsh.update(engine, ts2, 'na-in-hole', 'Babar')
+    assert_df("""
+2024-01-01 01:00:00+00:00   NaN
+""", diff)
 
     revs = tsh.insertion_dates(engine, 'na-in-hole')
-    # looks odd
-    assert len(revs) == 1
-    assert len(diff) == 0
+    assert len(revs) == 2
+    assert len(diff) == 1
 
 
 def test_serie_metadata(engine, tsh):

          
@@ 868,7 880,7 @@ def test_point_deletion(engine, tsh):
     tsh.update(engine, ts_begin, 'ts_del', 'test')
 
     _, ts = Postgres(engine, tsh, 'ts_del').find()
-    assert ts.iloc[-2] == 8.0
+    assert ts.iloc[-3] == 8.0
 
     ts_begin.iloc[0] = np.nan
     ts_begin.iloc[3] = np.nan

          
@@ 969,8 981,8 @@ def test_nan_first(engine, tsh):
 
 
 def test_more_point_deletion(engine, tsh):
-    ts_repushed = genserie(datetime(2010, 1, 1), 'D', 11)
-    ts_repushed.iloc[0:3] = np.nan
+    ts_nans = genserie(datetime(2010, 1, 1), 'D', 11)
+    ts_nans.iloc[0:3] = np.nan
 
     assert_df("""
 2010-01-01     NaN

          
@@ 985,31 997,48 @@ 2010-01-09     8.0
 2010-01-10     9.0
 2010-01-11    10.0
 Freq: D
-""", ts_repushed)
-
-    tsh.update(engine, ts_repushed, 'ts_repushed', 'test')
-    dif = tsh.update(engine, ts_repushed, 'ts_repushed', 'test')
+""", ts_nans)
+
+    tsh.update(engine, ts_nans, 'ts_nans', 'test')
+    dif = tsh.update(engine, ts_nans, 'ts_nans', 'test')
     assert len(dif) == 0
 
     # there is no difference
-    assert 0 == len(diff(ts_repushed, ts_repushed))
+    assert 0 == len(diff(ts_nans, ts_nans))
 
     ts_add = genserie(datetime(2010, 1, 1), 'D', 15)
     ts_add.iloc[0] = np.nan
     ts_add.iloc[13:] = np.nan
     ts_add.iloc[8] = np.nan
-    dif = diff(ts_repushed, ts_add)
-
+
+    assert_df("""
+2010-01-01     NaN
+2010-01-02     1.0
+2010-01-03     2.0
+2010-01-04     3.0
+2010-01-05     4.0
+2010-01-06     5.0
+2010-01-07     6.0
+2010-01-08     7.0
+2010-01-09     NaN
+2010-01-10     9.0
+2010-01-11    10.0
+2010-01-12    11.0
+2010-01-13    12.0
+2010-01-14     NaN
+2010-01-15     NaN
+""", ts_add)
+
+    dif = diff(ts_nans, ts_add)
     assert_df("""
 2010-01-02     1.0
 2010-01-03     2.0
 2010-01-09     NaN
 2010-01-12    11.0
-2010-01-13    12.0""", dif)
-    # value on nan => value
-    # nan on value => nan
-    # nan on nan => Nothing
-    # nan on nothing=> Nothing
+2010-01-13    12.0
+2010-01-14     NaN
+2010-01-15     NaN
+""", dif)
 
     # full erasing
     # numeric

          
@@ 1377,8 1406,6 @@ 2016-01-04 00:00:00+00:00  2017-01-01   
 
 
 def test_add_na(engine, tsh):
-    # a serie of NaNs won't be insert in base
-    # in case of first insertion
     ts_nan = genserie(datetime(2010, 1, 1), 'D', 5)
     ts_nan[[True] * len(ts_nan)] = np.nan
 

          
@@ 1394,7 1421,7 @@ def test_add_na(engine, tsh):
     ts_nan = pd.concat([ts_begin, ts_nan])
 
     diff = tsh.update(engine, ts_nan, 'ts_add_na', 'test')
-    assert len(diff) == 0
+    assert len(diff) == 5
 
     result = tsh.get(engine, 'ts_add_na')
     assert len(result) == 5

          
@@ 2443,11 2470,16 @@ 2010-01-17    3.0
 
     result = tsh.get(engine, 'test_nan', _keep_nans=True)
     assert_df("""
+2010-01-10    NaN
+2010-01-11    NaN
+2010-01-12    NaN
 2010-01-13    3.0
 2010-01-14    3.0
 2010-01-15    3.0
 2010-01-16    3.0
 2010-01-17    3.0
+2010-01-18    NaN
+2010-01-19    NaN
 """, result)
 
     ival = tsh.interval(engine, 'test_nan')

          
@@ 2462,11 2494,16 @@ 2010-01-17    3.0
     result = tsh.get(engine, 'test_nan', _keep_nans=True)
     # they don't show up
     assert_df("""
+2010-01-10    NaN
+2010-01-11    NaN
+2010-01-12    NaN
 2010-01-13    4.0
 2010-01-14    4.0
 2010-01-15    4.0
 2010-01-16    4.0
 2010-01-17    4.0
+2010-01-18    NaN
+2010-01-19    NaN
 """, result)
 
     ival = tsh.interval(engine, 'test_nan')

          
@@ 2480,11 2517,16 @@ 2010-01-17    4.0
     tsh.update(engine, ts, 'test_nan', 'test')
     result = tsh.get(engine, 'test_nan', _keep_nans=True)
     assert_df("""
+2010-01-10    NaN
+2010-01-11    NaN
+2010-01-12    NaN
 2010-01-13    NaN
 2010-01-14    5.0
 2010-01-15    5.0
 2010-01-16    5.0
 2010-01-17    NaN
+2010-01-18    NaN
+2010-01-19    NaN
 """, result)
 
     ival = tsh.interval(engine, 'test_nan')

          
M test/test_util.py +6 -6
@@ 277,7 277,7 @@ 2020-01-01 03:00:00    4.0
 """, ds2s1)
 
 
-def test_diff_nans():
+def test_diff_nan_pure():
     s1 = pd.Series(
         [1, np.nan, 3],
         index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3)

          
@@ 302,6 302,8 @@ 2024-01-01 03:00:00    4.0
 2024-01-01 02:00:00   NaN
 """, d)
 
+
+def test_nan_mixed():
     n1 = pd.Series(
         [np.nan],
         index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=1)

          
@@ 311,14 313,11 @@ 2024-01-01 02:00:00   NaN
         index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3)
     )
     d = diff(n1, n2)
-    # oops !
     assert_df("""
-2024-01-01 00:00:00    NaN
 2024-01-01 01:00:00    NaN
 2024-01-01 02:00:00    3.0
 """, d)
 
-
     n1 = pd.Series(
         [np.nan, 2, 3],
         index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3)

          
@@ 329,8 328,9 @@ 2024-01-01 02:00:00    3.0
         index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3)
     )
     d = diff(n1, n2)
-    # oops !
-    assert len(d) == 0
+    assert_df("""
+2024-01-01 01:00:00   NaN
+""", d)
 
 
 def test_diff_duplicated():

          
M tshistory/tsio.py +0 -4
@@ 884,10 884,6 @@ class timeseries:
             assert end is None
             # this is just full of nans
             return None
-        # chop off unwanted nans
-        newts = newts.loc[start:end]
-        if len(newts) == 0:
-            return None
 
         # at creation time we take an exclusive lock to avoid
         # a deadlock on created tables against the changeset-series fk

          
M tshistory/util.py +1 -2
@@ 991,7 991,6 @@ def patchmany(series):
 def diff(base, other, _precision=1e-14):
     if base is None:
         return other
-    base = base.dropna()
     if not len(base):
         return other
 

          
@@ 1025,7 1024,7 @@ def diff(base, other, _precision=1e-14):
     diff_overlap = other_overlap[~mask_equal]
     # series of new elements brought by the `other` side
     diff_new = other[~mask_overlap]
-    diff_new = diff_new.dropna()
+    diff_new = diff_new
 
     return pd.concat([diff_overlap, diff_new]).sort_index()