# HG changeset patch # User Aurélien Campéas # Date 1719232964 -7200 # Mon Jun 24 14:42:44 2024 +0200 # Node ID 9941e64021459847631e049acde18468ce828874 # Parent 626a20acb0359ef529712525eeea450c4da89194 util/diff and tsio/create: regularize the nan values handling They were special cased in `diff` and tsio/create, in wrong ways. There's nothing special about a nan, it's a floating point value. The erase semantics does not change anything. At creation time, they seem to be useless. But what if we create a series whose purpose is to patch another ? The formula system certainly encourages that kind of thing. So we take the nans and we don't touch them. diff --git a/test/test_tsio.py b/test/test_tsio.py --- a/test/test_tsio.py +++ b/test/test_tsio.py @@ -482,8 +482,12 @@ ts = ts.drop(hole) # punch hole tsh.update(engine, ts, 'na-in-hole', 'Babar') - - ts = pd.Series( + assert_df(""" +2024-01-01 00:00:00+00:00 1 +2024-01-01 02:00:00+00:00 3 +""", ts) + + ts2 = pd.Series( [1, np.nan, 3], # hole will be 'erased' index=pd.date_range( pd.Timestamp('2024-1-1', tz='utc'), @@ -491,12 +495,20 @@ freq='h' ) ) - diff = tsh.update(engine, ts, 'na-in-hole', 'Babar') + assert_df(""" +2024-01-01 00:00:00+00:00 1.0 +2024-01-01 01:00:00+00:00 NaN +2024-01-01 02:00:00+00:00 3.0 +""", ts2) + + diff = tsh.update(engine, ts2, 'na-in-hole', 'Babar') + assert_df(""" +2024-01-01 01:00:00+00:00 NaN +""", diff) revs = tsh.insertion_dates(engine, 'na-in-hole') - # looks odd - assert len(revs) == 1 - assert len(diff) == 0 + assert len(revs) == 2 + assert len(diff) == 1 def test_serie_metadata(engine, tsh): @@ -868,7 +880,7 @@ tsh.update(engine, ts_begin, 'ts_del', 'test') _, ts = Postgres(engine, tsh, 'ts_del').find() - assert ts.iloc[-2] == 8.0 + assert ts.iloc[-3] == 8.0 ts_begin.iloc[0] = np.nan ts_begin.iloc[3] = np.nan @@ -969,8 +981,8 @@ def test_more_point_deletion(engine, tsh): - ts_repushed = genserie(datetime(2010, 1, 1), 'D', 11) - ts_repushed.iloc[0:3] = np.nan + ts_nans = genserie(datetime(2010, 1, 1), 'D', 11) + ts_nans.iloc[0:3] = np.nan assert_df(""" 2010-01-01 NaN @@ -985,31 +997,48 @@ 2010-01-10 9.0 2010-01-11 10.0 Freq: D -""", ts_repushed) - - tsh.update(engine, ts_repushed, 'ts_repushed', 'test') - dif = tsh.update(engine, ts_repushed, 'ts_repushed', 'test') +""", ts_nans) + + tsh.update(engine, ts_nans, 'ts_nans', 'test') + dif = tsh.update(engine, ts_nans, 'ts_nans', 'test') assert len(dif) == 0 # there is no difference - assert 0 == len(diff(ts_repushed, ts_repushed)) + assert 0 == len(diff(ts_nans, ts_nans)) ts_add = genserie(datetime(2010, 1, 1), 'D', 15) ts_add.iloc[0] = np.nan ts_add.iloc[13:] = np.nan ts_add.iloc[8] = np.nan - dif = diff(ts_repushed, ts_add) - + + assert_df(""" +2010-01-01 NaN +2010-01-02 1.0 +2010-01-03 2.0 +2010-01-04 3.0 +2010-01-05 4.0 +2010-01-06 5.0 +2010-01-07 6.0 +2010-01-08 7.0 +2010-01-09 NaN +2010-01-10 9.0 +2010-01-11 10.0 +2010-01-12 11.0 +2010-01-13 12.0 +2010-01-14 NaN +2010-01-15 NaN +""", ts_add) + + dif = diff(ts_nans, ts_add) assert_df(""" 2010-01-02 1.0 2010-01-03 2.0 2010-01-09 NaN 2010-01-12 11.0 -2010-01-13 12.0""", dif) - # value on nan => value - # nan on value => nan - # nan on nan => Nothing - # nan on nothing=> Nothing +2010-01-13 12.0 +2010-01-14 NaN +2010-01-15 NaN +""", dif) # full erasing # numeric @@ -1377,8 +1406,6 @@ def test_add_na(engine, tsh): - # a serie of NaNs won't be insert in base - # in case of first insertion ts_nan = genserie(datetime(2010, 1, 1), 'D', 5) ts_nan[[True] * len(ts_nan)] = np.nan @@ -1394,7 +1421,7 @@ ts_nan = pd.concat([ts_begin, ts_nan]) diff = tsh.update(engine, ts_nan, 'ts_add_na', 'test') - assert len(diff) == 0 + assert len(diff) == 5 result = tsh.get(engine, 'ts_add_na') assert len(result) == 5 @@ -2443,11 +2470,16 @@ result = tsh.get(engine, 'test_nan', _keep_nans=True) assert_df(""" +2010-01-10 NaN +2010-01-11 NaN +2010-01-12 NaN 2010-01-13 3.0 2010-01-14 3.0 2010-01-15 3.0 2010-01-16 3.0 2010-01-17 3.0 +2010-01-18 NaN +2010-01-19 NaN """, result) ival = tsh.interval(engine, 'test_nan') @@ -2462,11 +2494,16 @@ result = tsh.get(engine, 'test_nan', _keep_nans=True) # they don't show up assert_df(""" +2010-01-10 NaN +2010-01-11 NaN +2010-01-12 NaN 2010-01-13 4.0 2010-01-14 4.0 2010-01-15 4.0 2010-01-16 4.0 2010-01-17 4.0 +2010-01-18 NaN +2010-01-19 NaN """, result) ival = tsh.interval(engine, 'test_nan') @@ -2480,11 +2517,16 @@ tsh.update(engine, ts, 'test_nan', 'test') result = tsh.get(engine, 'test_nan', _keep_nans=True) assert_df(""" +2010-01-10 NaN +2010-01-11 NaN +2010-01-12 NaN 2010-01-13 NaN 2010-01-14 5.0 2010-01-15 5.0 2010-01-16 5.0 2010-01-17 NaN +2010-01-18 NaN +2010-01-19 NaN """, result) ival = tsh.interval(engine, 'test_nan') diff --git a/test/test_util.py b/test/test_util.py --- a/test/test_util.py +++ b/test/test_util.py @@ -277,7 +277,7 @@ """, ds2s1) -def test_diff_nans(): +def test_diff_nan_pure(): s1 = pd.Series( [1, np.nan, 3], index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3) @@ -302,6 +302,8 @@ 2024-01-01 02:00:00 NaN """, d) + +def test_nan_mixed(): n1 = pd.Series( [np.nan], index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=1) @@ -311,14 +313,11 @@ index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3) ) d = diff(n1, n2) - # oops ! assert_df(""" -2024-01-01 00:00:00 NaN 2024-01-01 01:00:00 NaN 2024-01-01 02:00:00 3.0 """, d) - n1 = pd.Series( [np.nan, 2, 3], index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3) @@ -329,8 +328,9 @@ index=pd.date_range(datetime(2024, 1, 1), freq='h', periods=3) ) d = diff(n1, n2) - # oops ! - assert len(d) == 0 + assert_df(""" +2024-01-01 01:00:00 NaN +""", d) def test_diff_duplicated(): diff --git a/tshistory/tsio.py b/tshistory/tsio.py --- a/tshistory/tsio.py +++ b/tshistory/tsio.py @@ -884,10 +884,6 @@ assert end is None # this is just full of nans return None - # chop off unwanted nans - newts = newts.loc[start:end] - if len(newts) == 0: - return None # at creation time we take an exclusive lock to avoid # a deadlock on created tables against the changeset-series fk diff --git a/tshistory/util.py b/tshistory/util.py --- a/tshistory/util.py +++ b/tshistory/util.py @@ -991,7 +991,6 @@ def diff(base, other, _precision=1e-14): if base is None: return other - base = base.dropna() if not len(base): return other @@ -1025,7 +1024,7 @@ diff_overlap = other_overlap[~mask_equal] # series of new elements brought by the `other` side diff_new = other[~mask_overlap] - diff_new = diff_new.dropna() + diff_new = diff_new return pd.concat([diff_overlap, diff_new]).sort_index()