Source code for tradingstrategy.utils.gap

"""Detect gaps and bugs in timeseries data, and deal with them."""
from dataclasses import dataclass

import pandas as pd
import numpy as np


[docs]@dataclass(frozen=True, slots=True) class Gap: # Gap size in entries gap_size: int
def detect_frequency(series: pd.Series) -> str:
    """Automatically detect the frequency of a time series.

    The frequency is derived from the median spacing between consecutive
    timestamps of the index.

    Parameters:
    -----------
    series : pandas.Series
        Input time series with DateTimeIndex

    Returns:
    --------
    str
        Detected frequency as a string (e.g., 'D', '2D', '1H', '15T').
        Day, week, month and year frequencies carry a multiplier only when
        the spacing is more than one unit ('D' for daily, '2D' for every
        second day). A spacing that is not a whole multiple of the matching
        unit is reported in whole seconds (e.g., '90S').

    Raises:
    -------
    ValueError
        If the index is not a DatetimeIndex, or has fewer than 2 timestamps.
    """
    if not isinstance(series.index, pd.DatetimeIndex):
        raise ValueError("Series must have a DateTimeIndex")

    if len(series.index) < 2:
        raise ValueError("Series must have at least 2 timestamps to detect frequency")

    def _with_multiple(count: float, unit: str) -> str:
        # A single step keeps the bare unit so callers comparing against
        # 'D', 'W', 'M' or 'Y' see the same value as before; larger steps
        # must carry the multiplier, otherwise e.g. 2-day data is
        # misreported as daily.
        whole = int(count)
        return unit if whole == 1 else f'{whole}{unit}'

    # Time differences between consecutive timestamps
    time_diffs = np.diff(series.index)

    # Use the median difference as the representative spacing
    typical_diff = pd.Timedelta(np.median(time_diffs))

    # Convert timedelta to frequency string
    seconds = typical_diff.total_seconds()

    if seconds < 1:
        return f'{int(seconds * 1000)}L'  # milliseconds
    elif seconds < 60:
        return f'{int(seconds)}S'  # seconds
    elif seconds < 3600:
        minutes = seconds / 60
        if minutes.is_integer():
            return f'{int(minutes)}T'  # minutes
    elif seconds < 86400:
        hours = seconds / 3600
        if hours.is_integer():
            return f'{int(hours)}H'  # hours
    elif seconds < 604800:
        days = seconds / 86400
        if days.is_integer():
            return _with_multiple(days, 'D')  # days
    elif seconds < 2592000:
        weeks = seconds / 604800
        if weeks.is_integer():
            return _with_multiple(weeks, 'W')  # weeks
    elif seconds < 31536000:
        # A month is approximated as 30 days
        months = seconds / 2592000
        if months.is_integer():
            return _with_multiple(months, 'M')  # months
    else:
        # A year is approximated as 365 days
        years = seconds / 31536000
        if years.is_integer():
            return _with_multiple(years, 'Y')  # years

    # If no standard frequency matches, return in seconds
    return f'{int(seconds)}S'
def detect_timestamp_gaps(series, freq=None) -> list[tuple[pd.Timestamp, pd.Timestamp, int]]:
    """
    Detect gaps in a time series.

    The index is compared against a complete, regularly spaced range running
    from its first to its last timestamp; each run of consecutive missing
    timestamps is reported as one gap.

    Parameters:
    -----------
    series : pandas.Series
        Input time series with DateTimeIndex
    freq : str, optional
        Frequency to use for gap detection. If None, will automatically detect frequency.
        Common options: 'D' for daily, 'H' for hourly, 'T' or 'min' for minute, 'S' for second

    Returns:
    --------
    list of tuples
        List of (gap_start, gap_end, gap_size) tuples representing the gaps.
        gap_start and gap_end are the first and last missing timestamps of
        the gap (inclusive) and gap_size is the number of missing entries.
        The list is empty when nothing is missing.

    Raises:
    -------
    ValueError
        If the index is not a DatetimeIndex.
    """
    # Local import: the module's top-level imports do not provide to_offset
    from pandas.tseries.frequencies import to_offset

    # Ensure we have a DateTimeIndex
    if not isinstance(series.index, pd.DatetimeIndex):
        raise ValueError("Series must have a DateTimeIndex")

    # If frequency not provided, detect it
    if freq is None:
        freq = detect_frequency(series)

    # Create a complete date range
    full_index = pd.date_range(
        start=series.index.min(),
        end=series.index.max(),
        freq=freq
    )

    # Find missing dates
    missing_dates = full_index.difference(series.index)

    # If no gaps found, return empty list
    if len(missing_dates) == 0:
        return []

    # pd.Timedelta() rejects unit-only strings such as 'D' or 'min'
    # ("unit abbreviation w/o a number"); an offset object accepts every
    # frequency that pd.date_range() accepts above.
    step = to_offset(freq)

    # Group consecutive missing timestamps into gaps
    gaps = []
    gap_start = prev_date = missing_dates[0]
    gap_size = 1

    for date in missing_dates[1:]:
        if date == prev_date + step:
            # Still inside the same gap
            gap_size += 1
        else:
            # Gap ends, store it and start new gap
            gaps.append((gap_start, prev_date, gap_size))
            gap_start = date
            gap_size = 1
        prev_date = date

    # Add the last gap
    gaps.append((gap_start, prev_date, gap_size))

    return gaps
def fill_missing_ohlcv(df, columns_to_fill=['open', 'high', 'low', 'close', 'volume', 'tvl']):
    """
    Give every pair_id a row for every timestamp present in the frame.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with MultiIndex (pair_id, timestamp)
    columns_to_fill : list, optional
        Currently unused: no column is zero-filled by this function.

    Returns:
    --------
    pandas.DataFrame
        DataFrame reindexed to all (pair_id, timestamp) combinations.
        Rows that did not exist in the input hold missing values (NaN).
    """
    # Every distinct timestamp seen for any pair
    all_timestamps = df.index.get_level_values('timestamp').unique()

    # Every distinct pair
    all_pairs = df.index.get_level_values('pair_id').unique()

    # Cartesian product of pairs and timestamps is the target index
    complete_index = pd.MultiIndex.from_product(
        [all_pairs, all_timestamps],
        names=['pair_id', 'timestamp'],
    )

    # NOTE: rows introduced by the reindex are left as NaN; the zero-fill
    # step is disabled.
    return df.reindex(complete_index)
def equalise_timestamp_index(
    data: pd.Series,
):
    """Make all pair data series equal in length.

    Every pair_id gets an entry for every timestamp present anywhere in
    the series; entries that did not exist are NaN.

    :param data:
        pandas.Series with a (pair_id, timestamp) multiindex

    :return:
        The series reindexed to all (pair_id, timestamp) combinations
    """
    assert isinstance(data.index, pd.MultiIndex)

    level_names = ['pair_id', 'timestamp']

    # Distinct values of each index level, in level order
    per_level_values = [data.index.unique(level=name) for name in level_names]

    # Cartesian product of the level values is the target index
    complete_index = pd.MultiIndex.from_product(per_level_values, names=level_names)

    # Reindexing leaves NaN wherever a combination was absent
    return data.reindex(complete_index)