Updated main to allow significantly more efficient downloading

2025-10-05 17:53:41 +01:00
parent 32ef7401e3
commit 4d0460944a

main.py (458 lines changed)

@@ -1,8 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
main.py - Complete Binance Trading Data Collection System main.py - Complete Binance Trading Data Collection System
Main application entry point with async data collection, websocket handling, and task management Main application entry point with async data collection, websocket handling, bulk
backfill orchestration, periodic gap scans, and task management.
""" """
import asyncio import asyncio
@@ -13,11 +16,11 @@ import json
import subprocess import subprocess
import os import os
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Any from typing import Dict, List, Optional, Any, Tuple
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
import websockets import websockets
import aiohttp import aiohttp # kept for future-proofing network ops
from binance.client import Client from binance.client import Client
from binance.exceptions import BinanceAPIException from binance.exceptions import BinanceAPIException
import pandas as pd import pandas as pd
@@ -52,11 +55,15 @@ class BinanceDataCollector:
self.websocket_collection_running = False self.websocket_collection_running = False
self.download_progress: Dict[str, Any] = {} self.download_progress: Dict[str, Any] = {}
# Concurrency controls from env with sensible defaults
max_downloads = int(os.getenv('MAX_CONCURRENT_DOWNLOADS', '3')) max_downloads = int(os.getenv('MAX_CONCURRENT_DOWNLOADS', '3'))
max_gap_fills = int(os.getenv('MAX_CONCURRENT_GAP_FILLS', '2')) max_gap_fills = int(os.getenv('MAX_CONCURRENT_GAP_FILLS', '2'))
self._download_semaphore = asyncio.Semaphore(max_downloads) self._download_semaphore = asyncio.Semaphore(max_downloads)
self._gap_fill_semaphore = asyncio.Semaphore(max_gap_fills) self._gap_fill_semaphore = asyncio.Semaphore(max_gap_fills)
self.logger.info(f"Initialized with max {max_downloads} concurrent downloads, {max_gap_fills} gap fills")
self.logger.info(
f"Initialized with max {max_downloads} concurrent downloads, {max_gap_fills} gap fills"
)
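For reference, a minimal standalone sketch of the pattern these semaphores enable: at most N downloads run at once and the rest wait for a slot. The symbol names and the sleep are placeholders, not from this codebase.

import asyncio
import os

async def bounded(sem: asyncio.Semaphore, coro):
    # Wait for a free slot before running the download
    async with sem:
        return await coro

async def demo():
    sem = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT_DOWNLOADS", "3")))

    async def fake_download(symbol: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for a real download
        return symbol

    # Only three of these run concurrently; the fourth waits for a slot
    return await asyncio.gather(*(bounded(sem, fake_download(s))
                                  for s in ["AAAUSDT", "BBBUSDT", "CCCUSDT", "DDDUSDT"]))

asyncio.run(demo())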
async def initialize(self): async def initialize(self):
"""Initialize the data collector""" """Initialize the data collector"""
@@ -64,10 +71,32 @@ class BinanceDataCollector:
# Setup logging # Setup logging
setup_logging() setup_logging()
self.logger.info("Initializing Binance Data Collector") self.logger.info("Initializing Binance Data Collector")
# Load configuration # Load configuration
config = load_config() config = load_config()
# Optional: add defaults for new flags
coll = config.setdefault("collection", {})
coll.setdefault("default_record_from_date", "2020-01-01T00:00:00Z")
coll.setdefault("initial_full_backfill", True)
coll.setdefault("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"])
coll.setdefault("bulk_chunk_size", int(os.getenv("BULK_DOWNLOAD_BATCH_SIZE", "1000")))
coll.setdefault("tick_batch_size", int(os.getenv("TICK_BATCH_SIZE", "100")))
coll.setdefault("max_retries", int(os.getenv("MAX_RETRIES", "3")))
coll.setdefault("retry_delay", 1)
gap = config.setdefault("gap_filling", {})
gap.setdefault("enable_auto_gap_filling", True)
gap.setdefault("auto_fill_schedule_hours", 24)
gap.setdefault("enable_intelligent_averaging", True)
gap.setdefault("max_fill_attempts", int(os.getenv("MAX_RETRIES", "3")))
gap.setdefault("intervals_to_monitor", coll.get("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"]))
gap.setdefault("max_gap_size_candles", 1000)
gap.setdefault("averaging_lookback_candles", 10)
gap.setdefault("max_consecutive_empty_candles", 5)
self.logger.info(f"Loaded configuration for {len(config['trading_pairs'])} trading pairs") self.logger.info(f"Loaded configuration for {len(config['trading_pairs'])} trading pairs")
# Initialize database # Initialize database
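For orientation, a plausible shape of config after the setdefault calls above run against a minimal config file. The trading_pairs entry is a made-up example; the remaining keys mirror the defaults shown in the diff.

config = {
    "trading_pairs": [
        # hypothetical entry; real files define their own pairs
        {"symbol": "BTCUSDT", "enabled": True, "priority": 1,
         "record_from_date": "2021-01-01T00:00:00Z"},
    ],
    "collection": {
        "default_record_from_date": "2020-01-01T00:00:00Z",
        "initial_full_backfill": True,
        "candle_intervals": ["1m", "5m", "15m", "1h", "4h", "1d"],
        "bulk_chunk_size": 1000,
        "tick_batch_size": 100,
        "max_retries": 3,
        "retry_delay": 1,
    },
    "gap_filling": {
        "enable_auto_gap_filling": True,
        "auto_fill_schedule_hours": 24,
        "enable_intelligent_averaging": True,
        "max_fill_attempts": 3,
        "intervals_to_monitor": ["1m", "5m", "15m", "1h", "4h", "1d"],
        "max_gap_size_candles": 1000,
        "averaging_lookback_candles": 10,
        "max_consecutive_empty_candles": 5,
    },
}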
@@ -85,6 +114,124 @@ class BinanceDataCollector:
self.client = Client() self.client = Client()
self.logger.info("Binance client initialized without API credentials (public data only)") self.logger.info("Binance client initialized without API credentials (public data only)")
# ---------------------------
# Bulk backfill orchestration
# ---------------------------
async def start_bulk_download_for_all_pairs(self):
"""
Automatically launch full-history downloads for all enabled pairs,
starting from record_from_date for each pair, across all configured intervals.
"""
global config
enabled_pairs = [p for p in config['trading_pairs'] if p.get('enabled', True)]
if not enabled_pairs:
self.logger.warning("No enabled trading pairs found for bulk backfill")
return
# Sort by priority ascending (1 = highest priority), then by symbol for a stable order
enabled_pairs.sort(key=lambda p: (int(p.get('priority', 1)), p.get('symbol', '')))
# Build tasks per symbol to respect MAX_CONCURRENT_DOWNLOADS at the symbol level
tasks: List[asyncio.Task] = []
now_utc = datetime.now(timezone.utc)
for pair in enabled_pairs:
symbol = pair['symbol'].upper()
start_iso = pair.get('record_from_date') or config["collection"]["default_record_from_date"]
try:
start_dt = datetime.fromisoformat(start_iso.replace("Z", "+00:00"))
except Exception:
self.logger.warning(f"Invalid record_from_date for {symbol}: {start_iso}, falling back to default")
start_dt = datetime.fromisoformat(config["collection"]["default_record_from_date"].replace("Z", "+00:00"))
# One task per symbol to execute all intervals concurrently for that symbol
tasks.append(asyncio.create_task(
self._bulk_download_symbol_all_intervals(symbol, start_dt, now_utc),
name=f"bulk_{symbol}"
))
# Execute with graceful progress logging
self.logger.info(f"Launching bulk backfill for {len(tasks)} symbols...")
results = await asyncio.gather(*tasks, return_exceptions=True)
errors = [r for r in results if isinstance(r, Exception)]
if errors:
self.logger.error(f"Bulk backfill completed with {len(errors)} errors; see logs for details")
else:
self.logger.info("Bulk backfill completed successfully for all symbols")
async def _bulk_download_symbol_all_intervals(
self,
symbol: str,
start_date: datetime,
end_date: Optional[datetime] = None
):
"""
Launch concurrent downloads of all configured intervals for one symbol,
bounded by the download semaphore to control exchange load.
"""
global config
async with self._download_semaphore:
end_date = end_date or datetime.now(timezone.utc)
# Ensure progress structure
intervals = config.get("collection", {}).get("candle_intervals",
["1m", "5m", "15m", "1h", "4h", "1d"])
self.download_progress[symbol] = {
"status": "running",
"intervals": {iv: {"status": "pending", "records": 0} for iv in intervals},
"start_time": datetime.now(timezone.utc).isoformat()
}
# Spawn all intervals concurrently for this symbol
self.logger.info(f"Starting concurrent bulk for {symbol} on {intervals}")
interval_tasks = [
asyncio.create_task(
self._bulk_download_one_interval(symbol, interval, start_date, end_date),
name=f"bulk_{symbol}_{interval}"
)
for interval in intervals
]
results = await asyncio.gather(*interval_tasks, return_exceptions=True)
# Mark final status
if any(isinstance(r, Exception) for r in results):
self.download_progress[symbol]["status"] = "error"
self.download_progress[symbol]["error"] = "One or more intervals failed"
else:
self.download_progress[symbol]["status"] = "completed"
self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat()
async def _bulk_download_one_interval(
self,
symbol: str,
interval: str,
start_date: datetime,
end_date: datetime
):
"""Run the bulk downloader for a single symbol+interval and then compute indicators."""
# Update status
sp = self.download_progress.setdefault(symbol, {"intervals": {}})
sp["intervals"].setdefault(interval, {"status": "pending", "records": 0})
sp["intervals"][interval]["status"] = "checking"
records_count = await self._collect_historical_klines(symbol, interval, start_date, end_date)
if records_count > 0:
sp["intervals"][interval]["status"] = "calculating_indicators"
sp["intervals"][interval]["records"] = records_count
await self._calculate_and_store_indicators(symbol, interval)
sp["intervals"][interval]["status"] = "completed"
self.logger.info(f"Completed {interval} data for {symbol} - {records_count} new records")
else:
sp["intervals"][interval]["status"] = "skipped_complete"
self.logger.info(f"Skipped {interval} for {symbol} - data already complete or no new records")
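A small usage sketch of how the progress structure built above could be polled while a backfill runs. The collector instance and the symbol are assumptions, and the per-symbol return shape is inferred from the dict initialized in _bulk_download_symbol_all_intervals.

import asyncio

async def watch_backfill(collector, symbol: str = "BTCUSDT") -> None:
    # Poll until the symbol's bulk download finishes or errors
    while True:
        progress = await collector.get_download_progress(symbol)
        intervals = progress.get("intervals", {})
        print(progress.get("status"),
              {iv: state.get("status") for iv, state in intervals.items()})
        if progress.get("status") in ("completed", "error"):
            break
        await asyncio.sleep(5)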
# ---------------------------
# Intelligent bulk downloader
# ---------------------------
async def bulk_download_historical_data( async def bulk_download_historical_data(
self, symbol: str, start_date: datetime, end_date: Optional[datetime] = None, self, symbol: str, start_date: datetime, end_date: Optional[datetime] = None,
intervals: Optional[List[str]] = None intervals: Optional[List[str]] = None
@@ -92,6 +239,7 @@ class BinanceDataCollector:
""" """
Bulk download historical OHLCV data from Binance with intelligent gap detection. Bulk download historical OHLCV data from Binance with intelligent gap detection.
Only downloads data that doesn't already exist in the database. Only downloads data that doesn't already exist in the database.
Note: kept for API/UI compatibility; orchestration now prefers start_bulk_download_for_all_pairs.
""" """
async with self._download_semaphore: async with self._download_semaphore:
if end_date is None: if end_date is None:
@@ -107,7 +255,8 @@ class BinanceDataCollector:
# Get intervals # Get intervals
if intervals is None: if intervals is None:
intervals = config.get("collection", {}).get("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"]) intervals = config.get("collection", {}).get("candle_intervals",
["1m", "5m", "15m", "1h", "4h", "1d"])
# Initialize progress tracking # Initialize progress tracking
self.download_progress[symbol] = { self.download_progress[symbol] = {
@@ -115,38 +264,23 @@ class BinanceDataCollector:
"intervals": {}, "intervals": {},
"start_time": datetime.now(timezone.utc).isoformat() "start_time": datetime.now(timezone.utc).isoformat()
} }
for interval in intervals: for interval in intervals:
self.download_progress[symbol]["intervals"][interval] = { self.download_progress[symbol]["intervals"][interval] = {"status": "pending", "records": 0}
"status": "pending",
"records": 0
}
try: try:
for interval in intervals:
self.logger.info(f"Processing {interval} data for {symbol}")
self.download_progress[symbol]["intervals"][interval]["status"] = "checking"
# Intelligent download - only missing data
records_count = await self._collect_historical_klines(
symbol, interval, start_date, end_date
)
if records_count > 0:
self.download_progress[symbol]["intervals"][interval]["status"] = "calculating_indicators"
self.download_progress[symbol]["intervals"][interval]["records"] = records_count
# Calculate indicators for new data
await self._calculate_and_store_indicators(symbol, interval)
self.download_progress[symbol]["intervals"][interval]["status"] = "completed"
self.logger.info(f"Completed {interval} data for {symbol} - {records_count} new records")
else:
self.download_progress[symbol]["intervals"][interval]["status"] = "skipped_complete"
self.logger.info(f"Skipped {interval} for {symbol} - data already complete")

# Run intervals concurrently to improve throughput for one symbol
tasks = [
asyncio.create_task(self._bulk_download_one_interval(symbol, interval, start_date, end_date),
name=f"bulk_single_{symbol}_{interval}")
for interval in intervals
]
await asyncio.gather(*tasks)
self.download_progress[symbol]["status"] = "completed" self.download_progress[symbol]["status"] = "completed"
self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat() self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat()
except Exception as e: except Exception as e:
self.logger.error(f"Error in bulk download for {symbol}: {e}") self.logger.error(f"Error in bulk download for {symbol}: {e}", exc_info=True)
self.download_progress[symbol]["status"] = "error" self.download_progress[symbol]["status"] = "error"
self.download_progress[symbol]["error"] = str(e) self.download_progress[symbol]["error"] = str(e)
raise raise
@@ -172,7 +306,7 @@ class BinanceDataCollector:
) )
# If coverage is complete, skip download # If coverage is complete, skip download
if coverage_check['is_complete']: if coverage_check.get('is_complete'):
self.logger.info( self.logger.info(
f"Skipping {symbol} {interval} - data already complete " f"Skipping {symbol} {interval} - data already complete "
f"({coverage_check['coverage_percent']:.2f}% coverage)" f"({coverage_check['coverage_percent']:.2f}% coverage)"
@@ -183,6 +317,7 @@ class BinanceDataCollector:
missing_ranges = await db_manager.get_missing_time_ranges( missing_ranges = await db_manager.get_missing_time_ranges(
symbol, interval, start_date, end_date symbol, interval, start_date, end_date
) )
if not missing_ranges: if not missing_ranges:
self.logger.info(f"No missing data ranges for {symbol} {interval}") self.logger.info(f"No missing data ranges for {symbol} {interval}")
return 0 return 0
@@ -205,7 +340,6 @@ class BinanceDataCollector:
symbol, interval, range_start, range_end symbol, interval, range_start, range_end
) )
total_new_records += records_in_range total_new_records += records_in_range
self.logger.info( self.logger.info(
f"Downloaded {records_in_range} records for range {idx}/{len(missing_ranges)}" f"Downloaded {records_in_range} records for range {idx}/{len(missing_ranges)}"
) )
@@ -249,104 +383,132 @@ class BinanceDataCollector:
"T": int(kline_row[6]), "T": int(kline_row[6]),
"s": symbol.upper(), "s": symbol.upper(),
"i": interval, "i": interval,
"f": None, # first trade id (unknown from REST row) "f": None, # first trade id
"L": None, # last trade id (unknown) "L": None, # last trade id
"o": str(kline_row[1]), "o": str(kline_row[1]),
"c": str(kline_row[4]), "c": str(kline_row[4]),
"h": str(kline_row[2]), "h": str(kline_row[2]),
"l": str(kline_row[3]), "l": str(kline_row[3]),
"v": str(kline_row[5]), "v": str(kline_row[5]),
"n": int(kline_row[8]), "n": int(kline_row[8]),
"x": True, # REST klines are for closed candles "x": True, # closed candle
"q": str(kline_row[7]), "q": str(kline_row[7]),
"V": None, # taker buy base asset volume (optional) "V": None, # taker buy base vol (optional)
"Q": None, # taker buy quote asset volume (optional) "Q": None, # taker buy quote vol (optional)
"B": None # ignore "B": None # ignore
} }
} }
async def _get_historical_klines_async(
self, symbol: str, interval: str, start_ms: int, end_ms: int, limit: int
) -> List[List[Any]]:
"""
Run python-binance get_historical_klines in a worker thread to avoid blocking the event loop.
"""
def call():
return self.client.get_historical_klines(
symbol=symbol,
interval=interval,
start_str=start_ms,
end_str=end_ms,
limit=limit
)
return await asyncio.to_thread(call)
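The same offloading idea in isolation: asyncio.to_thread (Python 3.9+) pushes a blocking call onto a worker thread so the event loop keeps serving websockets and schedulers. The blocking function here is a stand-in, not a Binance call.

import asyncio
import time

def blocking_fetch(n: int) -> list:
    # Placeholder for a synchronous, network-bound client call
    time.sleep(0.5)
    return list(range(n))

async def fetch_without_blocking_loop() -> list:
    # Runs blocking_fetch in a worker thread; other coroutines keep running
    return await asyncio.to_thread(blocking_fetch, 5)

print(asyncio.run(fetch_without_blocking_loop()))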
async def _download_time_range( async def _download_time_range(
self, symbol: str, interval: str, start_date: datetime, end_date: datetime self, symbol: str, interval: str, start_date: datetime, end_date: datetime
) -> int: ) -> int:
""" """
Download data for a specific time range (internal method). Download data for a specific time range (internal method).
This is the actual download logic extracted from the original collect_historical_klines.
Returns: Returns:
Number of records downloaded and inserted Number of records downloaded and inserted
""" """
global config, db_manager global config, db_manager
chunk_size = config.get("collection", {}).get("bulk_chunk_size", 1000)
max_retries = config.get("collection", {}).get("max_retries", 3)
retry_delay = config.get("collection", {}).get("retry_delay", 1)

# Resolve batch size and retry policy, prefer config then env
chunk_size = int(config.get("collection", {}).get("bulk_chunk_size",
int(os.getenv("BULK_DOWNLOAD_BATCH_SIZE", "1000"))))
max_retries = int(config.get("collection", {}).get("max_retries",
int(os.getenv("MAX_RETRIES", "3"))))
retry_delay = float(config.get("collection", {}).get("retry_delay", 1))
# Normalize time inputs that might be naive time objects # Normalize time inputs
from datetime import time as dt_time from datetime import time as dt_time
if isinstance(start_date, dt_time): if isinstance(start_date, dt_time):
# Use today's date in UTC for safety if only a time is provided
start_date = datetime.combine(datetime.now(timezone.utc).date(), start_date) start_date = datetime.combine(datetime.now(timezone.utc).date(), start_date)
if isinstance(end_date, dt_time): if isinstance(end_date, dt_time):
# Use the same date as start_date if possible for consistency
base_date = start_date.date() if isinstance(start_date, datetime) else datetime.now(timezone.utc).date() base_date = start_date.date() if isinstance(start_date, datetime) else datetime.now(timezone.utc).date()
end_date = datetime.combine(base_date, end_date) end_date = datetime.combine(base_date, end_date)
if start_date.tzinfo is None: if start_date.tzinfo is None:
start_date = start_date.replace(tzinfo=timezone.utc) start_date = start_date.replace(tzinfo=timezone.utc)
if end_date.tzinfo is None: if end_date.tzinfo is None:
end_date = end_date.replace(tzinfo=timezone.utc) end_date = end_date.replace(tzinfo=timezone.utc)
# Convert to naive UTC for Binance API # Keep tz-aware UTC; timestamps are converted to epoch milliseconds for the Binance API
current_start = start_date.replace(tzinfo=None) current_start = start_date.replace(tzinfo=timezone.utc)
end = end_date.replace(tzinfo=None) end = end_date.replace(tzinfo=timezone.utc)
total_records = 0 total_records = 0
retry_count = 0 consecutive_empty = 0
while current_start < end: while current_start < end:
try: try:
# Calculate chunk end time based on interval # Calculate chunk end time based on interval
chunk_end = self._calculate_chunk_end(current_start, interval, chunk_size) chunk_end = self._calculate_chunk_end(current_start, interval, chunk_size)
chunk_end = min(chunk_end, end) if chunk_end > end:
chunk_end = end
# Get klines from Binance with retry logic
klines: Optional[List[List[Any]]] = None klines: Optional[List[List[Any]]] = None
# Try with retry policy; also handle rate-limit backoffs
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
klines = self.client.get_historical_klines( klines = await self._get_historical_klines_async(
symbol=symbol, symbol=symbol,
interval=interval, interval=interval,
start_str=int(current_start.timestamp() * 1000), start_ms=int(current_start.timestamp() * 1000),
end_str=int(chunk_end.timestamp() * 1000), end_ms=int(chunk_end.timestamp() * 1000),
limit=chunk_size limit=chunk_size
) )
break break
except BinanceAPIException as e: except BinanceAPIException as e:
if e.code == -1003: # Rate limit if e.code == -1003: # Rate limit
wait_time = retry_delay * (2 ** attempt) wait_time = retry_delay * (2 ** attempt)
self.logger.warning(f"Rate limit hit, waiting {wait_time}s before retry") self.logger.warning(f"Rate limit hit for {symbol} {interval}, waiting {wait_time}s")
await asyncio.sleep(wait_time) await asyncio.sleep(wait_time)
else: else:
raise
except Exception as e:
if attempt == max_retries - 1:
raise
self.logger.warning(f"Attempt {attempt + 1} failed: {e}")

self.logger.error(f"Binance API exception for {symbol} {interval}: {e}")
if attempt == max_retries - 1:
raise
await asyncio.sleep(retry_delay)
except Exception as e:
self.logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for {symbol} {interval}: {e}")
if attempt == max_retries - 1:
raise
await asyncio.sleep(retry_delay) await asyncio.sleep(retry_delay)
# No data returned; advance conservatively or terminate
if not klines or len(klines) == 0: if not klines or len(klines) == 0:
self.logger.info(f"No more data available for {symbol} {interval}")
consecutive_empty += 1
# If multiple consecutive empty chunks, assume past available history
if consecutive_empty >= 2:
self.logger.info(f"No more data available for {symbol} {interval}; ending range loop")
break break
# Otherwise, advance by one chunk and continue
current_start = chunk_end + timedelta(milliseconds=1)
await asyncio.sleep(0.05)
continue
# Reset empty counter on success
consecutive_empty = 0
# Parse and store klines # Parse and store klines
ohlcv_data: List[Dict[str, Any]] = [] ohlcv_data: List[Dict[str, Any]] = []
for kline in klines: for kline in klines:
try: try:
# Normalize to WebSocket-style event expected by parse_kline_data
ws_event = self._rest_kline_to_ws_event(symbol, interval, kline) ws_event = self._rest_kline_to_ws_event(symbol, interval, kline)
parsed_data = parse_kline_data(ws_event) parsed_data = parse_kline_data(ws_event)
ohlcv_data.append(parsed_data) ohlcv_data.append(parsed_data)
except Exception as e: except Exception as e:
# Keep original message to aid debugging if structure differs
self.logger.error(f"Error parsing kline data: {e} | raw={kline!r}") self.logger.error(f"Error parsing kline data: {e} | raw={kline!r}")
continue continue
@@ -356,57 +518,63 @@ class BinanceDataCollector:
total_records += len(ohlcv_data) total_records += len(ohlcv_data)
# Update progress # Update progress
if symbol in self.download_progress and interval in self.download_progress[symbol]["intervals"]: if symbol in self.download_progress and \
interval in self.download_progress[symbol].get("intervals", {}):
self.download_progress[symbol]["intervals"][interval]["records"] = total_records self.download_progress[symbol]["intervals"][interval]["records"] = total_records
self.logger.debug(f"Stored {len(ohlcv_data)} {interval} candles for {symbol} (total: {total_records})") self.logger.debug(
f"Stored {len(ohlcv_data)} {interval} candles for {symbol} (total: {total_records})"
)
# Update current_start for next chunk # Update current_start based on last candle close time
if klines: if klines:
last_close_time_ms = klines[-1][6] # Use the close time of the last kline last_close_time_ms = int(klines[-1][6]) # close time ms
current_start = datetime.utcfromtimestamp((last_close_time_ms + 1) / 1000) # Advance by 1 ms past last close to avoid duplicate fetch
current_start = datetime.utcfromtimestamp((last_close_time_ms + 1) / 1000).replace(tzinfo=timezone.utc)
else: else:
break break
# Delay to respect rate limits # Light delay to avoid hammering
await asyncio.sleep(0.2) await asyncio.sleep(0.05)
retry_count = 0 # Reset retry count on success
except BinanceAPIException as e:
retry_count += 1
self.logger.error(f"Binance API error (attempt {retry_count}): {e}")
if retry_count >= max_retries:
self.logger.error(f"Max retries reached for {symbol} {interval}")
raise
# Exponential backoff
wait_time = retry_delay * (2 ** retry_count)
await asyncio.sleep(wait_time)
except asyncio.CancelledError: except asyncio.CancelledError:
self.logger.info(f"Download for {symbol} {interval} cancelled") self.logger.info(f"Download for {symbol} {interval} cancelled")
break break
except Exception as e: except Exception as e:
self.logger.error(f"Error collecting {interval} data for {symbol}: {e}", exc_info=True) self.logger.error(f"Error collecting {interval} data for {symbol}: {e}", exc_info=True)
raise # Backoff before continuing or aborting loop
await asyncio.sleep(max(0.5, retry_delay))
# Keep iterating rather than aborting the whole range: persistent failures will
# surface again through the per-request retry logic, and continuing is safer for coverage
return total_records return total_records
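_calculate_chunk_end is referenced above but not shown in this diff; purely as an assumption, it presumably advances the start time by chunk_size candles of the requested interval, along these lines.

from datetime import datetime, timedelta

# Hypothetical helper; the real implementation is not part of this diff
_INTERVAL_SECONDS = {
    "1m": 60, "5m": 300, "15m": 900,
    "1h": 3600, "4h": 14400, "1d": 86400,
}

def _calculate_chunk_end(start: datetime, interval: str, chunk_size: int) -> datetime:
    # One chunk spans chunk_size candles of the requested interval
    return start + timedelta(seconds=_INTERVAL_SECONDS.get(interval, 60) * chunk_size)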
# ---------------------------
# Technical indicators
# ---------------------------
async def _calculate_and_store_indicators(self, symbol: str, interval: str): async def _calculate_and_store_indicators(self, symbol: str, interval: str):
"""Calculate and store technical indicators for a symbol and interval""" """Calculate and store technical indicators for a symbol and interval"""
try: try:
# Check if indicators are enabled for this interval # Check if indicators are enabled for this interval
indicator_config = config.get('technical_indicators', {}) indicator_config = config.get('technical_indicators', {})
calc_intervals = indicator_config.get('calculation_intervals', ['1m', '5m', '15m', '1h', '4h', '1d']) calc_intervals = indicator_config.get('calculation_intervals',
['1m', '5m', '15m', '1h', '4h', '1d'])
if interval not in calc_intervals: if interval not in calc_intervals:
self.logger.debug(f"Skipping indicators for {symbol} {interval} (not in calculation_intervals)") self.logger.debug(
f"Skipping indicators for {symbol} {interval} (not in calculation_intervals)"
)
return return
# Get OHLCV data from database (need enough for longest indicator period) # Get OHLCV data from database (need enough for longest indicator period)
max_period = 200 # Maximum period for indicators like SMA-200 max_period = 200 # SMA-200, etc.
ohlcv_data = await db_manager.get_ohlcv_data(symbol, interval, limit=max_period + 50) ohlcv_data = await db_manager.get_ohlcv_data(symbol, interval, limit=max_period + 50)
if len(ohlcv_data) < 50: # Need minimum data for indicators
self.logger.warning(f"Not enough data for indicators: {symbol} {interval} ({len(ohlcv_data)} records)") if len(ohlcv_data) < 50:
self.logger.warning(
f"Not enough data for indicators: {symbol} {interval} ({len(ohlcv_data)} records)"
)
return return
# Convert to DataFrame # Convert to DataFrame
@@ -429,11 +597,21 @@ class BinanceDataCollector:
# Store indicators in database # Store indicators in database
if indicators_data: if indicators_data:
await db_manager.insert_indicators_batch(symbol, interval, indicators_data) await db_manager.insert_indicators_batch(symbol, interval, indicators_data)
self.logger.info(f"Stored {len(indicators_data)} indicator values for {symbol} {interval}") self.logger.info(
f"Stored {len(indicators_data)} indicator values for {symbol} {interval}"
)
except asyncio.CancelledError: except asyncio.CancelledError:
self.logger.info(f"Indicator calculation cancelled for {symbol} {interval}") self.logger.info(f"Indicator calculation cancelled for {symbol} {interval}")
except Exception as e: except Exception as e:
self.logger.error(f"Error calculating indicators for {symbol} {interval}: {e}", exc_info=True) self.logger.error(
f"Error calculating indicators for {symbol} {interval}: {e}",
exc_info=True
)
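As a point of reference for the SMA-200 mention above, a minimal pandas sketch of one such indicator; the DataFrame and column names are assumptions, not this project's schema.

import pandas as pd

# Hypothetical frame; the project builds its own from db_manager rows
df = pd.DataFrame({"close": [float(i) for i in range(1, 301)]})

# Simple moving averages over the closing price
df["sma_50"] = df["close"].rolling(window=50).mean()
df["sma_200"] = df["close"].rolling(window=200).mean()
print(df[["sma_50", "sma_200"]].tail())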
# ---------------------------
# Gap detection and filling
# ---------------------------
async def auto_fill_gaps( async def auto_fill_gaps(
self, self,
@@ -443,6 +621,7 @@ class BinanceDataCollector:
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Automatically fill gaps for a symbol Automatically fill gaps for a symbol
Args: Args:
symbol: Trading pair symbol symbol: Trading pair symbol
intervals: List of intervals to fill (default: from config) intervals: List of intervals to fill (default: from config)
@@ -455,9 +634,11 @@ class BinanceDataCollector:
global config, db_manager global config, db_manager
if intervals is None: if intervals is None:
intervals = config.get('gap_filling', {}).get('intervals_to_monitor', ['1m', '5m', '15m', '1h', '4h', '1d']) intervals = config.get('gap_filling', {}).get('intervals_to_monitor',
['1m', '5m', '15m', '1h', '4h', '1d'])
self.logger.info(f"Starting auto gap fill for {symbol} on intervals: {intervals}") self.logger.info(f"Starting auto gap fill for {symbol} on intervals: {intervals}")
results: Dict[str, Any] = { results: Dict[str, Any] = {
'symbol': symbol, 'symbol': symbol,
'intervals': {}, 'intervals': {},
@@ -472,20 +653,23 @@ class BinanceDataCollector:
self.logger.warning(f"Symbol {symbol} not found in config") self.logger.warning(f"Symbol {symbol} not found in config")
return results return results
record_from_date = pair_config.get('record_from_date')
if not record_from_date:
record_from_date = config.get('collection', {}).get('default_record_from_date', '2020-01-01T00:00:00Z')
_ = datetime.fromisoformat(record_from_date.replace('Z', '+00:00')) # kept for future use

record_from_date_iso = pair_config.get('record_from_date') or \
config.get('collection', {}).get('default_record_from_date', '2020-01-01T00:00:00Z')
_ = datetime.fromisoformat(record_from_date_iso.replace('Z', '+00:00')) # reserved
gap_config = config.get('gap_filling', {}) gap_config = config.get('gap_filling', {})
max_gap_size = gap_config.get('max_gap_size_candles', 1000) max_gap_size = gap_config.get('max_gap_size_candles', 1000)
max_attempts = int(gap_config.get('max_fill_attempts',
int(config.get("collection", {}).get("max_retries",
int(os.getenv("MAX_RETRIES", "3"))))))
averaging_lookback = gap_config.get('averaging_lookback_candles', 10)
max_empty_seq = gap_config.get('max_consecutive_empty_candles', 5)
for interval in intervals: for interval in intervals:
self.logger.info(f"Checking gaps for {symbol} {interval}") self.logger.info(f"Checking gaps for {symbol} {interval}")
# Detect gaps # Detect gaps
gaps_info = await db_manager.detect_gaps(symbol, interval) gaps_info = await db_manager.detect_gaps(symbol, interval)
gaps = gaps_info.get('gaps', []) gaps = gaps_info.get('gaps', [])
interval_result = { interval_result = {
'gaps_found': len(gaps), 'gaps_found': len(gaps),
'gaps_filled': 0, 'gaps_filled': 0,
@@ -493,7 +677,6 @@ class BinanceDataCollector:
'errors': [] 'errors': []
} }
# Fill downloadable gaps
for gap in gaps: for gap in gaps:
missing_candles = gap['missing_candles'] missing_candles = gap['missing_candles']
# Skip if gap is too large # Skip if gap is too large
@@ -503,32 +686,48 @@ class BinanceDataCollector:
continue continue
try: try:
# Download missing data
gap_start = datetime.fromisoformat(gap['gap_start']) gap_start = datetime.fromisoformat(gap['gap_start'])
gap_end = datetime.fromisoformat(gap['gap_end']) gap_end = datetime.fromisoformat(gap['gap_end'])
self.logger.info(f"Filling gap: {gap_start} to {gap_end}") self.logger.info(f"Filling gap: {gap_start} to {gap_end}")
records_count = await self._collect_historical_klines(
symbol, interval, gap_start, gap_end
)

# Attempt multiple real fills before resorting to averaging
real_filled_records = 0
for attempt in range(1, max_attempts + 1):
try:
# Small buffer around the gap to ensure edges are covered
buffered_start = gap_start - timedelta(milliseconds=1)
buffered_end = gap_end + timedelta(milliseconds=1)
added = await self._collect_historical_klines(
symbol, interval, buffered_start, buffered_end
)
real_filled_records += added
if added > 0:
# A successful fill; break early
break
except Exception as e:
# Log and continue attempts
interval_result['errors'].append(
f"Attempt {attempt} failed for gap {gap_start}->{gap_end}: {e}"
)
await asyncio.sleep(0.5 * attempt)
if records_count > 0: if real_filled_records > 0:
interval_result['gaps_filled'] += 1 interval_result['gaps_filled'] += 1
results['total_gaps_filled'] += 1 results['total_gaps_filled'] += 1
self.logger.info(f"Successfully filled gap with {records_count} records") self.logger.info(f"Successfully filled gap with {real_filled_records} records")
else: else:
# Genuine empty gap - fill with averages if enabled # Genuine empty gap - fill with averages if enabled
if fill_genuine_gaps: if fill_genuine_gaps and gap_config.get('enable_intelligent_averaging', True):
filled = await db_manager.fill_genuine_gaps_with_averages( filled = await db_manager.fill_genuine_gaps_with_averages(
symbol, interval, symbol, interval,
gap_config.get('max_consecutive_empty_candles', 5), max_empty_seq,
gap_config.get('averaging_lookback_candles', 10) averaging_lookback
) )
interval_result['genuine_filled'] += filled interval_result['genuine_filled'] += filled
results['total_genuine_filled'] += filled results['total_genuine_filled'] += filled
# Small delay between gaps # Small delay between gaps
await asyncio.sleep(0.5) await asyncio.sleep(0.2)
except Exception as e: except Exception as e:
error_msg = f"Error filling gap: {str(e)}" error_msg = f"Error filling gap: {str(e)}"
@@ -537,7 +736,7 @@ class BinanceDataCollector:
results['intervals'][interval] = interval_result results['intervals'][interval] = interval_result
# Calculate and store indicators after filling gaps # Calculate and store indicators after any fills
if interval_result['gaps_filled'] > 0 or interval_result['genuine_filled'] > 0: if interval_result['gaps_filled'] > 0 or interval_result['genuine_filled'] > 0:
try: try:
await self._calculate_and_store_indicators(symbol, interval) await self._calculate_and_store_indicators(symbol, interval)
@@ -554,7 +753,7 @@ class BinanceDataCollector:
async def start_auto_gap_fill_scheduler(self): async def start_auto_gap_fill_scheduler(self):
"""Start background task for automatic gap filling""" """Start background task for automatic gap filling"""
global config, db_manager global config
gap_config = config.get('gap_filling', {}) gap_config = config.get('gap_filling', {})
if not gap_config.get('enable_auto_gap_filling', False): if not gap_config.get('enable_auto_gap_filling', False):
self.logger.info("Auto gap filling is disabled") self.logger.info("Auto gap filling is disabled")
@@ -562,6 +761,7 @@ class BinanceDataCollector:
schedule_hours = gap_config.get('auto_fill_schedule_hours', 24) schedule_hours = gap_config.get('auto_fill_schedule_hours', 24)
self.logger.info(f"Starting auto gap fill scheduler (every {schedule_hours} hours)") self.logger.info(f"Starting auto gap fill scheduler (every {schedule_hours} hours)")
self.is_collecting = True
while self.is_collecting: while self.is_collecting:
try: try:
@@ -590,6 +790,10 @@ class BinanceDataCollector:
self.logger.error(f"Error in auto gap fill scheduler: {e}", exc_info=True) self.logger.error(f"Error in auto gap fill scheduler: {e}", exc_info=True)
await asyncio.sleep(3600) # Wait 1 hour on error await asyncio.sleep(3600) # Wait 1 hour on error
# ---------------------------
# WebSocket continuous streams
# ---------------------------
async def start_continuous_collection(self): async def start_continuous_collection(self):
"""Start continuous data collection via WebSocket""" """Start continuous data collection via WebSocket"""
if self.websocket_collection_running: if self.websocket_collection_running:
@@ -667,7 +871,6 @@ class BinanceDataCollector:
# Parse kline data # Parse kline data
ohlcv_data = parse_kline_data(data) ohlcv_data = parse_kline_data(data)
# Store in database # Store in database
await db_manager.insert_ohlcv_single(ohlcv_data) await db_manager.insert_ohlcv_single(ohlcv_data)
@@ -720,7 +923,10 @@ class BinanceDataCollector:
websocket_connections[stream_name] = websocket websocket_connections[stream_name] = websocket
tick_batch: List[Dict[str, Any]] = [] tick_batch: List[Dict[str, Any]] = []
batch_size = config.get('collection', {}).get('tick_batch_size', 100) batch_size = int(config.get('collection', {}).get(
'tick_batch_size',
int(os.getenv("TICK_BATCH_SIZE", "100"))
))
async for message in websocket: async for message in websocket:
if not self.websocket_collection_running: if not self.websocket_collection_running:
@@ -798,6 +1004,10 @@ class BinanceDataCollector:
websocket_connections.clear() websocket_connections.clear()
self.logger.info("Continuous data collection stopped") self.logger.info("Continuous data collection stopped")
# ---------------------------
# Candle generation from ticks
# ---------------------------
async def generate_candles_from_ticks( async def generate_candles_from_ticks(
self, self,
symbol: str, symbol: str,
@@ -864,6 +1074,10 @@ class BinanceDataCollector:
else: else:
self.logger.warning(f"No candles generated for {symbol} {interval}") self.logger.warning(f"No candles generated for {symbol} {interval}")
# ---------------------------
# Progress and cleanup
# ---------------------------
async def get_download_progress(self, symbol: str = None) -> Dict[str, Any]: async def get_download_progress(self, symbol: str = None) -> Dict[str, Any]:
"""Get download progress for a symbol or all symbols""" """Get download progress for a symbol or all symbols"""
if symbol: if symbol:
@@ -873,15 +1087,21 @@ class BinanceDataCollector:
async def cleanup(self): async def cleanup(self):
"""Clean up resources""" """Clean up resources"""
await self.stop_continuous_collection() await self.stop_continuous_collection()
# db_manager may have a close method; guard if absent # db_manager may have a close method; guard if absent
try: try:
if db_manager and hasattr(db_manager, "close"): if db_manager and hasattr(db_manager, "close"):
await db_manager.close() await db_manager.close()
except Exception as e: except Exception as e:
self.logger.warning(f"Error closing database manager: {e}") self.logger.warning(f"Error closing database manager: {e}")
self.logger.info("BinanceDataCollector cleanup complete") self.logger.info("BinanceDataCollector cleanup complete")
# ---------------------------
# UI process management
# ---------------------------
def start_ui_server(): def start_ui_server():
"""Start the UI server as a subprocess""" """Start the UI server as a subprocess"""
global ui_process global ui_process
@@ -916,7 +1136,7 @@ def start_ui_server():
elif ' - DEBUG - ' in line or 'DEBUG:' in line: elif ' - DEBUG - ' in line or 'DEBUG:' in line:
return logging.DEBUG return logging.DEBUG
else: else:
# Default to INFO for all other lines (including INFO: and standard messages) # Default to INFO for all other lines
return logging.INFO return logging.INFO
def log_ui_output(): def log_ui_output():
@@ -925,7 +1145,7 @@ def start_ui_server():
return return
for line in ui_process.stdout: for line in ui_process.stdout:
line = line.rstrip() line = line.rstrip()
if line: # Only log non-empty lines if line:
log_level = parse_log_level(line) log_level = parse_log_level(line)
logger.log(log_level, f"[UI] {line}") logger.log(log_level, f"[UI] {line}")
@@ -935,7 +1155,7 @@ def start_ui_server():
return return
for line in ui_process.stderr: for line in ui_process.stderr:
line = line.rstrip() line = line.rstrip()
if line: # Only log non-empty lines if line:
log_level = parse_log_level(line) log_level = parse_log_level(line)
logger.log(log_level, f"[UI] {line}") logger.log(log_level, f"[UI] {line}")
@@ -974,7 +1194,10 @@ def stop_ui_server():
logger.debug("UI server process not running") logger.debug("UI server process not running")
# ---------------------------
# Global signal handlers # Global signal handlers
# ---------------------------
def signal_handler(signum, frame): def signal_handler(signum, frame):
"""Handle shutdown signals""" """Handle shutdown signals"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -988,6 +1211,10 @@ def signal_handler(signum, frame):
task.cancel() task.cancel()
# ---------------------------
# Main entry point
# ---------------------------
async def main(): async def main():
"""Main application entry point""" """Main application entry point"""
# Setup signal handlers # Setup signal handlers
@@ -1003,7 +1230,14 @@ async def main():
# Start UI server # Start UI server
start_ui_server() start_ui_server()
# Start continuous collection # Optionally kick off an initial full backfill for all configured pairs
if config.get("collection", {}).get("initial_full_backfill", True):
# Launch as a background task so WebSockets can start immediately
task_name = "initial_full_backfill"
backfill_task = asyncio.create_task(collector.start_bulk_download_for_all_pairs(), name=task_name)
running_tasks[task_name] = backfill_task
# Start continuous collection (kline + trade streams) and gap scheduler
await collector.start_continuous_collection() await collector.start_continuous_collection()
# Keep the application running # Keep the application running
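The tuning knobs read throughout this diff (MAX_CONCURRENT_DOWNLOADS, MAX_CONCURRENT_GAP_FILLS, BULK_DOWNLOAD_BATCH_SIZE, TICK_BATCH_SIZE, MAX_RETRIES) come from the environment; one illustrative way to pin them before the collector starts, with values that are examples rather than recommendations:

import os

# Example values only; tune to your API weight limits and hardware
os.environ.setdefault("MAX_CONCURRENT_DOWNLOADS", "3")
os.environ.setdefault("MAX_CONCURRENT_GAP_FILLS", "2")
os.environ.setdefault("BULK_DOWNLOAD_BATCH_SIZE", "1000")
os.environ.setdefault("TICK_BATCH_SIZE", "100")
os.environ.setdefault("MAX_RETRIES", "3")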