Updated main to allow significantly more efficient downloading

2025-10-05 17:53:41 +01:00
parent 32ef7401e3
commit 4d0460944a

main.py (458 lines changed)

@@ -1,8 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
main.py - Complete Binance Trading Data Collection System main.py - Complete Binance Trading Data Collection System
Main application entry point with async data collection, websocket handling, and task management Main application entry point with async data collection, websocket handling, bulk
backfill orchestration, periodic gap scans, and task management.
""" """
import asyncio import asyncio
@@ -13,11 +16,11 @@ import json
import subprocess import subprocess
import os import os
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Any from typing import Dict, List, Optional, Any, Tuple
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
import websockets import websockets
import aiohttp import aiohttp # kept for future-proofing network ops
from binance.client import Client from binance.client import Client
from binance.exceptions import BinanceAPIException from binance.exceptions import BinanceAPIException
import pandas as pd import pandas as pd
@@ -52,11 +55,15 @@ class BinanceDataCollector:
self.websocket_collection_running = False self.websocket_collection_running = False
self.download_progress: Dict[str, Any] = {} self.download_progress: Dict[str, Any] = {}
# Concurrency controls from env with sensible defaults
max_downloads = int(os.getenv('MAX_CONCURRENT_DOWNLOADS', '3')) max_downloads = int(os.getenv('MAX_CONCURRENT_DOWNLOADS', '3'))
max_gap_fills = int(os.getenv('MAX_CONCURRENT_GAP_FILLS', '2')) max_gap_fills = int(os.getenv('MAX_CONCURRENT_GAP_FILLS', '2'))
self._download_semaphore = asyncio.Semaphore(max_downloads) self._download_semaphore = asyncio.Semaphore(max_downloads)
self._gap_fill_semaphore = asyncio.Semaphore(max_gap_fills) self._gap_fill_semaphore = asyncio.Semaphore(max_gap_fills)
self.logger.info(f"Initialized with max {max_downloads} concurrent downloads, {max_gap_fills} gap fills")
self.logger.info(
f"Initialized with max {max_downloads} concurrent downloads, {max_gap_fills} gap fills"
)
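For reference, a minimal standalone sketch of the pattern these semaphores enable: at most N downloads run at once and the rest wait for a slot. The symbol names and the sleep are placeholders, not from this codebase.

import asyncio
import os

async def bounded(sem: asyncio.Semaphore, coro):
    # Wait for a free slot before running the download
    async with sem:
        return await coro

async def demo():
    sem = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT_DOWNLOADS", "3")))

    async def fake_download(symbol: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for a real download
        return symbol

    # Only three of these run concurrently; the fourth waits for a slot
    return await asyncio.gather(*(bounded(sem, fake_download(s))
                                  for s in ["AAAUSDT", "BBBUSDT", "CCCUSDT", "DDDUSDT"]))

asyncio.run(demo())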
async def initialize(self): async def initialize(self):
"""Initialize the data collector""" """Initialize the data collector"""
@@ -64,10 +71,32 @@ class BinanceDataCollector:
# Setup logging # Setup logging
setup_logging() setup_logging()
self.logger.info("Initializing Binance Data Collector") self.logger.info("Initializing Binance Data Collector")
# Load configuration # Load configuration
config = load_config() config = load_config()
# Optional: add defaults for new flags
coll = config.setdefault("collection", {})
coll.setdefault("default_record_from_date", "2020-01-01T00:00:00Z")
coll.setdefault("initial_full_backfill", True)
coll.setdefault("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"])
coll.setdefault("bulk_chunk_size", int(os.getenv("BULK_DOWNLOAD_BATCH_SIZE", "1000")))
coll.setdefault("tick_batch_size", int(os.getenv("TICK_BATCH_SIZE", "100")))
coll.setdefault("max_retries", int(os.getenv("MAX_RETRIES", "3")))
coll.setdefault("retry_delay", 1)
gap = config.setdefault("gap_filling", {})
gap.setdefault("enable_auto_gap_filling", True)
gap.setdefault("auto_fill_schedule_hours", 24)
gap.setdefault("enable_intelligent_averaging", True)
gap.setdefault("max_fill_attempts", int(os.getenv("MAX_RETRIES", "3")))
gap.setdefault("intervals_to_monitor", coll.get("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"]))
gap.setdefault("max_gap_size_candles", 1000)
gap.setdefault("averaging_lookback_candles", 10)
gap.setdefault("max_consecutive_empty_candles", 5)
self.logger.info(f"Loaded configuration for {len(config['trading_pairs'])} trading pairs") self.logger.info(f"Loaded configuration for {len(config['trading_pairs'])} trading pairs")
# Initialize database # Initialize database
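For orientation, a plausible shape of config after the setdefault calls above run against a minimal config file. The trading_pairs entry is a made-up example; the remaining keys mirror the defaults shown in the diff.

config = {
    "trading_pairs": [
        # hypothetical entry; real files define their own pairs
        {"symbol": "BTCUSDT", "enabled": True, "priority": 1,
         "record_from_date": "2021-01-01T00:00:00Z"},
    ],
    "collection": {
        "default_record_from_date": "2020-01-01T00:00:00Z",
        "initial_full_backfill": True,
        "candle_intervals": ["1m", "5m", "15m", "1h", "4h", "1d"],
        "bulk_chunk_size": 1000,
        "tick_batch_size": 100,
        "max_retries": 3,
        "retry_delay": 1,
    },
    "gap_filling": {
        "enable_auto_gap_filling": True,
        "auto_fill_schedule_hours": 24,
        "enable_intelligent_averaging": True,
        "max_fill_attempts": 3,
        "intervals_to_monitor": ["1m", "5m", "15m", "1h", "4h", "1d"],
        "max_gap_size_candles": 1000,
        "averaging_lookback_candles": 10,
        "max_consecutive_empty_candles": 5,
    },
}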
@@ -85,6 +114,124 @@ class BinanceDataCollector:
self.client = Client() self.client = Client()
self.logger.info("Binance client initialized without API credentials (public data only)") self.logger.info("Binance client initialized without API credentials (public data only)")
# ---------------------------
# Bulk backfill orchestration
# ---------------------------
async def start_bulk_download_for_all_pairs(self):
"""
Automatically launch full-history downloads for all enabled pairs,
starting from record_from_date for each pair, across all configured intervals.
"""
global config
enabled_pairs = [p for p in config['trading_pairs'] if p.get('enabled', True)]
if not enabled_pairs:
self.logger.warning("No enabled trading pairs found for bulk backfill")
return
# Sort by priority ascending (1 = highest priority), then by symbol for a stable order
enabled_pairs.sort(key=lambda p: (int(p.get('priority', 1)), p.get('symbol', '')))
# Build tasks per symbol to respect MAX_CONCURRENT_DOWNLOADS at the symbol level
tasks: List[asyncio.Task] = []
now_utc = datetime.now(timezone.utc)
for pair in enabled_pairs:
symbol = pair['symbol'].upper()
start_iso = pair.get('record_from_date') or config["collection"]["default_record_from_date"]
try:
start_dt = datetime.fromisoformat(start_iso.replace("Z", "+00:00"))
except Exception:
self.logger.warning(f"Invalid record_from_date for {symbol}: {start_iso}, falling back to default")
start_dt = datetime.fromisoformat(config["collection"]["default_record_from_date"].replace("Z", "+00:00"))
# One task per symbol to execute all intervals concurrently for that symbol
tasks.append(asyncio.create_task(
self._bulk_download_symbol_all_intervals(symbol, start_dt, now_utc),
name=f"bulk_{symbol}"
))
# Execute with graceful progress logging
self.logger.info(f"Launching bulk backfill for {len(tasks)} symbols...")
results = await asyncio.gather(*tasks, return_exceptions=True)
errors = [r for r in results if isinstance(r, Exception)]
if errors:
self.logger.error(f"Bulk backfill completed with {len(errors)} errors; see logs for details")
else:
self.logger.info("Bulk backfill completed successfully for all symbols")
async def _bulk_download_symbol_all_intervals(
self,
symbol: str,
start_date: datetime,
end_date: Optional[datetime] = None
):
"""
Launch concurrent downloads of all configured intervals for one symbol,
bounded by the download semaphore to control exchange load.
"""
global config
async with self._download_semaphore:
end_date = end_date or datetime.now(timezone.utc)
# Ensure progress structure
intervals = config.get("collection", {}).get("candle_intervals",
["1m", "5m", "15m", "1h", "4h", "1d"])
self.download_progress[symbol] = {
"status": "running",
"intervals": {iv: {"status": "pending", "records": 0} for iv in intervals},
"start_time": datetime.now(timezone.utc).isoformat()
}
# Spawn all intervals concurrently for this symbol
self.logger.info(f"Starting concurrent bulk for {symbol} on {intervals}")
interval_tasks = [
asyncio.create_task(
self._bulk_download_one_interval(symbol, interval, start_date, end_date),
name=f"bulk_{symbol}_{interval}"
)
for interval in intervals
]
results = await asyncio.gather(*interval_tasks, return_exceptions=True)
# Mark final status
if any(isinstance(r, Exception) for r in results):
self.download_progress[symbol]["status"] = "error"
self.download_progress[symbol]["error"] = "One or more intervals failed"
else:
self.download_progress[symbol]["status"] = "completed"
self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat()
async def _bulk_download_one_interval(
self,
symbol: str,
interval: str,
start_date: datetime,
end_date: datetime
):
"""Run the bulk downloader for a single symbol+interval and then compute indicators."""
# Update status
sp = self.download_progress.setdefault(symbol, {"intervals": {}})
sp["intervals"].setdefault(interval, {"status": "pending", "records": 0})
sp["intervals"][interval]["status"] = "checking"
records_count = await self._collect_historical_klines(symbol, interval, start_date, end_date)
if records_count > 0:
sp["intervals"][interval]["status"] = "calculating_indicators"
sp["intervals"][interval]["records"] = records_count
await self._calculate_and_store_indicators(symbol, interval)
sp["intervals"][interval]["status"] = "completed"
self.logger.info(f"Completed {interval} data for {symbol} - {records_count} new records")
else:
sp["intervals"][interval]["status"] = "skipped_complete"
self.logger.info(f"Skipped {interval} for {symbol} - data already complete or no new records")
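A small usage sketch of how the progress structure built above could be polled while a backfill runs. The collector instance and the symbol are assumptions, and the per-symbol return shape is inferred from the dict initialized in _bulk_download_symbol_all_intervals.

import asyncio

async def watch_backfill(collector, symbol: str = "BTCUSDT") -> None:
    # Poll until the symbol's bulk download finishes or errors
    while True:
        progress = await collector.get_download_progress(symbol)
        intervals = progress.get("intervals", {})
        print(progress.get("status"),
              {iv: state.get("status") for iv, state in intervals.items()})
        if progress.get("status") in ("completed", "error"):
            break
        await asyncio.sleep(5)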
# ---------------------------
# Intelligent bulk downloader
# ---------------------------
async def bulk_download_historical_data( async def bulk_download_historical_data(
self, symbol: str, start_date: datetime, end_date: Optional[datetime] = None, self, symbol: str, start_date: datetime, end_date: Optional[datetime] = None,
intervals: Optional[List[str]] = None intervals: Optional[List[str]] = None
@@ -92,6 +239,7 @@ class BinanceDataCollector:
""" """
Bulk download historical OHLCV data from Binance with intelligent gap detection. Bulk download historical OHLCV data from Binance with intelligent gap detection.
Only downloads data that doesn't already exist in the database. Only downloads data that doesn't already exist in the database.
Note: kept for API/UI compatibility; orchestration now prefers start_bulk_download_for_all_pairs.
""" """
async with self._download_semaphore: async with self._download_semaphore:
if end_date is None: if end_date is None:
@@ -107,7 +255,8 @@ class BinanceDataCollector:
# Get intervals # Get intervals
if intervals is None: if intervals is None:
intervals = config.get("collection", {}).get("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"]) intervals = config.get("collection", {}).get("candle_intervals",
["1m", "5m", "15m", "1h", "4h", "1d"])
# Initialize progress tracking # Initialize progress tracking
self.download_progress[symbol] = { self.download_progress[symbol] = {
@@ -115,38 +264,23 @@ class BinanceDataCollector:
"intervals": {}, "intervals": {},
"start_time": datetime.now(timezone.utc).isoformat() "start_time": datetime.now(timezone.utc).isoformat()
} }
for interval in intervals: for interval in intervals:
self.download_progress[symbol]["intervals"][interval] = { self.download_progress[symbol]["intervals"][interval] = {"status": "pending", "records": 0}
"status": "pending",
"records": 0
}
try: try:
for interval in intervals:
self.logger.info(f"Processing {interval} data for {symbol}")
self.download_progress[symbol]["intervals"][interval]["status"] = "checking"
# Intelligent download - only missing data
records_count = await self._collect_historical_klines(
symbol, interval, start_date, end_date
)
if records_count > 0:
self.download_progress[symbol]["intervals"][interval]["status"] = "calculating_indicators"
self.download_progress[symbol]["intervals"][interval]["records"] = records_count
# Calculate indicators for new data
await self._calculate_and_store_indicators(symbol, interval)
self.download_progress[symbol]["intervals"][interval]["status"] = "completed"
self.logger.info(f"Completed {interval} data for {symbol} - {records_count} new records")
else:
self.download_progress[symbol]["intervals"][interval]["status"] = "skipped_complete"
self.logger.info(f"Skipped {interval} for {symbol} - data already complete")

# Run intervals concurrently to improve throughput for one symbol
tasks = [
asyncio.create_task(self._bulk_download_one_interval(symbol, interval, start_date, end_date),
name=f"bulk_single_{symbol}_{interval}")
for interval in intervals
]
await asyncio.gather(*tasks)
self.download_progress[symbol]["status"] = "completed" self.download_progress[symbol]["status"] = "completed"
self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat() self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat()
except Exception as e: except Exception as e:
self.logger.error(f"Error in bulk download for {symbol}: {e}") self.logger.error(f"Error in bulk download for {symbol}: {e}", exc_info=True)
self.download_progress[symbol]["status"] = "error" self.download_progress[symbol]["status"] = "error"
self.download_progress[symbol]["error"] = str(e) self.download_progress[symbol]["error"] = str(e)
raise raise
@@ -172,7 +306,7 @@ class BinanceDataCollector:
) )
# If coverage is complete, skip download # If coverage is complete, skip download
if coverage_check['is_complete']: if coverage_check.get('is_complete'):
self.logger.info( self.logger.info(
f"Skipping {symbol} {interval} - data already complete " f"Skipping {symbol} {interval} - data already complete "
f"({coverage_check['coverage_percent']:.2f}% coverage)" f"({coverage_check['coverage_percent']:.2f}% coverage)"
@@ -183,6 +317,7 @@ class BinanceDataCollector:
missing_ranges = await db_manager.get_missing_time_ranges( missing_ranges = await db_manager.get_missing_time_ranges(
symbol, interval, start_date, end_date symbol, interval, start_date, end_date
) )
if not missing_ranges: if not missing_ranges:
self.logger.info(f"No missing data ranges for {symbol} {interval}") self.logger.info(f"No missing data ranges for {symbol} {interval}")
return 0 return 0
@@ -205,7 +340,6 @@ class BinanceDataCollector:
symbol, interval, range_start, range_end symbol, interval, range_start, range_end
) )
total_new_records += records_in_range total_new_records += records_in_range
self.logger.info( self.logger.info(
f"Downloaded {records_in_range} records for range {idx}/{len(missing_ranges)}" f"Downloaded {records_in_range} records for range {idx}/{len(missing_ranges)}"
) )
@@ -249,104 +383,132 @@ class BinanceDataCollector:
"T": int(kline_row[6]), "T": int(kline_row[6]),
"s": symbol.upper(), "s": symbol.upper(),
"i": interval, "i": interval,
"f": None, # first trade id (unknown from REST row) "f": None, # first trade id
"L": None, # last trade id (unknown) "L": None, # last trade id
"o": str(kline_row[1]), "o": str(kline_row[1]),
"c": str(kline_row[4]), "c": str(kline_row[4]),
"h": str(kline_row[2]), "h": str(kline_row[2]),
"l": str(kline_row[3]), "l": str(kline_row[3]),
"v": str(kline_row[5]), "v": str(kline_row[5]),
"n": int(kline_row[8]), "n": int(kline_row[8]),
"x": True, # REST klines are for closed candles "x": True, # closed candle
"q": str(kline_row[7]), "q": str(kline_row[7]),
"V": None, # taker buy base asset volume (optional) "V": None, # taker buy base vol (optional)
"Q": None, # taker buy quote asset volume (optional) "Q": None, # taker buy quote vol (optional)
"B": None # ignore "B": None # ignore
} }
} }
async def _get_historical_klines_async(
self, symbol: str, interval: str, start_ms: int, end_ms: int, limit: int
) -> List[List[Any]]:
"""
Run python-binance get_historical_klines in a worker thread to avoid blocking the event loop.
"""
def call():
return self.client.get_historical_klines(
symbol=symbol,
interval=interval,
start_str=start_ms,
end_str=end_ms,
limit=limit
)
return await asyncio.to_thread(call)
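The same offloading idea in isolation: asyncio.to_thread (Python 3.9+) pushes a blocking call onto a worker thread so the event loop keeps serving websockets and schedulers. The blocking function here is a stand-in, not a Binance call.

import asyncio
import time

def blocking_fetch(n: int) -> list:
    # Placeholder for a synchronous, network-bound client call
    time.sleep(0.5)
    return list(range(n))

async def fetch_without_blocking_loop() -> list:
    # Runs blocking_fetch in a worker thread; other coroutines keep running
    return await asyncio.to_thread(blocking_fetch, 5)

print(asyncio.run(fetch_without_blocking_loop()))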
async def _download_time_range( async def _download_time_range(
self, symbol: str, interval: str, start_date: datetime, end_date: datetime self, symbol: str, interval: str, start_date: datetime, end_date: datetime
) -> int: ) -> int:
""" """
Download data for a specific time range (internal method). Download data for a specific time range (internal method).
This is the actual download logic extracted from the original collect_historical_klines.
Returns: Returns:
Number of records downloaded and inserted Number of records downloaded and inserted
""" """
global config, db_manager global config, db_manager
chunk_size = config.get("collection", {}).get("bulk_chunk_size", 1000)
max_retries = config.get("collection", {}).get("max_retries", 3)
retry_delay = config.get("collection", {}).get("retry_delay", 1)

# Resolve batch size and retry policy, prefer config then env
chunk_size = int(config.get("collection", {}).get("bulk_chunk_size",
int(os.getenv("BULK_DOWNLOAD_BATCH_SIZE", "1000"))))
max_retries = int(config.get("collection", {}).get("max_retries",
int(os.getenv("MAX_RETRIES", "3"))))
retry_delay = float(config.get("collection", {}).get("retry_delay", 1))
# Normalize time inputs that might be naive time objects # Normalize time inputs
from datetime import time as dt_time from datetime import time as dt_time
if isinstance(start_date, dt_time): if isinstance(start_date, dt_time):
# Use today's date in UTC for safety if only a time is provided
start_date = datetime.combine(datetime.now(timezone.utc).date(), start_date) start_date = datetime.combine(datetime.now(timezone.utc).date(), start_date)
if isinstance(end_date, dt_time): if isinstance(end_date, dt_time):
# Use the same date as start_date if possible for consistency
base_date = start_date.date() if isinstance(start_date, datetime) else datetime.now(timezone.utc).date() base_date = start_date.date() if isinstance(start_date, datetime) else datetime.now(timezone.utc).date()
end_date = datetime.combine(base_date, end_date) end_date = datetime.combine(base_date, end_date)
if start_date.tzinfo is None: if start_date.tzinfo is None:
start_date = start_date.replace(tzinfo=timezone.utc) start_date = start_date.replace(tzinfo=timezone.utc)
if end_date.tzinfo is None: if end_date.tzinfo is None:
end_date = end_date.replace(tzinfo=timezone.utc) end_date = end_date.replace(tzinfo=timezone.utc)
# Convert to naive UTC for Binance API # Keep tz-aware UTC; timestamps are converted to epoch milliseconds for the Binance API
current_start = start_date.replace(tzinfo=None) current_start = start_date.replace(tzinfo=timezone.utc)
end = end_date.replace(tzinfo=None) end = end_date.replace(tzinfo=timezone.utc)
total_records = 0 total_records = 0
retry_count = 0 consecutive_empty = 0
while current_start < end: while current_start < end:
try: try:
# Calculate chunk end time based on interval # Calculate chunk end time based on interval
chunk_end = self._calculate_chunk_end(current_start, interval, chunk_size) chunk_end = self._calculate_chunk_end(current_start, interval, chunk_size)
chunk_end = min(chunk_end, end) if chunk_end > end:
chunk_end = end
# Get klines from Binance with retry logic
klines: Optional[List[List[Any]]] = None klines: Optional[List[List[Any]]] = None
# Try with retry policy; also handle rate-limit backoffs
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
klines = self.client.get_historical_klines( klines = await self._get_historical_klines_async(
symbol=symbol, symbol=symbol,
interval=interval, interval=interval,
start_str=int(current_start.timestamp() * 1000), start_ms=int(current_start.timestamp() * 1000),
end_str=int(chunk_end.timestamp() * 1000), end_ms=int(chunk_end.timestamp() * 1000),
limit=chunk_size limit=chunk_size
) )
break break
except BinanceAPIException as e: except BinanceAPIException as e:
if e.code == -1003: # Rate limit if e.code == -1003: # Rate limit
wait_time = retry_delay * (2 ** attempt) wait_time = retry_delay * (2 ** attempt)
self.logger.warning(f"Rate limit hit, waiting {wait_time}s before retry") self.logger.warning(f"Rate limit hit for {symbol} {interval}, waiting {wait_time}s")
await asyncio.sleep(wait_time) await asyncio.sleep(wait_time)
else: else:
raise
except Exception as e:
if attempt == max_retries - 1:
raise
self.logger.warning(f"Attempt {attempt + 1} failed: {e}")

self.logger.error(f"Binance API exception for {symbol} {interval}: {e}")
if attempt == max_retries - 1:
raise
await asyncio.sleep(retry_delay)
except Exception as e:
self.logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for {symbol} {interval}: {e}")
if attempt == max_retries - 1:
raise
await asyncio.sleep(retry_delay) await asyncio.sleep(retry_delay)
# No data returned; advance conservatively or terminate
if not klines or len(klines) == 0: if not klines or len(klines) == 0:
self.logger.info(f"No more data available for {symbol} {interval}")
consecutive_empty += 1
# If multiple consecutive empty chunks, assume past available history
if consecutive_empty >= 2:
self.logger.info(f"No more data available for {symbol} {interval}; ending range loop")
break break
# Otherwise, advance by one chunk and continue
current_start = chunk_end + timedelta(milliseconds=1)
await asyncio.sleep(0.05)
continue
# Reset empty counter on success
consecutive_empty = 0
# Parse and store klines # Parse and store klines
ohlcv_data: List[Dict[str, Any]] = [] ohlcv_data: List[Dict[str, Any]] = []
for kline in klines: for kline in klines:
try: try:
# Normalize to WebSocket-style event expected by parse_kline_data
ws_event = self._rest_kline_to_ws_event(symbol, interval, kline) ws_event = self._rest_kline_to_ws_event(symbol, interval, kline)
parsed_data = parse_kline_data(ws_event) parsed_data = parse_kline_data(ws_event)
ohlcv_data.append(parsed_data) ohlcv_data.append(parsed_data)
except Exception as e: except Exception as e:
# Keep original message to aid debugging if structure differs
self.logger.error(f"Error parsing kline data: {e} | raw={kline!r}") self.logger.error(f"Error parsing kline data: {e} | raw={kline!r}")
continue continue
@@ -356,57 +518,63 @@ class BinanceDataCollector:
total_records += len(ohlcv_data) total_records += len(ohlcv_data)
# Update progress # Update progress
if symbol in self.download_progress and interval in self.download_progress[symbol]["intervals"]: if symbol in self.download_progress and \
interval in self.download_progress[symbol].get("intervals", {}):
self.download_progress[symbol]["intervals"][interval]["records"] = total_records self.download_progress[symbol]["intervals"][interval]["records"] = total_records
self.logger.debug(f"Stored {len(ohlcv_data)} {interval} candles for {symbol} (total: {total_records})") self.logger.debug(
f"Stored {len(ohlcv_data)} {interval} candles for {symbol} (total: {total_records})"
)
# Update current_start for next chunk # Update current_start based on last candle close time
if klines: if klines:
last_close_time_ms = klines[-1][6] # Use the close time of the last kline last_close_time_ms = int(klines[-1][6]) # close time ms
current_start = datetime.utcfromtimestamp((last_close_time_ms + 1) / 1000) # Advance by 1 ms past last close to avoid duplicate fetch
current_start = datetime.utcfromtimestamp((last_close_time_ms + 1) / 1000).replace(tzinfo=timezone.utc)
else: else:
break break
# Delay to respect rate limits # Light delay to avoid hammering
await asyncio.sleep(0.2) await asyncio.sleep(0.05)
retry_count = 0 # Reset retry count on success
except BinanceAPIException as e:
retry_count += 1
self.logger.error(f"Binance API error (attempt {retry_count}): {e}")
if retry_count >= max_retries:
self.logger.error(f"Max retries reached for {symbol} {interval}")
raise
# Exponential backoff
wait_time = retry_delay * (2 ** retry_count)
await asyncio.sleep(wait_time)
except asyncio.CancelledError: except asyncio.CancelledError:
self.logger.info(f"Download for {symbol} {interval} cancelled") self.logger.info(f"Download for {symbol} {interval} cancelled")
break break
except Exception as e: except Exception as e:
self.logger.error(f"Error collecting {interval} data for {symbol}: {e}", exc_info=True) self.logger.error(f"Error collecting {interval} data for {symbol}: {e}", exc_info=True)
raise # Backoff before continuing or aborting loop
await asyncio.sleep(max(0.5, retry_delay))
# Keep iterating rather than aborting the whole range: persistent failures will
# surface again through the per-request retry logic, and continuing is safer for coverage
return total_records return total_records
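_calculate_chunk_end is referenced above but not shown in this diff; purely as an assumption, it presumably advances the start time by chunk_size candles of the requested interval, along these lines.

from datetime import datetime, timedelta

# Hypothetical helper; the real implementation is not part of this diff
_INTERVAL_SECONDS = {
    "1m": 60, "5m": 300, "15m": 900,
    "1h": 3600, "4h": 14400, "1d": 86400,
}

def _calculate_chunk_end(start: datetime, interval: str, chunk_size: int) -> datetime:
    # One chunk spans chunk_size candles of the requested interval
    return start + timedelta(seconds=_INTERVAL_SECONDS.get(interval, 60) * chunk_size)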
# ---------------------------
# Technical indicators
# ---------------------------
async def _calculate_and_store_indicators(self, symbol: str, interval: str): async def _calculate_and_store_indicators(self, symbol: str, interval: str):
"""Calculate and store technical indicators for a symbol and interval""" """Calculate and store technical indicators for a symbol and interval"""
try: try:
# Check if indicators are enabled for this interval # Check if indicators are enabled for this interval
indicator_config = config.get('technical_indicators', {}) indicator_config = config.get('technical_indicators', {})
calc_intervals = indicator_config.get('calculation_intervals', ['1m', '5m', '15m', '1h', '4h', '1d']) calc_intervals = indicator_config.get('calculation_intervals',
['1m', '5m', '15m', '1h', '4h', '1d'])
if interval not in calc_intervals: if interval not in calc_intervals:
self.logger.debug(f"Skipping indicators for {symbol} {interval} (not in calculation_intervals)") self.logger.debug(
f"Skipping indicators for {symbol} {interval} (not in calculation_intervals)"
)
return return
# Get OHLCV data from database (need enough for longest indicator period) # Get OHLCV data from database (need enough for longest indicator period)
max_period = 200 # Maximum period for indicators like SMA-200 max_period = 200 # SMA-200, etc.
ohlcv_data = await db_manager.get_ohlcv_data(symbol, interval, limit=max_period + 50) ohlcv_data = await db_manager.get_ohlcv_data(symbol, interval, limit=max_period + 50)
if len(ohlcv_data) < 50: # Need minimum data for indicators
self.logger.warning(f"Not enough data for indicators: {symbol} {interval} ({len(ohlcv_data)} records)") if len(ohlcv_data) < 50:
self.logger.warning(
f"Not enough data for indicators: {symbol} {interval} ({len(ohlcv_data)} records)"
)
return return
# Convert to DataFrame # Convert to DataFrame
@@ -429,11 +597,21 @@ class BinanceDataCollector:
# Store indicators in database # Store indicators in database
if indicators_data: if indicators_data:
await db_manager.insert_indicators_batch(symbol, interval, indicators_data) await db_manager.insert_indicators_batch(symbol, interval, indicators_data)
self.logger.info(f"Stored {len(indicators_data)} indicator values for {symbol} {interval}") self.logger.info(
f"Stored {len(indicators_data)} indicator values for {symbol} {interval}"
)
except asyncio.CancelledError: except asyncio.CancelledError:
self.logger.info(f"Indicator calculation cancelled for {symbol} {interval}") self.logger.info(f"Indicator calculation cancelled for {symbol} {interval}")
except Exception as e: except Exception as e:
self.logger.error(f"Error calculating indicators for {symbol} {interval}: {e}", exc_info=True) self.logger.error(
f"Error calculating indicators for {symbol} {interval}: {e}",
exc_info=True
)
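As a point of reference for the SMA-200 mention above, a minimal pandas sketch of one such indicator; the DataFrame and column names are assumptions, not this project's schema.

import pandas as pd

# Hypothetical frame; the project builds its own from db_manager rows
df = pd.DataFrame({"close": [float(i) for i in range(1, 301)]})

# Simple moving averages over the closing price
df["sma_50"] = df["close"].rolling(window=50).mean()
df["sma_200"] = df["close"].rolling(window=200).mean()
print(df[["sma_50", "sma_200"]].tail())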
# ---------------------------
# Gap detection and filling
# ---------------------------
async def auto_fill_gaps( async def auto_fill_gaps(
self, self,
@@ -443,6 +621,7 @@ class BinanceDataCollector:
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Automatically fill gaps for a symbol Automatically fill gaps for a symbol
Args: Args:
symbol: Trading pair symbol symbol: Trading pair symbol
intervals: List of intervals to fill (default: from config) intervals: List of intervals to fill (default: from config)
@@ -455,9 +634,11 @@ class BinanceDataCollector:
global config, db_manager global config, db_manager
if intervals is None: if intervals is None:
intervals = config.get('gap_filling', {}).get('intervals_to_monitor', ['1m', '5m', '15m', '1h', '4h', '1d']) intervals = config.get('gap_filling', {}).get('intervals_to_monitor',
['1m', '5m', '15m', '1h', '4h', '1d'])
self.logger.info(f"Starting auto gap fill for {symbol} on intervals: {intervals}") self.logger.info(f"Starting auto gap fill for {symbol} on intervals: {intervals}")
results: Dict[str, Any] = { results: Dict[str, Any] = {
'symbol': symbol, 'symbol': symbol,
'intervals': {}, 'intervals': {},
@@ -472,20 +653,23 @@ class BinanceDataCollector:
self.logger.warning(f"Symbol {symbol} not found in config") self.logger.warning(f"Symbol {symbol} not found in config")
return results return results
record_from_date = pair_config.get('record_from_date')
if not record_from_date:
record_from_date = config.get('collection', {}).get('default_record_from_date', '2020-01-01T00:00:00Z')
_ = datetime.fromisoformat(record_from_date.replace('Z', '+00:00')) # kept for future use

record_from_date_iso = pair_config.get('record_from_date') or \
config.get('collection', {}).get('default_record_from_date', '2020-01-01T00:00:00Z')
_ = datetime.fromisoformat(record_from_date_iso.replace('Z', '+00:00')) # reserved
gap_config = config.get('gap_filling', {}) gap_config = config.get('gap_filling', {})
max_gap_size = gap_config.get('max_gap_size_candles', 1000) max_gap_size = gap_config.get('max_gap_size_candles', 1000)
max_attempts = int(gap_config.get('max_fill_attempts',
int(config.get("collection", {}).get("max_retries",
int(os.getenv("MAX_RETRIES", "3"))))))
averaging_lookback = gap_config.get('averaging_lookback_candles', 10)
max_empty_seq = gap_config.get('max_consecutive_empty_candles', 5)
for interval in intervals: for interval in intervals:
self.logger.info(f"Checking gaps for {symbol} {interval}") self.logger.info(f"Checking gaps for {symbol} {interval}")
# Detect gaps # Detect gaps
gaps_info = await db_manager.detect_gaps(symbol, interval) gaps_info = await db_manager.detect_gaps(symbol, interval)
gaps = gaps_info.get('gaps', []) gaps = gaps_info.get('gaps', [])
interval_result = { interval_result = {
'gaps_found': len(gaps), 'gaps_found': len(gaps),
'gaps_filled': 0, 'gaps_filled': 0,
@@ -493,7 +677,6 @@ class BinanceDataCollector:
'errors': [] 'errors': []
} }
# Fill downloadable gaps
for gap in gaps: for gap in gaps:
missing_candles = gap['missing_candles'] missing_candles = gap['missing_candles']
# Skip if gap is too large # Skip if gap is too large
@@ -503,32 +686,48 @@ class BinanceDataCollector:
continue continue
try: try:
# Download missing data
gap_start = datetime.fromisoformat(gap['gap_start']) gap_start = datetime.fromisoformat(gap['gap_start'])
gap_end = datetime.fromisoformat(gap['gap_end']) gap_end = datetime.fromisoformat(gap['gap_end'])
self.logger.info(f"Filling gap: {gap_start} to {gap_end}") self.logger.info(f"Filling gap: {gap_start} to {gap_end}")
records_count = await self._collect_historical_klines(
symbol, interval, gap_start, gap_end
)

# Attempt multiple real fills before resorting to averaging
real_filled_records = 0
for attempt in range(1, max_attempts + 1):
try:
# Small buffer around the gap to ensure edges are covered
buffered_start = gap_start - timedelta(milliseconds=1)
buffered_end = gap_end + timedelta(milliseconds=1)
added = await self._collect_historical_klines(
symbol, interval, buffered_start, buffered_end
)
real_filled_records += added
if added > 0:
# A successful fill; break early
break
except Exception as e:
# Log and continue attempts
interval_result['errors'].append(
f"Attempt {attempt} failed for gap {gap_start}->{gap_end}: {e}"
)
await asyncio.sleep(0.5 * attempt)
if records_count > 0: if real_filled_records > 0:
interval_result['gaps_filled'] += 1 interval_result['gaps_filled'] += 1
results['total_gaps_filled'] += 1 results['total_gaps_filled'] += 1
self.logger.info(f"Successfully filled gap with {records_count} records") self.logger.info(f"Successfully filled gap with {real_filled_records} records")
else: else:
# Genuine empty gap - fill with averages if enabled # Genuine empty gap - fill with averages if enabled
if fill_genuine_gaps: if fill_genuine_gaps and gap_config.get('enable_intelligent_averaging', True):
filled = await db_manager.fill_genuine_gaps_with_averages( filled = await db_manager.fill_genuine_gaps_with_averages(
symbol, interval, symbol, interval,
gap_config.get('max_consecutive_empty_candles', 5), max_empty_seq,
gap_config.get('averaging_lookback_candles', 10) averaging_lookback
) )
interval_result['genuine_filled'] += filled interval_result['genuine_filled'] += filled
results['total_genuine_filled'] += filled results['total_genuine_filled'] += filled
# Small delay between gaps # Small delay between gaps
await asyncio.sleep(0.5) await asyncio.sleep(0.2)
except Exception as e: except Exception as e:
error_msg = f"Error filling gap: {str(e)}" error_msg = f"Error filling gap: {str(e)}"
@@ -537,7 +736,7 @@ class BinanceDataCollector:
results['intervals'][interval] = interval_result results['intervals'][interval] = interval_result
# Calculate and store indicators after filling gaps # Calculate and store indicators after any fills
if interval_result['gaps_filled'] > 0 or interval_result['genuine_filled'] > 0: if interval_result['gaps_filled'] > 0 or interval_result['genuine_filled'] > 0:
try: try:
await self._calculate_and_store_indicators(symbol, interval) await self._calculate_and_store_indicators(symbol, interval)
@@ -554,7 +753,7 @@ class BinanceDataCollector:
async def start_auto_gap_fill_scheduler(self): async def start_auto_gap_fill_scheduler(self):
"""Start background task for automatic gap filling""" """Start background task for automatic gap filling"""
global config, db_manager global config
gap_config = config.get('gap_filling', {}) gap_config = config.get('gap_filling', {})
if not gap_config.get('enable_auto_gap_filling', False): if not gap_config.get('enable_auto_gap_filling', False):
self.logger.info("Auto gap filling is disabled") self.logger.info("Auto gap filling is disabled")
@@ -562,6 +761,7 @@ class BinanceDataCollector:
schedule_hours = gap_config.get('auto_fill_schedule_hours', 24) schedule_hours = gap_config.get('auto_fill_schedule_hours', 24)
self.logger.info(f"Starting auto gap fill scheduler (every {schedule_hours} hours)") self.logger.info(f"Starting auto gap fill scheduler (every {schedule_hours} hours)")
self.is_collecting = True
while self.is_collecting: while self.is_collecting:
try: try:
@@ -590,6 +790,10 @@ class BinanceDataCollector:
self.logger.error(f"Error in auto gap fill scheduler: {e}", exc_info=True) self.logger.error(f"Error in auto gap fill scheduler: {e}", exc_info=True)
await asyncio.sleep(3600) # Wait 1 hour on error await asyncio.sleep(3600) # Wait 1 hour on error
# ---------------------------
# WebSocket continuous streams
# ---------------------------
async def start_continuous_collection(self): async def start_continuous_collection(self):
"""Start continuous data collection via WebSocket""" """Start continuous data collection via WebSocket"""
if self.websocket_collection_running: if self.websocket_collection_running:
@@ -667,7 +871,6 @@ class BinanceDataCollector:
# Parse kline data # Parse kline data
ohlcv_data = parse_kline_data(data) ohlcv_data = parse_kline_data(data)
# Store in database # Store in database
await db_manager.insert_ohlcv_single(ohlcv_data) await db_manager.insert_ohlcv_single(ohlcv_data)
@@ -720,7 +923,10 @@ class BinanceDataCollector:
websocket_connections[stream_name] = websocket websocket_connections[stream_name] = websocket
tick_batch: List[Dict[str, Any]] = [] tick_batch: List[Dict[str, Any]] = []
batch_size = config.get('collection', {}).get('tick_batch_size', 100) batch_size = int(config.get('collection', {}).get(
'tick_batch_size',
int(os.getenv("TICK_BATCH_SIZE", "100"))
))
async for message in websocket: async for message in websocket:
if not self.websocket_collection_running: if not self.websocket_collection_running:
@@ -798,6 +1004,10 @@ class BinanceDataCollector:
websocket_connections.clear() websocket_connections.clear()
self.logger.info("Continuous data collection stopped") self.logger.info("Continuous data collection stopped")
# ---------------------------
# Candle generation from ticks
# ---------------------------
async def generate_candles_from_ticks( async def generate_candles_from_ticks(
self, self,
symbol: str, symbol: str,
@@ -864,6 +1074,10 @@ class BinanceDataCollector:
else: else:
self.logger.warning(f"No candles generated for {symbol} {interval}") self.logger.warning(f"No candles generated for {symbol} {interval}")
# ---------------------------
# Progress and cleanup
# ---------------------------
async def get_download_progress(self, symbol: str = None) -> Dict[str, Any]: async def get_download_progress(self, symbol: str = None) -> Dict[str, Any]:
"""Get download progress for a symbol or all symbols""" """Get download progress for a symbol or all symbols"""
if symbol: if symbol:
@@ -873,15 +1087,21 @@ class BinanceDataCollector:
async def cleanup(self): async def cleanup(self):
"""Clean up resources""" """Clean up resources"""
await self.stop_continuous_collection() await self.stop_continuous_collection()
# db_manager may have a close method; guard if absent # db_manager may have a close method; guard if absent
try: try:
if db_manager and hasattr(db_manager, "close"): if db_manager and hasattr(db_manager, "close"):
await db_manager.close() await db_manager.close()
except Exception as e: except Exception as e:
self.logger.warning(f"Error closing database manager: {e}") self.logger.warning(f"Error closing database manager: {e}")
self.logger.info("BinanceDataCollector cleanup complete") self.logger.info("BinanceDataCollector cleanup complete")
# ---------------------------
# UI process management
# ---------------------------
def start_ui_server(): def start_ui_server():
"""Start the UI server as a subprocess""" """Start the UI server as a subprocess"""
global ui_process global ui_process
@@ -916,7 +1136,7 @@ def start_ui_server():
elif ' - DEBUG - ' in line or 'DEBUG:' in line: elif ' - DEBUG - ' in line or 'DEBUG:' in line:
return logging.DEBUG return logging.DEBUG
else: else:
# Default to INFO for all other lines (including INFO: and standard messages) # Default to INFO for all other lines
return logging.INFO return logging.INFO
def log_ui_output(): def log_ui_output():
@@ -925,7 +1145,7 @@ def start_ui_server():
return return
for line in ui_process.stdout: for line in ui_process.stdout:
line = line.rstrip() line = line.rstrip()
if line: # Only log non-empty lines if line:
log_level = parse_log_level(line) log_level = parse_log_level(line)
logger.log(log_level, f"[UI] {line}") logger.log(log_level, f"[UI] {line}")
@@ -935,7 +1155,7 @@ def start_ui_server():
return return
for line in ui_process.stderr: for line in ui_process.stderr:
line = line.rstrip() line = line.rstrip()
if line: # Only log non-empty lines if line:
log_level = parse_log_level(line) log_level = parse_log_level(line)
logger.log(log_level, f"[UI] {line}") logger.log(log_level, f"[UI] {line}")
@@ -974,7 +1194,10 @@ def stop_ui_server():
logger.debug("UI server process not running") logger.debug("UI server process not running")
# ---------------------------
# Global signal handlers # Global signal handlers
# ---------------------------
def signal_handler(signum, frame): def signal_handler(signum, frame):
"""Handle shutdown signals""" """Handle shutdown signals"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -988,6 +1211,10 @@ def signal_handler(signum, frame):
task.cancel() task.cancel()
# ---------------------------
# Main entry point
# ---------------------------
async def main(): async def main():
"""Main application entry point""" """Main application entry point"""
# Setup signal handlers # Setup signal handlers
@@ -1003,7 +1230,14 @@ async def main():
# Start UI server # Start UI server
start_ui_server() start_ui_server()
# Start continuous collection # Optionally kick off an initial full backfill for all configured pairs
if config.get("collection", {}).get("initial_full_backfill", True):
# Launch as a background task so WebSockets can start immediately
task_name = "initial_full_backfill"
backfill_task = asyncio.create_task(collector.start_bulk_download_for_all_pairs(), name=task_name)
running_tasks[task_name] = backfill_task
# Start continuous collection (kline + trade streams) and gap scheduler
await collector.start_continuous_collection() await collector.start_continuous_collection()
# Keep the application running # Keep the application running
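The tuning knobs read throughout this diff (MAX_CONCURRENT_DOWNLOADS, MAX_CONCURRENT_GAP_FILLS, BULK_DOWNLOAD_BATCH_SIZE, TICK_BATCH_SIZE, MAX_RETRIES) come from the environment; one illustrative way to pin them before the collector starts, with values that are examples rather than recommendations:

import os

# Example values only; tune to your API weight limits and hardware
os.environ.setdefault("MAX_CONCURRENT_DOWNLOADS", "3")
os.environ.setdefault("MAX_CONCURRENT_GAP_FILLS", "2")
os.environ.setdefault("BULK_DOWNLOAD_BATCH_SIZE", "1000")
os.environ.setdefault("TICK_BATCH_SIZE", "100")
os.environ.setdefault("MAX_RETRIES", "3")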