Updated main to allow significantly more efficient downloading

This commit is contained in:
2025-10-05 17:53:41 +01:00
parent 32ef7401e3
commit 4d0460944a

main.py

@@ -1,8 +1,11 @@
#!/usr/bin/env python3
"""
main.py - Complete Binance Trading Data Collection System
Main application entry point with async data collection, websocket handling, and task management
Main application entry point with async data collection, websocket handling, bulk
backfill orchestration, periodic gap scans, and task management.
"""
import asyncio
@@ -13,11 +16,11 @@ import json
import subprocess
import os
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Tuple
from contextlib import asynccontextmanager
import websockets
import aiohttp
import aiohttp # kept for future-proofing network ops
from binance.client import Client
from binance.exceptions import BinanceAPIException
import pandas as pd
@@ -52,11 +55,15 @@ class BinanceDataCollector:
self.websocket_collection_running = False
self.download_progress: Dict[str, Any] = {}
# Concurrency controls from env with sensible defaults
max_downloads = int(os.getenv('MAX_CONCURRENT_DOWNLOADS', '3'))
max_gap_fills = int(os.getenv('MAX_CONCURRENT_GAP_FILLS', '2'))
self._download_semaphore = asyncio.Semaphore(max_downloads)
self._gap_fill_semaphore = asyncio.Semaphore(max_gap_fills)
self.logger.info(f"Initialized with max {max_downloads} concurrent downloads, {max_gap_fills} gap fills")
self.logger.info(
f"Initialized with max {max_downloads} concurrent downloads, {max_gap_fills} gap fills"
)
async def initialize(self):
"""Initialize the data collector"""
@@ -64,10 +71,32 @@ class BinanceDataCollector:
# Setup logging
setup_logging()
self.logger.info("Initializing Binance Data Collector")
# Load configuration
config = load_config()
# Optional: add defaults for new flags
coll = config.setdefault("collection", {})
coll.setdefault("default_record_from_date", "2020-01-01T00:00:00Z")
coll.setdefault("initial_full_backfill", True)
coll.setdefault("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"])
coll.setdefault("bulk_chunk_size", int(os.getenv("BULK_DOWNLOAD_BATCH_SIZE", "1000")))
coll.setdefault("tick_batch_size", int(os.getenv("TICK_BATCH_SIZE", "100")))
coll.setdefault("max_retries", int(os.getenv("MAX_RETRIES", "3")))
coll.setdefault("retry_delay", 1)
gap = config.setdefault("gap_filling", {})
gap.setdefault("enable_auto_gap_filling", True)
gap.setdefault("auto_fill_schedule_hours", 24)
gap.setdefault("enable_intelligent_averaging", True)
gap.setdefault("max_fill_attempts", int(os.getenv("MAX_RETRIES", "3")))
gap.setdefault("intervals_to_monitor", coll.get("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"]))
gap.setdefault("max_gap_size_candles", 1000)
gap.setdefault("averaging_lookback_candles", 10)
gap.setdefault("max_consecutive_empty_candles", 5)
self.logger.info(f"Loaded configuration for {len(config['trading_pairs'])} trading pairs")
# Initialize database
@@ -85,6 +114,124 @@ class BinanceDataCollector:
self.client = Client()
self.logger.info("Binance client initialized without API credentials (public data only)")
# ---------------------------
# Bulk backfill orchestration
# ---------------------------
async def start_bulk_download_for_all_pairs(self):
"""
Automatically launch full-history downloads for all enabled pairs,
starting from record_from_date for each pair, across all configured intervals.
"""
global config
enabled_pairs = [p for p in config['trading_pairs'] if p.get('enabled', True)]
if not enabled_pairs:
self.logger.warning("No enabled trading pairs found for bulk backfill")
return
# Sort by priority ascending (1 = highest priority), then by symbol for a stable order
enabled_pairs.sort(key=lambda p: (int(p.get('priority', 1)), p.get('symbol', '')))
# Build tasks per symbol to respect MAX_CONCURRENT_DOWNLOADS at the symbol level
tasks: List[asyncio.Task] = []
now_utc = datetime.now(timezone.utc)
for pair in enabled_pairs:
symbol = pair['symbol'].upper()
start_iso = pair.get('record_from_date') or config["collection"]["default_record_from_date"]
try:
start_dt = datetime.fromisoformat(start_iso.replace("Z", "+00:00"))
except Exception:
self.logger.warning(f"Invalid record_from_date for {symbol}: {start_iso}, falling back to default")
start_dt = datetime.fromisoformat(config["collection"]["default_record_from_date"].replace("Z", "+00:00"))
# One task per symbol to execute all intervals concurrently for that symbol
tasks.append(asyncio.create_task(
self._bulk_download_symbol_all_intervals(symbol, start_dt, now_utc),
name=f"bulk_{symbol}"
))
# Execute with graceful progress logging
self.logger.info(f"Launching bulk backfill for {len(tasks)} symbols...")
results = await asyncio.gather(*tasks, return_exceptions=True)
errors = [r for r in results if isinstance(r, Exception)]
if errors:
self.logger.error(f"Bulk backfill completed with {len(errors)} errors; see logs for details")
else:
self.logger.info("Bulk backfill completed successfully for all symbols")
async def _bulk_download_symbol_all_intervals(
self,
symbol: str,
start_date: datetime,
end_date: Optional[datetime] = None
):
"""
Launch concurrent downloads of all configured intervals for one symbol,
bounded by the download semaphore to control exchange load.
"""
global config
async with self._download_semaphore:
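# Holding the semaphore here caps how many symbols are backfilled at once
# (MAX_CONCURRENT_DOWNLOADS); the per-interval tasks spawned below still run
# concurrently within each symbol.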
end_date = end_date or datetime.now(timezone.utc)
# Ensure progress structure
intervals = config.get("collection", {}).get("candle_intervals",
["1m", "5m", "15m", "1h", "4h", "1d"])
self.download_progress[symbol] = {
"status": "running",
"intervals": {iv: {"status": "pending", "records": 0} for iv in intervals},
"start_time": datetime.now(timezone.utc).isoformat()
}
# Spawn all intervals concurrently for this symbol
self.logger.info(f"Starting concurrent bulk for {symbol} on {intervals}")
interval_tasks = [
asyncio.create_task(
self._bulk_download_one_interval(symbol, interval, start_date, end_date),
name=f"bulk_{symbol}_{interval}"
)
for interval in intervals
]
results = await asyncio.gather(*interval_tasks, return_exceptions=True)
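# return_exceptions=True keeps one failed interval from cancelling its siblings;
# failures are inspected below to set the symbol's final status.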
# Mark final status
if any(isinstance(r, Exception) for r in results):
self.download_progress[symbol]["status"] = "error"
self.download_progress[symbol]["error"] = "One or more intervals failed"
else:
self.download_progress[symbol]["status"] = "completed"
self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat()
async def _bulk_download_one_interval(
self,
symbol: str,
interval: str,
start_date: datetime,
end_date: datetime
):
"""Run the bulk downloader for a single symbol+interval and then compute indicators."""
# Update status
sp = self.download_progress.setdefault(symbol, {"intervals": {}})
sp["intervals"].setdefault(interval, {"status": "pending", "records": 0})
sp["intervals"][interval]["status"] = "checking"
records_count = await self._collect_historical_klines(symbol, interval, start_date, end_date)
if records_count > 0:
sp["intervals"][interval]["status"] = "calculating_indicators"
sp["intervals"][interval]["records"] = records_count
await self._calculate_and_store_indicators(symbol, interval)
sp["intervals"][interval]["status"] = "completed"
self.logger.info(f"Completed {interval} data for {symbol} - {records_count} new records")
else:
sp["intervals"][interval]["status"] = "skipped_complete"
self.logger.info(f"Skipped {interval} for {symbol} - data already complete or no new records")
# ---------------------------
# Intelligent bulk downloader
# ---------------------------
async def bulk_download_historical_data(
self, symbol: str, start_date: datetime, end_date: Optional[datetime] = None,
intervals: Optional[List[str]] = None
@@ -92,6 +239,7 @@ class BinanceDataCollector:
"""
Bulk download historical OHLCV data from Binance with intelligent gap detection.
Only downloads data that doesn't already exist in the database.
Note: kept for API/UI compatibility; orchestration now prefers start_bulk_download_for_all_pairs.
"""
async with self._download_semaphore:
if end_date is None:
@@ -107,7 +255,8 @@ class BinanceDataCollector:
# Get intervals
if intervals is None:
intervals = config.get("collection", {}).get("candle_intervals", ["1m", "5m", "15m", "1h", "4h", "1d"])
intervals = config.get("collection", {}).get("candle_intervals",
["1m", "5m", "15m", "1h", "4h", "1d"])
# Initialize progress tracking
self.download_progress[symbol] = {
@@ -115,38 +264,23 @@ class BinanceDataCollector:
"intervals": {},
"start_time": datetime.now(timezone.utc).isoformat()
}
for interval in intervals:
self.download_progress[symbol]["intervals"][interval] = {
"status": "pending",
"records": 0
}
self.download_progress[symbol]["intervals"][interval] = {"status": "pending", "records": 0}
try:
for interval in intervals:
self.logger.info(f"Processing {interval} data for {symbol}")
self.download_progress[symbol]["intervals"][interval]["status"] = "checking"
# Intelligent download - only missing data
records_count = await self._collect_historical_klines(
symbol, interval, start_date, end_date
)
if records_count > 0:
self.download_progress[symbol]["intervals"][interval]["status"] = "calculating_indicators"
self.download_progress[symbol]["intervals"][interval]["records"] = records_count
# Calculate indicators for new data
await self._calculate_and_store_indicators(symbol, interval)
self.download_progress[symbol]["intervals"][interval]["status"] = "completed"
self.logger.info(f"Completed {interval} data for {symbol} - {records_count} new records")
else:
self.download_progress[symbol]["intervals"][interval]["status"] = "skipped_complete"
self.logger.info(f"Skipped {interval} for {symbol} - data already complete")
# Run intervals concurrently to improve throughput for one symbol
tasks = [
asyncio.create_task(self._bulk_download_one_interval(symbol, interval, start_date, end_date),
name=f"bulk_single_{symbol}_{interval}")
for interval in intervals
]
await asyncio.gather(*tasks)
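# Without return_exceptions, the first interval failure propagates to the
# except block below and marks the whole symbol as errored.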
self.download_progress[symbol]["status"] = "completed"
self.download_progress[symbol]["end_time"] = datetime.now(timezone.utc).isoformat()
except Exception as e:
self.logger.error(f"Error in bulk download for {symbol}: {e}")
self.logger.error(f"Error in bulk download for {symbol}: {e}", exc_info=True)
self.download_progress[symbol]["status"] = "error"
self.download_progress[symbol]["error"] = str(e)
raise
@@ -172,7 +306,7 @@ class BinanceDataCollector:
)
# If coverage is complete, skip download
if coverage_check['is_complete']:
if coverage_check.get('is_complete'):
self.logger.info(
f"Skipping {symbol} {interval} - data already complete "
f"({coverage_check['coverage_percent']:.2f}% coverage)"
@@ -183,6 +317,7 @@ class BinanceDataCollector:
missing_ranges = await db_manager.get_missing_time_ranges(
symbol, interval, start_date, end_date
)
if not missing_ranges:
self.logger.info(f"No missing data ranges for {symbol} {interval}")
return 0
@@ -205,7 +340,6 @@ class BinanceDataCollector:
symbol, interval, range_start, range_end
)
total_new_records += records_in_range
self.logger.info(
f"Downloaded {records_in_range} records for range {idx}/{len(missing_ranges)}"
)
@@ -249,104 +383,132 @@ class BinanceDataCollector:
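# Binance REST kline row layout: [0] open time, [1] open, [2] high, [3] low,
# [4] close, [5] volume, [6] close time, [7] quote asset volume, [8] trade count,
# [9]/[10] taker buy base/quote volume, [11] ignore.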
"T": int(kline_row[6]),
"s": symbol.upper(),
"i": interval,
"f": None, # first trade id (unknown from REST row)
"L": None, # last trade id (unknown)
"f": None, # first trade id
"L": None, # last trade id
"o": str(kline_row[1]),
"c": str(kline_row[4]),
"h": str(kline_row[2]),
"l": str(kline_row[3]),
"v": str(kline_row[5]),
"n": int(kline_row[8]),
"x": True, # REST klines are for closed candles
"x": True, # closed candle
"q": str(kline_row[7]),
"V": None, # taker buy base asset volume (optional)
"Q": None, # taker buy quote asset volume (optional)
"V": None, # taker buy base vol (optional)
"Q": None, # taker buy quote vol (optional)
"B": None # ignore
}
}
async def _get_historical_klines_async(
self, symbol: str, interval: str, start_ms: int, end_ms: int, limit: int
) -> List[List[Any]]:
"""
Run python-binance get_historical_klines in a worker thread to avoid blocking the event loop.
"""
def call():
return self.client.get_historical_klines(
symbol=symbol,
interval=interval,
start_str=start_ms,
end_str=end_ms,
limit=limit
)
return await asyncio.to_thread(call)
async def _download_time_range(
self, symbol: str, interval: str, start_date: datetime, end_date: datetime
) -> int:
"""
Download data for a specific time range (internal method).
This is the actual download logic extracted from the original collect_historical_klines.
Returns:
Number of records downloaded and inserted
"""
global config, db_manager
chunk_size = config.get("collection", {}).get("bulk_chunk_size", 1000)
max_retries = config.get("collection", {}).get("max_retries", 3)
retry_delay = config.get("collection", {}).get("retry_delay", 1)
# Resolve batch size and retry policy, preferring config values over env defaults
chunk_size = int(config.get("collection", {}).get("bulk_chunk_size",
int(os.getenv("BULK_DOWNLOAD_BATCH_SIZE", "1000"))))
max_retries = int(config.get("collection", {}).get("max_retries",
int(os.getenv("MAX_RETRIES", "3"))))
retry_delay = float(config.get("collection", {}).get("retry_delay", 1))
# Normalize time inputs that might be naive time objects
# Normalize time inputs
from datetime import time as dt_time
if isinstance(start_date, dt_time):
# Use today's date in UTC for safety if only a time is provided
start_date = datetime.combine(datetime.now(timezone.utc).date(), start_date)
if isinstance(end_date, dt_time):
# Use the same date as start_date if possible for consistency
base_date = start_date.date() if isinstance(start_date, datetime) else datetime.now(timezone.utc).date()
end_date = datetime.combine(base_date, end_date)
if start_date.tzinfo is None:
start_date = start_date.replace(tzinfo=timezone.utc)
if end_date.tzinfo is None:
end_date = end_date.replace(tzinfo=timezone.utc)
# Convert to naive UTC for Binance API
current_start = start_date.replace(tzinfo=None)
end = end_date.replace(tzinfo=None)
# Keep timestamps timezone-aware in UTC; they are converted to epoch milliseconds for the Binance API
current_start = start_date.replace(tzinfo=timezone.utc)
end = end_date.replace(tzinfo=timezone.utc)
total_records = 0
retry_count = 0
consecutive_empty = 0
while current_start < end:
try:
# Calculate chunk end time based on interval
chunk_end = self._calculate_chunk_end(current_start, interval, chunk_size)
chunk_end = min(chunk_end, end)
if chunk_end > end:
chunk_end = end
# Get klines from Binance with retry logic
klines: Optional[List[List[Any]]] = None
# Try with retry policy; also handle rate-limit backoffs
for attempt in range(max_retries):
try:
klines = self.client.get_historical_klines(
klines = await self._get_historical_klines_async(
symbol=symbol,
interval=interval,
start_str=int(current_start.timestamp() * 1000),
end_str=int(chunk_end.timestamp() * 1000),
start_ms=int(current_start.timestamp() * 1000),
end_ms=int(chunk_end.timestamp() * 1000),
limit=chunk_size
)
break
except BinanceAPIException as e:
if e.code == -1003: # Rate limit
wait_time = retry_delay * (2 ** attempt)
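# e.g. with retry_delay=1 this backs off 1s, 2s, 4s across attempts 0, 1, 2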
self.logger.warning(f"Rate limit hit, waiting {wait_time}s before retry")
self.logger.warning(f"Rate limit hit for {symbol} {interval}, waiting {wait_time}s")
await asyncio.sleep(wait_time)
else:
raise
except Exception as e:
self.logger.error(f"Binance API exception for {symbol} {interval}: {e}")
if attempt == max_retries - 1:
raise
await asyncio.sleep(retry_delay)
except Exception as e:
self.logger.warning(f"Attempt {attempt + 1}/{max_retries} failed for {symbol} {interval}: {e}")
if attempt == max_retries - 1:
raise
self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
await asyncio.sleep(retry_delay)
# No data returned; advance conservatively or terminate
if not klines or len(klines) == 0:
self.logger.info(f"No more data available for {symbol} {interval}")
consecutive_empty += 1
# After multiple consecutive empty chunks, assume we are past the available history
if consecutive_empty >= 2:
self.logger.info(f"No more data available for {symbol} {interval}; ending range loop")
break
# Otherwise, advance by one chunk and continue
current_start = chunk_end + timedelta(milliseconds=1)
await asyncio.sleep(0.05)
continue
# Reset empty counter on success
consecutive_empty = 0
# Parse and store klines
ohlcv_data: List[Dict[str, Any]] = []
for kline in klines:
try:
# Normalize to WebSocket-style event expected by parse_kline_data
ws_event = self._rest_kline_to_ws_event(symbol, interval, kline)
parsed_data = parse_kline_data(ws_event)
ohlcv_data.append(parsed_data)
except Exception as e:
# Keep original message to aid debugging if structure differs
self.logger.error(f"Error parsing kline data: {e} | raw={kline!r}")
continue
@@ -356,57 +518,63 @@ class BinanceDataCollector:
total_records += len(ohlcv_data)
# Update progress
if symbol in self.download_progress and interval in self.download_progress[symbol]["intervals"]:
if symbol in self.download_progress and \
interval in self.download_progress[symbol].get("intervals", {}):
self.download_progress[symbol]["intervals"][interval]["records"] = total_records
self.logger.debug(f"Stored {len(ohlcv_data)} {interval} candles for {symbol} (total: {total_records})")
self.logger.debug(
f"Stored {len(ohlcv_data)} {interval} candles for {symbol} (total: {total_records})"
)
# Update current_start for next chunk
# Update current_start based on last candle close time
if klines:
last_close_time_ms = klines[-1][6] # Use the close time of the last kline
current_start = datetime.utcfromtimestamp((last_close_time_ms + 1) / 1000)
last_close_time_ms = int(klines[-1][6]) # close time ms
# Advance by 1 ms past last close to avoid duplicate fetch
current_start = datetime.fromtimestamp((last_close_time_ms + 1) / 1000, tz=timezone.utc)
else:
break
# Delay to respect rate limits
await asyncio.sleep(0.2)
retry_count = 0 # Reset retry count on success
except BinanceAPIException as e:
retry_count += 1
self.logger.error(f"Binance API error (attempt {retry_count}): {e}")
if retry_count >= max_retries:
self.logger.error(f"Max retries reached for {symbol} {interval}")
raise
# Exponential backoff
wait_time = retry_delay * (2 ** retry_count)
await asyncio.sleep(wait_time)
# Light delay to avoid hammering
await asyncio.sleep(0.05)
except asyncio.CancelledError:
self.logger.info(f"Download for {symbol} {interval} cancelled")
break
except Exception as e:
self.logger.error(f"Error collecting {interval} data for {symbol}: {e}", exc_info=True)
raise
# Backoff before continuing or aborting loop
await asyncio.sleep(max(0.5, retry_delay))
# Continue with the next loop iteration rather than aborting; persistent
# API failures will resurface through the per-request retry policy above.
return total_records
# ---------------------------
# Technical indicators
# ---------------------------
async def _calculate_and_store_indicators(self, symbol: str, interval: str):
"""Calculate and store technical indicators for a symbol and interval"""
try:
# Check if indicators are enabled for this interval
indicator_config = config.get('technical_indicators', {})
calc_intervals = indicator_config.get('calculation_intervals', ['1m', '5m', '15m', '1h', '4h', '1d'])
calc_intervals = indicator_config.get('calculation_intervals',
['1m', '5m', '15m', '1h', '4h', '1d'])
if interval not in calc_intervals:
self.logger.debug(f"Skipping indicators for {symbol} {interval} (not in calculation_intervals)")
self.logger.debug(
f"Skipping indicators for {symbol} {interval} (not in calculation_intervals)"
)
return
# Get OHLCV data from database (need enough for longest indicator period)
max_period = 200 # Maximum period for indicators like SMA-200
max_period = 200 # SMA-200, etc.
ohlcv_data = await db_manager.get_ohlcv_data(symbol, interval, limit=max_period + 50)
if len(ohlcv_data) < 50: # Need minimum data for indicators
self.logger.warning(f"Not enough data for indicators: {symbol} {interval} ({len(ohlcv_data)} records)")
if len(ohlcv_data) < 50:
self.logger.warning(
f"Not enough data for indicators: {symbol} {interval} ({len(ohlcv_data)} records)"
)
return
# Convert to DataFrame
@@ -429,11 +597,21 @@ class BinanceDataCollector:
# Store indicators in database
if indicators_data:
await db_manager.insert_indicators_batch(symbol, interval, indicators_data)
self.logger.info(f"Stored {len(indicators_data)} indicator values for {symbol} {interval}")
self.logger.info(
f"Stored {len(indicators_data)} indicator values for {symbol} {interval}"
)
except asyncio.CancelledError:
self.logger.info(f"Indicator calculation cancelled for {symbol} {interval}")
except Exception as e:
self.logger.error(f"Error calculating indicators for {symbol} {interval}: {e}", exc_info=True)
self.logger.error(
f"Error calculating indicators for {symbol} {interval}: {e}",
exc_info=True
)
# ---------------------------
# Gap detection and filling
# ---------------------------
async def auto_fill_gaps(
self,
@@ -443,6 +621,7 @@ class BinanceDataCollector:
) -> Dict[str, Any]:
"""
Automatically fill gaps for a symbol
Args:
symbol: Trading pair symbol
intervals: List of intervals to fill (default: from config)
@@ -455,9 +634,11 @@ class BinanceDataCollector:
global config, db_manager
if intervals is None:
intervals = config.get('gap_filling', {}).get('intervals_to_monitor', ['1m', '5m', '15m', '1h', '4h', '1d'])
intervals = config.get('gap_filling', {}).get('intervals_to_monitor',
['1m', '5m', '15m', '1h', '4h', '1d'])
self.logger.info(f"Starting auto gap fill for {symbol} on intervals: {intervals}")
results: Dict[str, Any] = {
'symbol': symbol,
'intervals': {},
@@ -472,20 +653,23 @@ class BinanceDataCollector:
self.logger.warning(f"Symbol {symbol} not found in config")
return results
record_from_date = pair_config.get('record_from_date')
if not record_from_date:
record_from_date = config.get('collection', {}).get('default_record_from_date', '2020-01-01T00:00:00Z')
_ = datetime.fromisoformat(record_from_date.replace('Z', '+00:00')) # kept for future use
record_from_date_iso = pair_config.get('record_from_date') or \
config.get('collection', {}).get('default_record_from_date', '2020-01-01T00:00:00Z')
_ = datetime.fromisoformat(record_from_date_iso.replace('Z', '+00:00')) # reserved
gap_config = config.get('gap_filling', {})
max_gap_size = gap_config.get('max_gap_size_candles', 1000)
max_attempts = int(gap_config.get('max_fill_attempts',
int(config.get("collection", {}).get("max_retries",
int(os.getenv("MAX_RETRIES", "3"))))))
averaging_lookback = gap_config.get('averaging_lookback_candles', 10)
max_empty_seq = gap_config.get('max_consecutive_empty_candles', 5)
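# 'Genuine' gaps are ranges the exchange returns no data for (e.g. downtime);
# when averaging is enabled they are padded with synthetic candles further below.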
for interval in intervals:
self.logger.info(f"Checking gaps for {symbol} {interval}")
# Detect gaps
gaps_info = await db_manager.detect_gaps(symbol, interval)
gaps = gaps_info.get('gaps', [])
interval_result = {
'gaps_found': len(gaps),
'gaps_filled': 0,
@@ -493,7 +677,6 @@ class BinanceDataCollector:
'errors': []
}
# Fill downloadable gaps
for gap in gaps:
missing_candles = gap['missing_candles']
# Skip if gap is too large
@@ -503,32 +686,48 @@ class BinanceDataCollector:
continue
try:
# Download missing data
gap_start = datetime.fromisoformat(gap['gap_start'])
gap_end = datetime.fromisoformat(gap['gap_end'])
self.logger.info(f"Filling gap: {gap_start} to {gap_end}")
records_count = await self._collect_historical_klines(
symbol, interval, gap_start, gap_end
# Attempt multiple real fills before resorting to averaging
real_filled_records = 0
for attempt in range(1, max_attempts + 1):
try:
# Small buffer around the gap to ensure edges are covered
buffered_start = gap_start - timedelta(milliseconds=1)
buffered_end = gap_end + timedelta(milliseconds=1)
added = await self._collect_historical_klines(
symbol, interval, buffered_start, buffered_end
)
real_filled_records += added
if added > 0:
# A successful fill; break early
break
except Exception as e:
# Log and continue attempts
interval_result['errors'].append(
f"Attempt {attempt} failed for gap {gap_start}->{gap_end}: {e}"
)
await asyncio.sleep(0.5 * attempt)
if records_count > 0:
if real_filled_records > 0:
interval_result['gaps_filled'] += 1
results['total_gaps_filled'] += 1
self.logger.info(f"Successfully filled gap with {records_count} records")
self.logger.info(f"Successfully filled gap with {real_filled_records} records")
else:
# Genuine empty gap - fill with averages if enabled
if fill_genuine_gaps:
if fill_genuine_gaps and gap_config.get('enable_intelligent_averaging', True):
filled = await db_manager.fill_genuine_gaps_with_averages(
symbol, interval,
gap_config.get('max_consecutive_empty_candles', 5),
gap_config.get('averaging_lookback_candles', 10)
max_empty_seq,
averaging_lookback
)
interval_result['genuine_filled'] += filled
results['total_genuine_filled'] += filled
# Small delay between gaps
await asyncio.sleep(0.5)
await asyncio.sleep(0.2)
except Exception as e:
error_msg = f"Error filling gap: {str(e)}"
@@ -537,7 +736,7 @@ class BinanceDataCollector:
results['intervals'][interval] = interval_result
# Calculate and store indicators after filling gaps
# Calculate and store indicators after any fills
if interval_result['gaps_filled'] > 0 or interval_result['genuine_filled'] > 0:
try:
await self._calculate_and_store_indicators(symbol, interval)
@@ -554,7 +753,7 @@ class BinanceDataCollector:
async def start_auto_gap_fill_scheduler(self):
"""Start background task for automatic gap filling"""
global config, db_manager
global config
gap_config = config.get('gap_filling', {})
if not gap_config.get('enable_auto_gap_filling', False):
self.logger.info("Auto gap filling is disabled")
@@ -562,6 +761,7 @@ class BinanceDataCollector:
schedule_hours = gap_config.get('auto_fill_schedule_hours', 24)
self.logger.info(f"Starting auto gap fill scheduler (every {schedule_hours} hours)")
self.is_collecting = True
while self.is_collecting:
try:
@@ -590,6 +790,10 @@ class BinanceDataCollector:
self.logger.error(f"Error in auto gap fill scheduler: {e}", exc_info=True)
await asyncio.sleep(3600) # Wait 1 hour on error
# ---------------------------
# WebSocket continuous streams
# ---------------------------
async def start_continuous_collection(self):
"""Start continuous data collection via WebSocket"""
if self.websocket_collection_running:
@@ -667,7 +871,6 @@ class BinanceDataCollector:
# Parse kline data
ohlcv_data = parse_kline_data(data)
# Store in database
await db_manager.insert_ohlcv_single(ohlcv_data)
@@ -720,7 +923,10 @@ class BinanceDataCollector:
websocket_connections[stream_name] = websocket
tick_batch: List[Dict[str, Any]] = []
batch_size = config.get('collection', {}).get('tick_batch_size', 100)
batch_size = int(config.get('collection', {}).get(
'tick_batch_size',
int(os.getenv("TICK_BATCH_SIZE", "100"))
))
async for message in websocket:
if not self.websocket_collection_running:
@@ -798,6 +1004,10 @@ class BinanceDataCollector:
websocket_connections.clear()
self.logger.info("Continuous data collection stopped")
# ---------------------------
# Candle generation from ticks
# ---------------------------
async def generate_candles_from_ticks(
self,
symbol: str,
@@ -864,6 +1074,10 @@ class BinanceDataCollector:
else:
self.logger.warning(f"No candles generated for {symbol} {interval}")
# ---------------------------
# Progress and cleanup
# ---------------------------
async def get_download_progress(self, symbol: str = None) -> Dict[str, Any]:
"""Get download progress for a symbol or all symbols"""
if symbol:
@@ -873,15 +1087,21 @@ class BinanceDataCollector:
async def cleanup(self):
"""Clean up resources"""
await self.stop_continuous_collection()
# db_manager may have a close method; guard if absent
try:
if db_manager and hasattr(db_manager, "close"):
await db_manager.close()
except Exception as e:
self.logger.warning(f"Error closing database manager: {e}")
self.logger.info("BinanceDataCollector cleanup complete")
# ---------------------------
# UI process management
# ---------------------------
def start_ui_server():
"""Start the UI server as a subprocess"""
global ui_process
@@ -916,7 +1136,7 @@ def start_ui_server():
elif ' - DEBUG - ' in line or 'DEBUG:' in line:
return logging.DEBUG
else:
# Default to INFO for all other lines (including INFO: and standard messages)
# Default to INFO for all other lines
return logging.INFO
def log_ui_output():
@@ -925,7 +1145,7 @@ def start_ui_server():
return
for line in ui_process.stdout:
line = line.rstrip()
if line: # Only log non-empty lines
if line:
log_level = parse_log_level(line)
logger.log(log_level, f"[UI] {line}")
@@ -935,7 +1155,7 @@ def start_ui_server():
return
for line in ui_process.stderr:
line = line.rstrip()
if line: # Only log non-empty lines
if line:
log_level = parse_log_level(line)
logger.log(log_level, f"[UI] {line}")
@@ -974,7 +1194,10 @@ def stop_ui_server():
logger.debug("UI server process not running")
# ---------------------------
# Global signal handlers
# ---------------------------
def signal_handler(signum, frame):
"""Handle shutdown signals"""
logger = logging.getLogger(__name__)
@@ -988,6 +1211,10 @@ def signal_handler(signum, frame):
task.cancel()
# ---------------------------
# Main entry point
# ---------------------------
async def main():
"""Main application entry point"""
# Setup signal handlers
@@ -1003,7 +1230,14 @@ async def main():
# Start UI server
start_ui_server()
# Start continuous collection
# Optionally kick off an initial full backfill for all configured pairs
if config.get("collection", {}).get("initial_full_backfill", True):
# Launch as a background task so WebSockets can start immediately
task_name = "initial_full_backfill"
backfill_task = asyncio.create_task(collector.start_bulk_download_for_all_pairs(), name=task_name)
running_tasks[task_name] = backfill_task
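# Keeping a reference in running_tasks prevents the task from being
# garbage-collected and lets shutdown handling cancel it with the other tracked tasks.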
# Start continuous collection (kline + trade streams) and gap scheduler
await collector.start_continuous_collection()
# Keep the application running