Intermediate

Data Collection

Fetch historical stock prices with yfinance, collect financial news headlines from NewsAPI, and engineer features for the prediction model.

Step 1: Stock Price Data

# app/data_collector.py
import logging
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf

logger = logging.getLogger(__name__)


class StockDataCollector:
    """Downloads historical OHLCV price data via yfinance."""

    def fetch_history(self, ticker: str, days: int = 730) -> pd.DataFrame:
        """Fetch historical OHLCV data for *ticker*.

        Args:
            ticker: Stock symbol, e.g. "AAPL".
            days: Look-back window in calendar days (default ~2 years).

        Returns:
            DataFrame indexed by date with the Open/High/Low/Close/Volume
            columns (any extra yfinance columns are dropped).
        """
        stock = yf.Ticker(ticker)
        end = datetime.now()
        start = end - timedelta(days=days)
        df = stock.history(start=start, end=end)
        # Normalize the index to pandas Timestamps (yfinance may return
        # tz-aware dates depending on version/exchange).
        df.index = pd.to_datetime(df.index)
        df = df[["Open", "High", "Low", "Close", "Volume"]]
        # Lazy %-style args: formatted only if INFO logging is enabled.
        logger.info("Fetched %d days for %s", len(df), ticker)
        return df

    def fetch_multiple(self, tickers: list[str], days: int = 730) -> dict[str, pd.DataFrame]:
        """Fetch history for several tickers; returns {ticker: DataFrame}."""
        return {t: self.fetch_history(t, days) for t in tickers}

Step 2: News Headlines

# News collection with NewsAPI
from newsapi import NewsApiClient

class NewsCollector:
    """Fetches recent news headlines for a query via NewsAPI."""

    def __init__(self, api_key: str):
        self.api = NewsApiClient(api_key=api_key)

    def fetch_headlines(self, query: str, days: int = 30) -> list[dict]:
        """Fetch recent news headlines for a company.

        Args:
            query: Search term (company name or ticker).
            days: How far back to search, in days.

        Returns:
            List of dicts with "title", "description", "published_at",
            and "source" keys.
        """
        from_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
        result = self.api.get_everything(
            q=query, from_param=from_date,
            language="en", sort_by="relevancy", page_size=100,
        )
        articles = []
        for a in result.get("articles", []):
            articles.append({
                "title": a["title"],
                # NewsAPI sends an explicit null for missing descriptions, so
                # the key exists and .get(..., "") would pass None through.
                # `or ""` coerces both a missing key and a null to "".
                "description": a.get("description") or "",
                "published_at": a["publishedAt"],
                "source": a["source"]["name"],
            })
        logger.info("Fetched %d articles for %s", len(articles), query)
        return articles

Step 3: Feature Engineering

class FeatureEngineer:
    """Adds model features (returns, lags, volume ratios) to an OHLCV frame.

    All methods mutate *df* in place and return it so calls can be chained.
    NOTE: requires numpy (`np`) to be imported at module level.
    """

    def add_returns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add daily return, log return, and 20-day rolling volatility.

        The first row of the return columns and the first 19 rows of
        "volatility_20d" are NaN by construction.
        """
        df["daily_return"] = df["Close"].pct_change()
        df["log_return"] = np.log(df["Close"] / df["Close"].shift(1))
        df["volatility_20d"] = df["daily_return"].rolling(20).std()
        return df

    def add_lag_features(self, df: pd.DataFrame, lags: int = 5) -> pd.DataFrame:
        """Add close-price and return columns lagged 1..*lags* days.

        If "daily_return" is missing (i.e. `add_returns` was not called
        first), it is computed here instead of raising a KeyError.
        """
        if "daily_return" not in df.columns:
            df["daily_return"] = df["Close"].pct_change()
        for i in range(1, lags + 1):
            df[f"close_lag_{i}"] = df["Close"].shift(i)
            df[f"return_lag_{i}"] = df["daily_return"].shift(i)
        return df

    def add_volume_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add the 20-day volume SMA and the current-volume/SMA ratio."""
        df["volume_sma_20"] = df["Volume"].rolling(20).mean()
        df["volume_ratio"] = df["Volume"] / df["volume_sma_20"]
        return df

Testing

# Quick smoke test: pull one year of AAPL prices and eyeball the tail.
stock_collector = StockDataCollector()
prices = stock_collector.fetch_history("AAPL", days=365)
print(f"Shape: {prices.shape}")
print(prices.tail())
💡
Data quality: Always check for missing values, stock splits, and dividends. yfinance adjusts for splits by default, but verify with df.isnull().sum() and forward-fill any gaps.

Key Takeaways

  • yfinance provides free OHLCV data with automatic adjustment for splits and dividends.
  • NewsAPI free tier allows 100 requests/day, sufficient for daily sentiment updates.
  • Feature engineering adds returns, volatility, lag features, and volume ratios.
  • Always validate data quality before feeding it to the model.