Source code for app.core.data_factory

import os
import logging
from io import BytesIO
from typing import Union, Dict

import pandas as pd


# Module-level logger named after this module. Its level is pinned to INFO
# here, so DEBUG records emitted through this logger are discarded.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)



[docs]
class DataFactory:
    """Static helpers for building and combining pandas DataFrames."""

    @staticmethod
    def merge_dfs(*args) -> pd.DataFrame:
        """
        Concatenate DataFrames row-wise and drop duplicate rows.

        The concatenated frame is re-indexed 0..n-1 *before* duplicates are
        removed, so the index of the result may contain gaps.

        :param args: DataFrames to merge, in order.
        :return: The merged DataFrame. An empty DataFrame when called with
            no arguments. The first argument, unchanged, if merging fails
            (the failure is logged).
        """
        if not args:
            # Nothing to merge. Without this guard the fallback below would
            # raise IndexError on ``args[0]``.
            return pd.DataFrame()

        try:
            return pd.concat(list(args), ignore_index=True).drop_duplicates()
        except Exception as e:
            # Deliberate best-effort: log and fall back to the first input.
            logger.exception(e)
        return args[0]

    @staticmethod
    def from_dict(d: Dict) -> Union[pd.DataFrame, None]:
        """
        Build a single-row DataFrame from a dictionary.

        The dictionary is wrapped in a one-element list, so its keys become
        the columns and its values the only row.

        :param d: Mapping of column name to value.
        :return: The resulting DataFrame, or None if construction fails
            (the failure is logged).
        """
        try:
            return pd.DataFrame([d])
        except Exception as e:
            logger.exception("Couldn't make df from python dict: %s", e)
        return None

    @staticmethod
    def from_bytes(filename: str, file_bytes: bytes) -> Union[pd.DataFrame, None]:
        """
        Create a pandas DataFrame from file bytes based on file extension.

        Supports CSV, Excel, JSON, and Parquet formats. The extension is
        matched case-insensitively. Errors raised by the pandas readers
        (e.g. malformed content) are not caught here and propagate to the
        caller.

        :param filename: Original filename (used for format detection)
        :param file_bytes: File content in bytes
        :return: Pandas DataFrame or None if unsupported file format provided.
        """
        # Only the extension matters; normalise case so ".CSV" == ".csv".
        _, extension = os.path.splitext(filename)
        extension = extension.lower()
        file_stream = BytesIO(file_bytes)

        if extension == ".csv":
            return pd.read_csv(file_stream)  # type: ignore
        if extension == ".xlsx":
            return pd.read_excel(file_stream, engine="openpyxl")
        if extension == ".xls":
            # openpyxl cannot read the legacy .xls format; let pandas pick
            # the engine by inspecting the content instead of forcing it.
            return pd.read_excel(file_stream)
        if extension == ".json":
            return pd.read_json(file_stream)
        if extension == ".parquet":
            return pd.read_parquet(file_stream, engine="pyarrow")

        # Not inside an ``except`` block, so ``logger.exception`` would append
        # a meaningless "NoneType: None" traceback; log a plain error instead.
        logger.error(
            "Unsupported file format. Supported formats: CSV, Excel, JSON, Parquet."
        )
        return None