services.csv_analyzer

Módulo csv_analyzer.py

[PT-BR] Responsável por analisar o CSV gerado pelo sistema de captura de dados dos Pokémon, identificando dados ausentes, tipos de dados e gerando um sumário básico de informações.

[EN] Responsible for analyzing the CSV generated by the Pokémon data collection system, identifying missing data, data types, and generating a basic dataset summary.

Uso típico / Typical usage:

analyzer = PokemonCSVAnalyzer("output/pokemons.csv")
analyzer.run_full_report()
"""
Módulo csv_analyzer.py
======================

[PT-BR] Responsável por analisar o CSV gerado pelo sistema de captura de dados dos Pokémon,
identificando dados ausentes, tipos de dados e gerando um sumário básico de informações.

[EN] Responsible for analyzing the CSV generated by the Pokémon data collection system,
identifying missing data, data types, and generating a basic dataset summary.

Uso típico / Typical usage:

    analyzer = PokemonCSVAnalyzer("output/pokemons.csv")
    analyzer.run_full_report()
"""
16import logging
17from typing import Optional
18import pandas as pd  # type: ignore
19from pandas.errors import EmptyDataError, ParserError # type: ignore
20
class PokemonCSVAnalyzer:
    """
    Utility class that runs basic validations on a Pokémon CSV.

    The CSV is read into a pandas DataFrame (``self.df``) when the object is
    constructed. The ``log_*`` methods then report on that DataFrame through
    the module-level ``logging`` functions (i.e. the root logger).
    """

    def __init__(self, csv_path: str, encoding: str = "utf-8"):
        """
        Load the CSV at ``csv_path`` into ``self.df``.

        Args:
            csv_path: Path of the CSV file to read.
            encoding: Text encoding handed to ``pandas.read_csv``.

        Raises:
            OSError, UnicodeDecodeError, EmptyDataError, ParserError:
                Whatever ``pandas.read_csv`` raised; the failure is logged
                with its traceback and then re-raised unchanged.
        """
        self.csv_path = csv_path
        self.encoding = encoding
        try:
            self.df = pd.read_csv(csv_path, encoding=encoding)
        except (OSError, UnicodeDecodeError, EmptyDataError, ParserError) as e:
            # OSError (the parent of FileNotFoundError) is caught so that
            # permission and is-a-directory failures are logged as well.
            logging.error("Failed to load CSV '%s': %s", csv_path, e, exc_info=True)
            raise
        logging.info("CSV loaded successfully from '%s' (%d rows, %d columns)",
                     csv_path, len(self.df), len(self.df.columns))

    def log_summary(self, head_n: int = 5) -> None:
        """
        Log a structural and statistical summary of the dataset.

        Logs the row and column counts, column names, per-column dtypes, the
        first ``head_n`` rows and the output of ``DataFrame.describe``.

        Args:
            head_n: Number of leading rows to log. Negative values are
                treated as 0.
        """
        # DataFrame.head(-n) means "all rows except the last n", which would
        # contradict the "First %d rows" label below, so clamp at zero.
        head_n = max(head_n, 0)

        logging.info("=== Dataset summary ===")
        logging.info("Total rows   : %d", len(self.df))
        logging.info("Total columns: %d", len(self.df.columns))
        logging.info("Column names : %s", list(self.df.columns))

        logging.info("Column dtypes:")
        for col, dtype in self.df.dtypes.items():
            logging.info("  • %-20s %s", col, dtype)

        logging.info("First %d rows:", head_n)
        for idx, row in self.df.head(head_n).iterrows():
            # %s (not %d) so a non-integer index label does not break logging.
            logging.info("  Row %s: %s", idx, row.to_dict())

        # Transposed so that each original column becomes one row of stats.
        stats = self.df.describe(include='all').transpose()
        logging.info("Statistical summary:")
        for col in stats.index:
            # dropna() hides the statistics that do not apply to this column.
            logging.info("  • %-20s %s", col, stats.loc[col].dropna().to_dict())

    def log_missing_values(self) -> None:
        """
        Log the number of missing values per column.

        A cell counts as missing when it is NaN/None or equals the empty
        string. Columns are reported in descending order of missing count;
        fully populated columns are not listed.
        """
        missing_mask = self.df.isna() | (self.df == "")
        missing_counts = missing_mask.sum()
        total_missing = missing_counts.sum()

        logging.info("=== Missing-value check ===")
        if total_missing == 0:
            logging.info("All columns are fully populated.")
            return

        logging.warning("Found %d missing value(s) across the dataset.", total_missing)
        for col, cnt in missing_counts[missing_counts > 0].sort_values(ascending=False).items():
            logging.warning("  • %-20s %d missing", col, cnt)

    def run_full_report(self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
        """
        Run the selected analyses in sequence.

        Args:
            show_summary: When true, log the structural/statistical summary.
            check_missing: When true, log the missing-value check.
            head_n: Number of leading rows shown in the summary.
        """
        if show_summary:
            self.log_summary(head_n=head_n)
        if check_missing:
            self.log_missing_values()
class PokemonCSVAnalyzer:
class PokemonCSVAnalyzer:
    """
    Utility class that runs basic validations on a Pokémon CSV.

    The CSV is read into a pandas DataFrame (``self.df``) when the object is
    constructed. The ``log_*`` methods then report on that DataFrame through
    the module-level ``logging`` functions (i.e. the root logger).
    """

    def __init__(self, csv_path: str, encoding: str = "utf-8"):
        """
        Load the CSV at ``csv_path`` into ``self.df``.

        Args:
            csv_path: Path of the CSV file to read.
            encoding: Text encoding handed to ``pandas.read_csv``.

        Raises:
            OSError, UnicodeDecodeError, EmptyDataError, ParserError:
                Whatever ``pandas.read_csv`` raised; the failure is logged
                with its traceback and then re-raised unchanged.
        """
        self.csv_path = csv_path
        self.encoding = encoding
        try:
            self.df = pd.read_csv(csv_path, encoding=encoding)
        except (OSError, UnicodeDecodeError, EmptyDataError, ParserError) as e:
            # OSError (the parent of FileNotFoundError) is caught so that
            # permission and is-a-directory failures are logged as well.
            logging.error("Failed to load CSV '%s': %s", csv_path, e, exc_info=True)
            raise
        logging.info("CSV loaded successfully from '%s' (%d rows, %d columns)",
                     csv_path, len(self.df), len(self.df.columns))

    def log_summary(self, head_n: int = 5) -> None:
        """
        Log a structural and statistical summary of the dataset.

        Logs the row and column counts, column names, per-column dtypes, the
        first ``head_n`` rows and the output of ``DataFrame.describe``.

        Args:
            head_n: Number of leading rows to log. Negative values are
                treated as 0.
        """
        # DataFrame.head(-n) means "all rows except the last n", which would
        # contradict the "First %d rows" label below, so clamp at zero.
        head_n = max(head_n, 0)

        logging.info("=== Dataset summary ===")
        logging.info("Total rows   : %d", len(self.df))
        logging.info("Total columns: %d", len(self.df.columns))
        logging.info("Column names : %s", list(self.df.columns))

        logging.info("Column dtypes:")
        for col, dtype in self.df.dtypes.items():
            logging.info("  • %-20s %s", col, dtype)

        logging.info("First %d rows:", head_n)
        for idx, row in self.df.head(head_n).iterrows():
            # %s (not %d) so a non-integer index label does not break logging.
            logging.info("  Row %s: %s", idx, row.to_dict())

        # Transposed so that each original column becomes one row of stats.
        stats = self.df.describe(include='all').transpose()
        logging.info("Statistical summary:")
        for col in stats.index:
            # dropna() hides the statistics that do not apply to this column.
            logging.info("  • %-20s %s", col, stats.loc[col].dropna().to_dict())

    def log_missing_values(self) -> None:
        """
        Log the number of missing values per column.

        A cell counts as missing when it is NaN/None or equals the empty
        string. Columns are reported in descending order of missing count;
        fully populated columns are not listed.
        """
        missing_mask = self.df.isna() | (self.df == "")
        missing_counts = missing_mask.sum()
        total_missing = missing_counts.sum()

        logging.info("=== Missing-value check ===")
        if total_missing == 0:
            logging.info("All columns are fully populated.")
            return

        logging.warning("Found %d missing value(s) across the dataset.", total_missing)
        for col, cnt in missing_counts[missing_counts > 0].sort_values(ascending=False).items():
            logging.warning("  • %-20s %d missing", col, cnt)

    def run_full_report(self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
        """
        Run the selected analyses in sequence.

        Args:
            show_summary: When true, log the structural/statistical summary.
            check_missing: When true, log the missing-value check.
            head_n: Number of leading rows shown in the summary.
        """
        if show_summary:
            self.log_summary(head_n=head_n)
        if check_missing:
            self.log_missing_values()

[PT-BR] Classe utilitária para executar diferentes validações em um CSV de Pokémon. [EN] Utility class to run various validations on a Pokémon CSV.

PokemonCSVAnalyzer(csv_path: str, encoding: str = 'utf-8')
28    def __init__(self, csv_path: str, encoding: str = "utf-8"):
29        self.csv_path = csv_path
30        self.encoding = encoding
31        try:
32            self.df = pd.read_csv(csv_path, encoding=encoding)
33            logging.info("CSV loaded successfully from '%s' (%d rows, %d columns)",
34                         csv_path, len(self.df), len(self.df.columns))
35        except (FileNotFoundError, UnicodeDecodeError, EmptyDataError, ParserError) as e:
36            logging.error("Failed to load CSV '%s': %s", csv_path, str(e), exc_info=True)
37            raise
csv_path
encoding
def log_summary(self, head_n: int = 5) -> None:
39    def log_summary(self, head_n: int = 5) -> None:
40        """
41        [PT-BR] Exibe no log um resumo estatístico e estrutural do conjunto de dados.
42        [EN] Logs a structural and statistical summary of the dataset.
43        """
44        logging.info("=== Dataset summary ===")
45        logging.info("Total rows   : %d", len(self.df))
46        logging.info("Total columns: %d", len(self.df.columns))
47        logging.info("Column names : %s", list(self.df.columns))
48
49        logging.info("Column dtypes:")
50        for col, dtype in self.df.dtypes.items():
51            logging.info("  • %-20s %s", col, dtype)
52
53        logging.info("First %d rows:", head_n)
54        for idx, row in self.df.head(head_n).iterrows():
55            logging.info("  Row %d%s", idx, row.to_dict())
56
57        stats = self.df.describe(include='all').transpose()
58        logging.info("Statistical summary:")
59        for col in stats.index:
60            logging.info("  • %-20s %s", col, stats.loc[col].dropna().to_dict())

[PT-BR] Exibe no log um resumo estatístico e estrutural do conjunto de dados. [EN] Logs a structural and statistical summary of the dataset.

def log_missing_values(self) -> None:
62    def log_missing_values(self) -> None:
63        """
64        [PT-BR] Exibe no log a quantidade de valores ausentes por coluna.
65        [EN] Logs the number of missing (blank or empty) values per column.
66        """
67        missing_mask = self.df.isna() | (self.df == "")
68        missing_counts = missing_mask.sum()
69        total_missing = missing_counts.sum()
70
71        logging.info("=== Missing-value check ===")
72        if total_missing == 0:
73            logging.info("All columns are fully populated.")
74            return
75
76        logging.warning("Found %d missing value(s) across the dataset.", total_missing)
77        for col, cnt in missing_counts[missing_counts > 0].sort_values(ascending=False).items():
78            logging.warning("  • %-20s %d missing", col, cnt)

[PT-BR] Exibe no log a quantidade de valores ausentes por coluna. [EN] Logs the number of missing (blank or empty) values per column.

def run_full_report( self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
80    def run_full_report(self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
81        """
82        [PT-BR] Executa as análises selecionadas em sequência.
83        [EN] Runs selected analyses in sequence.
84
85        Parâmetros:
86            show_summary (bool): mostra o resumo estrutural e estatístico.
87            check_missing (bool): verifica valores ausentes.
88            head_n (int): número de linhas a exibir no resumo.
89        """
90        if show_summary:
91            self.log_summary(head_n=head_n)
92        if check_missing:
93            self.log_missing_values()

[PT-BR] Executa as análises selecionadas em sequência. [EN] Runs selected analyses in sequence.

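Parâmetros / Parameters:

- show_summary (bool): mostra o resumo estrutural e estatístico / logs the structural and statistical summary.
- check_missing (bool): verifica valores ausentes / checks for missing values.
- head_n (int): número de linhas a exibir no resumo / number of rows shown in the summary.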