services.csv_analyzer
Módulo csv_analyzer.py
[PT-BR] Responsável por analisar o CSV gerado pelo sistema de captura de dados dos Pokémon, identificando dados ausentes, tipos de dados e gerando um sumário básico de informações.
[EN] Responsible for analyzing the CSV generated by the Pokémon data collection system, identifying missing data, data types, and generating a basic dataset summary.
Uso típico / Typical usage:
analyzer = PokemonCSVAnalyzer("output/pokemons.csv")
analyzer.run_full_report()
"""
Module csv_analyzer.py
======================

Analyzes the CSV generated by the Pokémon data collection system,
identifying missing data, data types, and generating a basic dataset summary.

Typical usage:

    analyzer = PokemonCSVAnalyzer("output/pokemons.csv")
    analyzer.run_full_report()
"""
import logging
from typing import Optional
import pandas as pd  # type: ignore
from pandas.errors import EmptyDataError, ParserError  # type: ignore


class PokemonCSVAnalyzer:
    """Utility class to run various validations on a Pokémon CSV."""

    def __init__(self, csv_path: str, encoding: str = "utf-8"):
        """Load the CSV at ``csv_path`` into ``self.df``.

        Args:
            csv_path: Path of the CSV file to read.
            encoding: Text encoding forwarded to ``pandas.read_csv``.

        Raises:
            FileNotFoundError, UnicodeDecodeError, EmptyDataError, ParserError:
                Logged at ERROR level (with traceback) and re-raised unchanged.
        """
        self.csv_path = csv_path
        self.encoding = encoding
        try:
            self.df = pd.read_csv(csv_path, encoding=encoding)
            logging.info("CSV loaded successfully from '%s' (%d rows, %d columns)",
                         csv_path, len(self.df), len(self.df.columns))
        except (FileNotFoundError, UnicodeDecodeError, EmptyDataError, ParserError) as e:
            logging.error("Failed to load CSV '%s': %s", csv_path, str(e), exc_info=True)
            raise

    def log_summary(self, head_n: int = 5) -> None:
        """Log a structural and statistical summary of the dataset.

        Args:
            head_n: Number of leading rows to include in the preview.
        """
        logging.info("=== Dataset summary ===")
        logging.info("Total rows : %d", len(self.df))
        logging.info("Total columns: %d", len(self.df.columns))
        logging.info("Column names : %s", list(self.df.columns))

        logging.info("Column dtypes:")
        for col, dtype in self.df.dtypes.items():
            logging.info(" • %-20s %s", col, dtype)

        logging.info("First %d rows:", head_n)
        for idx, row in self.df.head(head_n).iterrows():
            logging.info(" Row %d → %s", idx, row.to_dict())

        # Transposed so that each row of ``stats`` describes one column.
        stats = self.df.describe(include='all').transpose()
        logging.info("Statistical summary:")
        for col in stats.index:
            logging.info(" • %-20s %s", col, stats.loc[col].dropna().to_dict())

    def log_missing_values(self) -> None:
        """Log the number of missing (blank or empty) values per column.

        A cell is treated as missing when it is NaN/None, or when it is a
        string that is empty or consists only of whitespace.
        """
        def _is_blank(value) -> bool:
            # Only strings can be "blank"; numbers and NaN are handled by isna().
            return isinstance(value, str) and not value.strip()

        # astype(bool) keeps the mask boolean even when the frame has no rows,
        # in which case ``apply`` hands back a copy with the original dtypes.
        blank_mask = self.df.apply(lambda column: column.map(_is_blank)).astype(bool)
        missing_mask = self.df.isna() | blank_mask
        missing_counts = missing_mask.sum()
        total_missing = missing_counts.sum()

        logging.info("=== Missing-value check ===")
        if total_missing == 0:
            logging.info("All columns are fully populated.")
            return

        logging.warning("Found %d missing value(s) across the dataset.", total_missing)
        # Worst-affected columns first.
        for col, cnt in missing_counts[missing_counts > 0].sort_values(ascending=False).items():
            logging.warning(" • %-20s %d missing", col, cnt)

    def run_full_report(self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
        """Run the selected analyses in sequence.

        Args:
            show_summary: Log the structural and statistical summary.
            check_missing: Log the missing-value check.
            head_n: Number of rows shown in the summary preview.
        """
        if show_summary:
            self.log_summary(head_n=head_n)
        if check_missing:
            self.log_missing_values()
class PokemonCSVAnalyzer:
    """Utility class to run various validations on a Pokémon CSV."""

    def __init__(self, csv_path: str, encoding: str = "utf-8"):
        """Load the CSV at ``csv_path`` into ``self.df``.

        Args:
            csv_path: Path of the CSV file to read.
            encoding: Text encoding forwarded to ``pandas.read_csv``.

        Raises:
            FileNotFoundError, UnicodeDecodeError, EmptyDataError, ParserError:
                Logged at ERROR level (with traceback) and re-raised unchanged.
        """
        self.csv_path = csv_path
        self.encoding = encoding
        try:
            self.df = pd.read_csv(csv_path, encoding=encoding)
            logging.info("CSV loaded successfully from '%s' (%d rows, %d columns)",
                         csv_path, len(self.df), len(self.df.columns))
        except (FileNotFoundError, UnicodeDecodeError, EmptyDataError, ParserError) as e:
            logging.error("Failed to load CSV '%s': %s", csv_path, str(e), exc_info=True)
            raise

    def log_summary(self, head_n: int = 5) -> None:
        """Log a structural and statistical summary of the dataset.

        Args:
            head_n: Number of leading rows to include in the preview.
        """
        logging.info("=== Dataset summary ===")
        logging.info("Total rows : %d", len(self.df))
        logging.info("Total columns: %d", len(self.df.columns))
        logging.info("Column names : %s", list(self.df.columns))

        logging.info("Column dtypes:")
        for col, dtype in self.df.dtypes.items():
            logging.info(" • %-20s %s", col, dtype)

        logging.info("First %d rows:", head_n)
        for idx, row in self.df.head(head_n).iterrows():
            logging.info(" Row %d → %s", idx, row.to_dict())

        # Transposed so that each row of ``stats`` describes one column.
        stats = self.df.describe(include='all').transpose()
        logging.info("Statistical summary:")
        for col in stats.index:
            logging.info(" • %-20s %s", col, stats.loc[col].dropna().to_dict())

    def log_missing_values(self) -> None:
        """Log the number of missing (blank or empty) values per column.

        A cell is treated as missing when it is NaN/None, or when it is a
        string that is empty or consists only of whitespace.
        """
        def _is_blank(value) -> bool:
            # Only strings can be "blank"; numbers and NaN are handled by isna().
            return isinstance(value, str) and not value.strip()

        # astype(bool) keeps the mask boolean even when the frame has no rows,
        # in which case ``apply`` hands back a copy with the original dtypes.
        blank_mask = self.df.apply(lambda column: column.map(_is_blank)).astype(bool)
        missing_mask = self.df.isna() | blank_mask
        missing_counts = missing_mask.sum()
        total_missing = missing_counts.sum()

        logging.info("=== Missing-value check ===")
        if total_missing == 0:
            logging.info("All columns are fully populated.")
            return

        logging.warning("Found %d missing value(s) across the dataset.", total_missing)
        # Worst-affected columns first.
        for col, cnt in missing_counts[missing_counts > 0].sort_values(ascending=False).items():
            logging.warning(" • %-20s %d missing", col, cnt)

    def run_full_report(self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
        """Run the selected analyses in sequence.

        Args:
            show_summary: Log the structural and statistical summary.
            check_missing: Log the missing-value check.
            head_n: Number of rows shown in the summary preview.
        """
        if show_summary:
            self.log_summary(head_n=head_n)
        if check_missing:
            self.log_missing_values()
[PT-BR] Classe utilitária para executar diferentes validações em um CSV de Pokémon. [EN] Utility class to run various validations on a Pokémon CSV.
28 def __init__(self, csv_path: str, encoding: str = "utf-8"): 29 self.csv_path = csv_path 30 self.encoding = encoding 31 try: 32 self.df = pd.read_csv(csv_path, encoding=encoding) 33 logging.info("CSV loaded successfully from '%s' (%d rows, %d columns)", 34 csv_path, len(self.df), len(self.df.columns)) 35 except (FileNotFoundError, UnicodeDecodeError, EmptyDataError, ParserError) as e: 36 logging.error("Failed to load CSV '%s': %s", csv_path, str(e), exc_info=True) 37 raise
def log_summary(self, head_n: int = 5) -> None:
    """Log a structural and statistical overview of ``self.df`` at INFO level.

    Reports, in order: row and column counts, column names, each column's
    dtype, the first ``head_n`` rows as dictionaries, and the per-column
    result of ``describe(include='all')`` with NaN entries dropped.

    Args:
        head_n: Number of leading rows to include in the preview.
    """
    frame = self.df
    column_labels = frame.columns

    logging.info("=== Dataset summary ===")
    logging.info("Total rows : %d", len(frame))
    logging.info("Total columns: %d", len(column_labels))
    logging.info("Column names : %s", list(column_labels))

    logging.info("Column dtypes:")
    # ``frame.dtypes`` is indexed by the column labels, so the two line up.
    for label, kind in zip(column_labels, frame.dtypes):
        logging.info(" • %-20s %s", label, kind)

    logging.info("First %d rows:", head_n)
    preview = frame.head(head_n)
    for position, record in preview.iterrows():
        as_mapping = record.to_dict()
        logging.info(" Row %d → %s", position, as_mapping)

    # One row per original column after transposing.
    per_column_stats = frame.describe(include='all').transpose()
    logging.info("Statistical summary:")
    for label in per_column_stats.index:
        present_stats = per_column_stats.loc[label].dropna()
        logging.info(" • %-20s %s", label, present_stats.to_dict())
[PT-BR] Exibe no log um resumo estatístico e estrutural do conjunto de dados. [EN] Logs a structural and statistical summary of the dataset.
def log_missing_values(self) -> None:
    """Log the number of missing (blank or empty) values per column.

    A cell is treated as missing when it is NaN/None, or when it is a string
    that is empty or consists only of whitespace. When nothing is missing a
    single INFO line is logged; otherwise the total and the per-column counts
    (largest first) are logged at WARNING level.
    """
    def _is_blank(value) -> bool:
        # Only strings can be "blank"; numbers and NaN are handled by isna().
        return isinstance(value, str) and not value.strip()

    # astype(bool) keeps the mask boolean even when the frame has no rows,
    # in which case ``apply`` hands back a copy with the original dtypes.
    blank_mask = self.df.apply(lambda column: column.map(_is_blank)).astype(bool)
    missing_mask = self.df.isna() | blank_mask
    missing_counts = missing_mask.sum()
    total_missing = missing_counts.sum()

    logging.info("=== Missing-value check ===")
    if total_missing == 0:
        logging.info("All columns are fully populated.")
        return

    logging.warning("Found %d missing value(s) across the dataset.", total_missing)
    # Worst-affected columns first.
    for col, cnt in missing_counts[missing_counts > 0].sort_values(ascending=False).items():
        logging.warning(" • %-20s %d missing", col, cnt)
[PT-BR] Exibe no log a quantidade de valores ausentes por coluna. [EN] Logs the number of missing (blank or empty) values per column.
def run_full_report(self, show_summary: bool = True, check_missing: bool = True, head_n: int = 5) -> None:
    """Run the selected analyses one after another.

    Args:
        show_summary: When true, call ``log_summary`` with ``head_n``.
        check_missing: When true, call ``log_missing_values``.
        head_n: Number of rows shown in the summary preview.
    """
    # (flag, action) pairs in execution order; actions are resolved lazily so
    # nothing is looked up or called unless its flag is set.
    plan = (
        (show_summary, lambda: self.log_summary(head_n=head_n)),
        (check_missing, lambda: self.log_missing_values()),
    )
    for enabled, action in plan:
        if enabled:
            action()
[PT-BR] Executa as análises selecionadas em sequência. [EN] Runs selected analyses in sequence.
Parâmetros: show_summary (bool): mostra o resumo estrutural e estatístico. check_missing (bool): verifica valores ausentes. head_n (int): número de linhas a exibir no resumo.