main
Módulo main.py
[PT-BR] Ponto de entrada para o sistema de captura de dados Pokémon.
Fluxo principal:
1. Carrega variáveis de configuração do arquivo `.env` (START_PAGE, OUTPUT_FILE).
2. Configura o sistema de logging.
3. Descobre todas as páginas de listagem de Pokémon a partir da página inicial.
4. Faz o *crawl* de cada página, parseia as tabelas e cria objetos `Pokemon`.
5. Exporta o conjunto consolidado para um CSV.
6. Executa uma análise opcional do CSV gerado.

Variáveis de ambiente esperadas (definidas em `.env`):
    START_PAGE   URL da página inicial a ser rastreada.
    OUTPUT_FILE  Caminho do arquivo CSV de saída.
[EN] Entry point for the Pokémon data capture system.

Main flow:
1. Loads configuration variables from the `.env` file (START_PAGE, OUTPUT_FILE).
2. Sets up the logging system.
3. Discovers all Pokémon listing pages starting from the initial one.
4. Crawls each page, parses the tables, and creates `Pokemon` objects.
5. Exports the consolidated data to a CSV file.
6. Optionally runs an analysis of the generated CSV.

Expected environment variables (defined in `.env`):
    START_PAGE   Initial page URL to be crawled.
    OUTPUT_FILE  Output CSV file path.

Usage:
    $ python main.py
"""Entry point for the Pokémon data capture system.

Main flow:
1. Loads configuration variables from the ``.env`` file (START_PAGE, OUTPUT_FILE).
2. Sets up the logging system.
3. Discovers all Pokémon listing pages starting from the initial one.
4. Crawls each page, parses the tables, and creates ``Pokemon`` objects.
5. Exports the consolidated data to a CSV file.
6. Optionally runs an analysis of the generated CSV.

Expected environment variables (defined in ``.env``):
    START_PAGE   Initial page URL to be crawled.
    OUTPUT_FILE  Output CSV file path.

Usage:
    $ python main.py
"""
import os
import logging
from pathlib import Path

from dotenv import load_dotenv  # type: ignore

from services.pokemon_crawler import PokemonCrawler
from services.quests import QuestPokemon
from services.logging import setup_logging
from services.csv_writer import write_pokemon_csv  # type: ignore
from services.csv_analyzer import PokemonCSVAnalyzer

# ----------------------------------------------------------------------------
# Load environment variables and make sure the output folder exists.
# ----------------------------------------------------------------------------
load_dotenv()
START_PAGE: str = os.getenv(
    "START_PAGE", "https://pokemythology.net/conteudo/pokemon/lista01.htm"
)
OUTPUT_FILE: str = os.getenv("OUTPUT_FILE", "output/pokemons.csv")
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)


def discover_urls(start_url: str) -> list[str]:
    """Return every listing-page URL reachable from *start_url*.

    The start page itself is guaranteed to appear first in the result.
    """
    urls = PokemonCrawler.discover_pages(start_url)
    if start_url not in urls:
        urls.insert(0, start_url)
    return urls


def crawl_all_pages(urls: list[str]) -> list[dict]:
    """Crawl every page in *urls* and return the consolidated Pokémon list.

    A failure on one page is logged (with traceback) and skipped so a single
    bad page does not abort the whole capture.
    """
    all_pokemons: list[dict] = []
    for url in urls:
        logging.info("Crawling Pokémon from: %s", url)
        # Bug fix: the original `print(print(...))` printed the quest text and
        # then a stray "None" (the inner print() returns None). Print once.
        print(QuestPokemon(url).to_text())

        try:
            pokemons = PokemonCrawler(url).crawl()
            all_pokemons.extend(pokemons)
            print(f" + {len(pokemons)} Pokémon captured on this page.\n")
            logging.info(" + %d Pokémon captured on this page.", len(pokemons))
        except Exception:
            # Best-effort crawl: record the error and continue with the rest.
            logging.exception("Failed to process %s", url)
    return all_pokemons


def main() -> None:
    """Run the full pipeline: discover pages, crawl, export CSV, analyze."""
    setup_logging()
    urls = discover_urls(START_PAGE)
    print(f"{len(urls)} pages found. Starting capture…")

    all_pokemons = crawl_all_pages(urls)

    if not all_pokemons:
        logging.warning("No Pokémon captured.")
        return

    written = write_pokemon_csv(all_pokemons, OUTPUT_FILE)
    print(f"\n{written} Pokémon exported to '{OUTPUT_FILE}'.")

    PokemonCSVAnalyzer(OUTPUT_FILE).run_full_report()


if __name__ == "__main__":
    main()
START_PAGE: str =
$START_PAGE
OUTPUT_FILE: str =
$OUTPUT_FILE
def
discover_urls(start_url: str) -> list[str]:
def
crawl_all_pages(urls: list[str]) -> list[dict]:
78def crawl_all_pages(urls: list[str]) -> list[dict]: 79 all_pokemons = [] 80 for url in urls: 81 logging.info("Crawling Pokémon from: %s", url) 82 print(print(QuestPokemon(url).to_text())) 83 84 try: 85 pokemons = PokemonCrawler(url).crawl() 86 all_pokemons.extend(pokemons) 87 print(f" + {len(pokemons)} Pokémon captured on this page.\n") 88 logging.info(" + %d Pokémon captured on this page.", len(pokemons)) 89 except Exception: 90 logging.error("Failed to process %s", url, exc_info=True) 91 return all_pokemons
def
main() -> None:
98def main() -> None: 99 setup_logging() 100 urls = discover_urls(START_PAGE) 101 print(f"{len(urls)} pages found. Starting capture…") 102 103 all_pokemons = crawl_all_pages(urls) 104 105 if not all_pokemons: 106 logging.warning("No Pokémon captured.") 107 return 108 109 written = write_pokemon_csv(all_pokemons, OUTPUT_FILE) 110 print(f"\n{written} Pokémon exported to '{OUTPUT_FILE}'.") 111 112 PokemonCSVAnalyzer(OUTPUT_FILE).run_full_report()