main

Module main.py

Entry point for the Pokémon data capture system.

Main flow:

  1. Loads configuration variables from the .env file (START_PAGE, OUTPUT_FILE).
  2. Sets up the logging system.
  3. Discovers all Pokémon listing pages starting from the initial one.
  4. Crawls each page, parses the tables, and creates Pokemon objects.
  5. Exports the consolidated data to a CSV file.
  6. Optionally runs an analysis of the generated CSV.

Expected environment variables (defined in .env):

  START_PAGE   Initial page URL to be crawled.
  OUTPUT_FILE  Output CSV file path.
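
For reference, a minimal .env matching the defaults hard-coded in the module:

  START_PAGE=https://pokemythology.net/conteudo/pokemon/lista01.htm
  OUTPUT_FILE=output/pokemons.csv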

Usage:

  $ python main.py
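
Because load_dotenv() does not override variables already set in the process
environment, either setting can also be supplied inline for a one-off run
(the page URL below is hypothetical):

  $ START_PAGE=https://pokemythology.net/conteudo/pokemon/lista02.htm python main.py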

  1"""
  2Módulo main.py
  3==============
  4
  5[PT-BR]
  6Ponto de entrada para o sistema de captura de dados Pokémon.
  7
  8Fluxo principal:
  91. Carrega variáveis de configuração do arquivo ``.env`` (START_PAGE, OUTPUT_FILE).
 102. Configura sistema de logging.
 113. Descobre todas as páginas de listagem de Pokémon a partir da página inicial.
 124. Faz o *crawl* de cada página, parseia as tabelas e cria objetos ``Pokemon``.
 135. Exporta o conjunto consolidado para um CSV.
 146. Executa uma análise opcional do CSV gerado.
 15
 16Variáveis de ambiente esperadas (definidas em ``.env``):
 17    START_PAGE   URL da página inicial a ser rastreada.
 18    OUTPUT_FILE  Caminho do arquivo CSV de saída.
 19
 20[EN]
 21Entry point for the Pokémon data capture system.
 22
 23Main flow:
 241. Loads configuration variables from the ``.env`` file (START_PAGE, OUTPUT_FILE).
 252. Sets up the logging system.
 263. Discovers all Pokémon listing pages starting from the initial one.
 274. Crawls each page, parses the tables, and creates ``Pokemon`` objects.
 285. Exports the consolidated data to a CSV file.
 296. Optionally runs an analysis of the generated CSV.
 30
 31Expected environment variables (defined in ``.env``):
 32    START_PAGE   Initial page URL to be crawled.
 33    OUTPUT_FILE  Output CSV file path.
 34
 35Usage:
 36    $ python main.py
 37"""
import os
import logging
from pathlib import Path
from dotenv import load_dotenv  # type: ignore

from services.pokemon_crawler import PokemonCrawler
from services.quests import QuestPokemon
from services.logging import setup_logging
from services.csv_writer import write_pokemon_csv  # type: ignore
from services.csv_analyzer import PokemonCSVAnalyzer

# ----------------------------------------------------------------------------
# Load environment variables and prepare the output folder
# ----------------------------------------------------------------------------

load_dotenv()
START_PAGE: str = os.getenv(
    "START_PAGE", "https://pokemythology.net/conteudo/pokemon/lista01.htm"
)
OUTPUT_FILE: str = os.getenv("OUTPUT_FILE", "output/pokemons.csv")
# Make sure the output directory exists before the CSV is written.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)


# ----------------------------------------------------------------------------
# Discover all listing pages from the start page
# ----------------------------------------------------------------------------
def discover_urls(start_url: str) -> list[str]:
    """Return every discovered listing-page URL, ensuring ``start_url`` is included."""
    urls = PokemonCrawler.discover_pages(start_url)
    if start_url not in urls:
        urls.insert(0, start_url)
    return urls


# ----------------------------------------------------------------------------
# Crawl all discovered pages and return the consolidated list
# ----------------------------------------------------------------------------
def crawl_all_pages(urls: list[str]) -> list[dict]:
    """Crawl every URL in ``urls``; pages that fail are logged and skipped."""
    all_pokemons = []
    for url in urls:
        logging.info("Crawling Pokémon from: %s", url)
        print(QuestPokemon(url).to_text())

        try:
            pokemons = PokemonCrawler(url).crawl()
            all_pokemons.extend(pokemons)
            print(f"  + {len(pokemons)} Pokémon captured on this page.\n")
            logging.info("  + %d Pokémon captured on this page.", len(pokemons))
        except Exception:
            logging.error("Failed to process %s", url, exc_info=True)
    return all_pokemons


# ----------------------------------------------------------------------------
# Main function
# ----------------------------------------------------------------------------
def main() -> None:
    """Run the full pipeline: discover, crawl, export to CSV, then analyze."""
    setup_logging()
    urls = discover_urls(START_PAGE)
    print(f"{len(urls)} pages found. Starting capture…")

    all_pokemons = crawl_all_pages(urls)

    if not all_pokemons:
        logging.warning("No Pokémon captured.")
        return

    written = write_pokemon_csv(all_pokemons, OUTPUT_FILE)
    print(f"\n{written} Pokémon exported to '{OUTPUT_FILE}'.")

    PokemonCSVAnalyzer(OUTPUT_FILE).run_full_report()


# ----------------------------------------------------------------------------
# Direct execution
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    main()
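
Besides running the script directly, the two pipeline helpers can be reused
from other code. A minimal sketch, assuming main.py is importable as a module
and the services package is on the path:

  from main import discover_urls, crawl_all_pages

  # Default start page from the module; any listing URL works the same way.
  urls = discover_urls("https://pokemythology.net/conteudo/pokemon/lista01.htm")
  records = crawl_all_pages(urls)  # consolidated list of parsed records
  print(f"{len(records)} Pokémon collected from {len(urls)} pages")

Note that importing main executes the module-level setup (load_dotenv() and
the creation of the output folder) as a side effect.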