services.pokemon_crawler
Module pokemon_crawler.py

Module responsible for crawling pokemythology.net pages and extracting
tabular Pokémon information.

The intended usage flow is:

- Instantiate ``PokemonCrawler`` with the URL of a list or individual page.
- Call ``crawl()``, which returns a ``list[Pokemon]``.

The module also provides ``discover_pages`` (a static method) that, starting
from lista01.htm, discovers all other relevant HTML pages.

Typical usage:

    from services.pokemon_crawler import PokemonCrawler

    urls = PokemonCrawler.discover_pages("https://pokemythology.net/conteudo/pokemon/lista01.htm")
    for url in urls:
        pokemons = PokemonCrawler(url).crawl()
        for p in pokemons:
            print(p.to_dict())
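The same flow extends naturally to persisting results. Below is a minimal
end-to-end sketch under assumptions: "pokemons.json" is a hypothetical output
path, and the ``to_dict()`` shape is taken on faith from the usage above.

    # Minimal sketch: crawl every discovered page and dump the results to JSON.
    # "pokemons.json" is a hypothetical path, not part of this module.
    import json

    from services.pokemon_crawler import PokemonCrawler

    urls = PokemonCrawler.discover_pages(
        "https://pokemythology.net/conteudo/pokemon/lista01.htm"
    )
    all_rows = []
    for url in urls:
        # Each page yields zero or more Pokemon, one per <table id=...>.
        all_rows.extend(p.to_dict() for p in PokemonCrawler(url).crawl())

    with open("pokemons.json", "w", encoding="utf-8") as fh:
        json.dump(all_rows, fh, ensure_ascii=False, indent=2)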
1""" 2Módulo pokemon_crawler.py 3========================== 4 5[PT-BR] 6Módulo responsável por varrer ("crawlear") páginas do site **pokemythology.net** 7e extrair informações tabulares sobre Pokémon. 8O fluxo de uso previsto é: 9 10- Instanciar ``PokemonCrawler`` com a URL de uma lista ou página individual. 11- Chamar ``crawl()`` - que devolve uma ``list[Pokemon]``. 12 13O módulo também oferece ``discover_pages`` (método estático) para, a partir da 14página *lista01.htm*, descobrir todos os demais HTML relevantes. 15 16[EN] 17Module responsible for crawling **pokemythology.net** pages and extracting 18tabular Pokémon information. 19Typical usage flow: 20 21- Instantiate ``PokemonCrawler`` with a list or individual page URL. 22- Call ``crawl()`` - returns a ``list[Pokemon]``. 23 24The module also provides ``discover_pages`` (static method), which, starting from 25*lista01.htm*, discovers all relevant HTML pages. 26 27Uso típico / Typical usage: 28 from services.pokemon_crawler import PokemonCrawler 29 30 urls = PokemonCrawler.discover_pages("https://pokemythology.net/conteudo/pokemon/lista01.htm") 31 for url in urls: 32 pokemons = PokemonCrawler(url).crawl() 33 for p in pokemons: 34 print(p.to_dict()) 35""" 36from __future__ import annotations 37 38import logging 39from typing import Iterable 40from urllib.error import HTTPError, URLError 41from urllib.parse import urljoin 42from urllib.request import Request, urlopen 43 44import requests # type: ignore 45from bs4 import BeautifulSoup, Tag # type: ignore 46 47from models.pokemon import Pokemon 48from models.pokemon_builder import PokemonBuilder 49 50class PokemonFields: 51 NUM = "Nº" 52 NAME = "Nome" 53 TYPE = "Tipo" 54 IMAGE = "Imagem" 55 SHINY = "Coloração Shiny" 56 57class PokemonCrawler: 58 BASE_URL = "https://pokemythology.net" 59 60 def __init__(self, url: str) -> None: 61 self.url = url 62 63 @staticmethod 64 def discover_pages(start_page: str) -> list[str]: 65 resp = requests.get(start_page, headers={"User-Agent": "Mozilla/5.0"}, timeout=15) 66 resp.encoding = "utf-8" 67 soup = BeautifulSoup(resp.text, "html.parser") 68 69 links = [urljoin(PokemonCrawler.BASE_URL, a["href"]) 70 for a in soup.find_all("a", href=True) 71 if a["href"].startswith("/conteudo/pokemon/") and a["href"].endswith(".htm")] 72 73 return sorted(set(links)) 74 75 # ------------------------------------------------------------------ 76 # [PT-BR] Faz download do HTML da URL com *user-agent* customizado. 77 # [EN] Downloads HTML content from the given URL with a custom user-agent. 
78 # ------------------------------------------------------------------ 79 def fetch_html(self) -> str: 80 try: 81 req = Request(self.url, headers={"User-Agent": "Mozilla/5.0"}) 82 with urlopen(req) as resp: 83 return resp.read().decode("latin1") 84 except (URLError, HTTPError) as e: 85 logging.error("Error accessing URL %s: %s", self.url, str(e), exc_info=True) 86 raise 87 88 # ------------------------------------------------------------------ 89 # [PT-BR] Pipeline público 90 # [EN] Public pipeline 91 # ------------------------------------------------------------------ 92 def crawl(self) -> list[Pokemon]: 93 html = self.fetch_html() 94 return list(self._parse_tables(html)) 95 96 # ------------------------------------------------------------------ 97 # [PT-BR] Parsing interno 98 # [EN] Internal parsing 99 # ------------------------------------------------------------------ 100 def _parse_tables(self, html: str) -> Iterable[Pokemon]: 101 soup = BeautifulSoup(html, "html.parser") 102 for table in soup.find_all("table", id=True): 103 try: 104 yield self._parse_single_table(table) 105 except (AttributeError, IndexError, TypeError) as e: 106 logging.error("Error parsing table on URL %s: %s", self.url, str(e), exc_info=True) 107 108 def _parse_single_table(self, table: Tag) -> Pokemon: 109 row_data: dict[str, str] = {} 110 trs = table.find_all("tr") 111 112 for pos, tr in enumerate(trs): 113 tds = tr.find_all("td") 114 if not tds: 115 continue 116 117 self._maybe_extract_main_image(tds, row_data) 118 self._maybe_extract_number(tds, row_data) 119 self._maybe_extract_shiny(tr, trs, pos, row_data) 120 self._maybe_extract_label_value_pairs(tds, row_data) 121 122 return self._build_pokemon(row_data) 123 124 def _maybe_extract_main_image(self, tds: list[Tag], row_data: dict[str, str]) -> None: 125 if PokemonFields.IMAGE not in row_data: 126 img_tag = tds[0].find("img") 127 if img_tag and img_tag.get("src"): 128 row_data[PokemonFields.IMAGE] = urljoin(self.BASE_URL, img_tag["src"]) 129 130 def _maybe_extract_number(self, tds: list[Tag], row_data: dict[str, str]) -> None: 131 if len(tds) >= 3 and tds[1].get_text(strip=True) == f"{PokemonFields.NUM}:": 132 row_data[PokemonFields.NUM] = tds[2].get_text(strip=True) 133 elif len(tds) >= 2 and PokemonFields.NUM in tds[0].get_text(): 134 row_data[PokemonFields.NUM] = tds[1].get_text(strip=True) 135 136 def _maybe_extract_shiny(self, tr: Tag, trs: list[Tag], pos: int, row_data: dict[str, str]) -> None: 137 line_txt = tr.get_text(" ", strip=True).lower() 138 if "coloração shiny" in line_txt: 139 shiny_img = tr.find("img") or (trs[pos + 1].find("img") if pos + 1 < len(trs) else None) 140 if shiny_img and shiny_img.get("src"): 141 row_data[PokemonFields.SHINY] = urljoin(self.BASE_URL, shiny_img["src"]) 142 elif len(tr.find_all("td")) >= 2 and "Nome:" in tr.find_all("td")[0].get_text(): 143 img = tr.find("img") 144 if img and img.get("src"): 145 row_data[PokemonFields.SHINY] = urljoin(self.BASE_URL, img["src"]) 146 147 def _maybe_extract_label_value_pairs(self, tds: list[Tag], row_data: dict[str, str]) -> None: 148 for i in range(0, len(tds) - 1, 2): 149 label = tds[i].get_text(strip=True) 150 if not label.endswith(":"): 151 continue 152 value = " ".join(tds[i + 1].get_text(" ", strip=True).split()) 153 row_data[label.rstrip(":")] = value 154 155 def _build_pokemon(self, data: dict[str, str]) -> Pokemon: 156 builder = PokemonBuilder() 157 158 if PokemonFields.NUM in data: 159 builder.number(data[PokemonFields.NUM]) 160 if PokemonFields.NAME in data: 161 
builder.name(data[PokemonFields.NAME]) 162 if PokemonFields.TYPE in data: 163 for t in data[PokemonFields.TYPE].split("/"): 164 builder.add_type(t.strip()) 165 if PokemonFields.IMAGE in data: 166 builder.image(data[PokemonFields.IMAGE]) 167 168 for k, v in data.items(): 169 if k not in {PokemonFields.NUM, PokemonFields.NAME, PokemonFields.TYPE, PokemonFields.IMAGE}: 170 builder.add_attribute(k, v) 171 172 return builder.build()
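The crawler delegates object construction to models.pokemon.Pokemon and
models.pokemon_builder.PokemonBuilder, which are not part of this module.
The following is a minimal, hypothetical sketch of those two classes, inferred
only from the calls in ``_build_pokemon()`` and the ``to_dict()`` usage in the
docstring; the real models may well differ.

    # Hypothetical stand-ins for models.pokemon / models.pokemon_builder,
    # inferred from the builder calls in _build_pokemon(). Illustrative only.
    from __future__ import annotations

    from dataclasses import dataclass, field


    @dataclass
    class Pokemon:
        number: str | None = None
        name: str | None = None
        types: list[str] = field(default_factory=list)
        image: str | None = None
        attributes: dict[str, str] = field(default_factory=dict)

        def to_dict(self) -> dict:
            return {
                "number": self.number,
                "name": self.name,
                "types": self.types,
                "image": self.image,
                **self.attributes,
            }


    class PokemonBuilder:
        def __init__(self) -> None:
            self._p = Pokemon()

        def number(self, value: str) -> "PokemonBuilder":
            self._p.number = value
            return self

        def name(self, value: str) -> "PokemonBuilder":
            self._p.name = value
            return self

        def add_type(self, value: str) -> "PokemonBuilder":
            self._p.types.append(value)
            return self

        def image(self, value: str) -> "PokemonBuilder":
            self._p.image = value
            return self

        def add_attribute(self, key: str, value: str) -> "PokemonBuilder":
            self._p.attributes[key] = value
            return self

        def build(self) -> Pokemon:
            return self._p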
class PokemonFields:
    Label constants ("Nº", "Nome", "Tipo", "Imagem", "Coloração Shiny") used as
    keys for the data extracted from the source tables.
class PokemonCrawler:
    Crawler for a single pokemythology.net page: downloads the HTML and yields
    one Pokemon per table found on the page.
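For illustration, the snippet below runs the parsing stage offline on an inline
HTML fragment shaped like the label/value tables the parser expects. It assumes
the project's models are importable, and it calls the private ``_parse_tables``
directly, which is done here only to show the expected table layout.

    # Illustrative offline parse of a hand-written table fragment.
    from services.pokemon_crawler import PokemonCrawler

    html = """
    <table id="p001">
      <tr>
        <td><img src="/imagens/bulbasaur.png"></td>
        <td>Nº:</td><td>001</td>
      </tr>
      <tr><td>Nome:</td><td>Bulbasaur</td></tr>
      <tr><td>Tipo:</td><td>Grama / Veneno</td></tr>
      <tr><td>Coloração Shiny</td></tr>
      <tr><td><img src="/imagens/bulbasaur_shiny.png"></td></tr>
    </table>
    """

    crawler = PokemonCrawler("offline://example")  # URL only used for logging here
    for pokemon in crawler._parse_tables(html):  # private; for illustration only
        # Expect number, name, and types, plus the main image and the shiny
        # image URL (the latter routed through add_attribute).
        print(pokemon.to_dict())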
@staticmethod
def discover_pages(start_page: str) -> list[str]:
    Starting from the given list page (typically lista01.htm), collects the
    absolute URLs of all /conteudo/pokemon/*.htm links, de-duplicated and sorted.
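Because discover_pages issues a live GET, an offline check needs requests.get
stubbed out. A hedged sketch using the standard library's unittest.mock (the
fake HTML and expected URLs are made up for the example):

    # Exercising discover_pages offline by stubbing requests.get.
    from unittest.mock import MagicMock, patch

    from services.pokemon_crawler import PokemonCrawler

    FAKE_HTML = """
    <a href="/conteudo/pokemon/lista01.htm">1</a>
    <a href="/conteudo/pokemon/lista02.htm">2</a>
    <a href="/outra/pagina.htm">ignored</a>
    """

    fake_resp = MagicMock(text=FAKE_HTML)
    with patch("services.pokemon_crawler.requests.get", return_value=fake_resp):
        urls = PokemonCrawler.discover_pages(
            "https://pokemythology.net/conteudo/pokemon/lista01.htm"
        )

    # Only /conteudo/pokemon/*.htm links survive, absolutized against BASE_URL.
    assert urls == [
        "https://pokemythology.net/conteudo/pokemon/lista01.htm",
        "https://pokemythology.net/conteudo/pokemon/lista02.htm",
    ]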
def fetch_html(self) -> str:
    Downloads the HTML content of self.url with a custom user-agent, decoding
    it as latin1; logs and re-raises URLError/HTTPError on failure.
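Since fetch_html re-raises network errors, callers can layer retries on top
without touching the module. A minimal sketch; the attempt count and backoff
are arbitrary choices, not part of this module:

    # Simple retry wrapper around fetch_html() with linear backoff.
    import time
    from urllib.error import HTTPError, URLError

    from services.pokemon_crawler import PokemonCrawler


    def fetch_with_retries(crawler: PokemonCrawler, attempts: int = 3, delay: float = 2.0) -> str:
        for attempt in range(1, attempts + 1):
            try:
                return crawler.fetch_html()
            except (URLError, HTTPError):
                if attempt == attempts:
                    raise  # give up; fetch_html already logged the error
                time.sleep(delay * attempt)  # back off a little more each time
        raise RuntimeError("unreachable")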