services.pokemon_crawler
Module pokemon_crawler.py

Module responsible for crawling pokemythology.net pages and extracting
tabular Pokémon information.

The intended usage flow is:

- Instantiate ``PokemonCrawler`` with the URL of a list or individual page.
- Call ``crawl()``, which returns a ``list[Pokemon]``.

The module also provides ``discover_pages`` (a static method) that, starting
from lista01.htm, discovers all other relevant HTML pages.

Typical usage:

    from services.pokemon_crawler import PokemonCrawler

    urls = PokemonCrawler.discover_pages("https://pokemythology.net/conteudo/pokemon/lista01.htm")
    for url in urls:
        pokemons = PokemonCrawler(url).crawl()
        for p in pokemons:
            print(p.to_dict())
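The same flow extends naturally to persisting results. Below is a minimal
end-to-end sketch under assumptions: "pokemons.json" is a hypothetical output
path, and the ``to_dict()`` shape is taken on faith from the usage above.

    # Minimal sketch: crawl every discovered page and dump the results to JSON.
    # "pokemons.json" is a hypothetical path, not part of this module.
    import json

    from services.pokemon_crawler import PokemonCrawler

    urls = PokemonCrawler.discover_pages(
        "https://pokemythology.net/conteudo/pokemon/lista01.htm"
    )
    all_rows = []
    for url in urls:
        # Each page yields zero or more Pokemon, one per <table id=...>.
        all_rows.extend(p.to_dict() for p in PokemonCrawler(url).crawl())

    with open("pokemons.json", "w", encoding="utf-8") as fh:
        json.dump(all_rows, fh, ensure_ascii=False, indent=2)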
1""" 2Módulo pokemon_crawler.py 3========================== 4 5[PT-BR] 6Módulo responsável por varrer ("crawlear") páginas do site **pokemythology.net** 7e extrair informações tabulares sobre Pokémon. 8O fluxo de uso previsto é: 9 10- Instanciar ``PokemonCrawler`` com a URL de uma lista ou página individual. 11- Chamar ``crawl()`` - que devolve uma ``list[Pokemon]``. 12 13O módulo também oferece ``discover_pages`` (método estático) para, a partir da 14página *lista01.htm*, descobrir todos os demais HTML relevantes. 15 16[EN] 17Module responsible for crawling **pokemythology.net** pages and extracting 18tabular Pokémon information. 19Typical usage flow: 20 21- Instantiate ``PokemonCrawler`` with a list or individual page URL. 22- Call ``crawl()`` - returns a ``list[Pokemon]``. 23 24The module also provides ``discover_pages`` (static method), which, starting from 25*lista01.htm*, discovers all relevant HTML pages. 26 27Uso típico / Typical usage: 28 from services.pokemon_crawler import PokemonCrawler 29 30 urls = PokemonCrawler.discover_pages("https://pokemythology.net/conteudo/pokemon/lista01.htm") 31 for url in urls: 32 pokemons = PokemonCrawler(url).crawl() 33 for p in pokemons: 34 print(p.to_dict()) 35""" 36from __future__ import annotations 37 38import logging 39from typing import Iterable 40from urllib.error import HTTPError, URLError 41from urllib.parse import urljoin 42from urllib.request import Request, urlopen 43 44import requests # type: ignore 45from bs4 import BeautifulSoup, Tag # type: ignore 46 47from models.pokemon import Pokemon 48from models.pokemon_builder import PokemonBuilder 49 50class PokemonFields: 51 NUM = "Nº" 52 NAME = "Nome" 53 TYPE = "Tipo" 54 IMAGE = "Imagem" 55 SHINY = "Coloração Shiny" 56 57class PokemonCrawler: 58 BASE_URL = "https://pokemythology.net" 59 60 def __init__(self, url: str) -> None: 61 self.url = url 62 63 @staticmethod 64 def discover_pages(start_page: str) -> list[str]: 65 resp = requests.get(start_page, headers={"User-Agent": "Mozilla/5.0"}, timeout=15) 66 resp.encoding = "utf-8" 67 soup = BeautifulSoup(resp.text, "html.parser") 68 69 links = [urljoin(PokemonCrawler.BASE_URL, a["href"]) 70 for a in soup.find_all("a", href=True) 71 if a["href"].startswith("/conteudo/pokemon/") and a["href"].endswith(".htm")] 72 73 return sorted(set(links)) 74 75 # ------------------------------------------------------------------ 76 # [PT-BR] Faz download do HTML da URL com *user-agent* customizado. 77 # [EN] Downloads HTML content from the given URL with a custom user-agent. 
78 # ------------------------------------------------------------------ 79 def fetch_html(self) -> str: 80 try: 81 req = Request(self.url, headers={"User-Agent": "Mozilla/5.0"}) 82 with urlopen(req) as resp: 83 return resp.read().decode("latin1") 84 except (URLError, HTTPError) as e: 85 logging.error("Error accessing URL %s: %s", self.url, str(e), exc_info=True) 86 raise 87 88 # ------------------------------------------------------------------ 89 # [PT-BR] Pipeline público 90 # [EN] Public pipeline 91 # ------------------------------------------------------------------ 92 def crawl(self) -> list[Pokemon]: 93 html = self.fetch_html() 94 return list(self._parse_tables(html)) 95 96 # ------------------------------------------------------------------ 97 # [PT-BR] Parsing interno 98 # [EN] Internal parsing 99 # ------------------------------------------------------------------ 100 def _parse_tables(self, html: str) -> Iterable[Pokemon]: 101 soup = BeautifulSoup(html, "html.parser") 102 for table in soup.find_all("table", id=True): 103 try: 104 yield self._parse_single_table(table) 105 except (AttributeError, IndexError, TypeError) as e: 106 logging.error("Error parsing table on URL %s: %s", self.url, str(e), exc_info=True) 107 108 def _parse_single_table(self, table: Tag) -> Pokemon: 109 row_data: dict[str, str] = {} 110 trs = table.find_all("tr") 111 112 for pos, tr in enumerate(trs): 113 tds = tr.find_all("td") 114 if not tds: 115 continue 116 117 self._maybe_extract_main_image(tds, row_data) 118 self._maybe_extract_number(tds, row_data) 119 self._maybe_extract_shiny(tr, trs, pos, row_data) 120 self._maybe_extract_label_value_pairs(tds, row_data) 121 122 return self._build_pokemon(row_data) 123 124 def _maybe_extract_main_image(self, tds: list[Tag], row_data: dict[str, str]) -> None: 125 if PokemonFields.IMAGE not in row_data: 126 img_tag = tds[0].find("img") 127 if img_tag and img_tag.get("src"): 128 row_data[PokemonFields.IMAGE] = urljoin(self.BASE_URL, img_tag["src"]) 129 130 def _maybe_extract_number(self, tds: list[Tag], row_data: dict[str, str]) -> None: 131 if len(tds) >= 3 and tds[1].get_text(strip=True) == f"{PokemonFields.NUM}:": 132 row_data[PokemonFields.NUM] = tds[2].get_text(strip=True) 133 elif len(tds) >= 2 and PokemonFields.NUM in tds[0].get_text(): 134 row_data[PokemonFields.NUM] = tds[1].get_text(strip=True) 135 136 def _maybe_extract_shiny(self, tr: Tag, trs: list[Tag], pos: int, row_data: dict[str, str]) -> None: 137 line_txt = tr.get_text(" ", strip=True).lower() 138 if "coloração shiny" in line_txt: 139 shiny_img = tr.find("img") or (trs[pos + 1].find("img") if pos + 1 < len(trs) else None) 140 if shiny_img and shiny_img.get("src"): 141 row_data[PokemonFields.SHINY] = urljoin(self.BASE_URL, shiny_img["src"]) 142 elif len(tr.find_all("td")) >= 2 and "Nome:" in tr.find_all("td")[0].get_text(): 143 img = tr.find("img") 144 if img and img.get("src"): 145 row_data[PokemonFields.SHINY] = urljoin(self.BASE_URL, img["src"]) 146 147 def _maybe_extract_label_value_pairs(self, tds: list[Tag], row_data: dict[str, str]) -> None: 148 for i in range(0, len(tds) - 1, 2): 149 label = tds[i].get_text(strip=True) 150 if not label.endswith(":"): 151 continue 152 value = " ".join(tds[i + 1].get_text(" ", strip=True).split()) 153 row_data[label.rstrip(":")] = value 154 155 def _build_pokemon(self, data: dict[str, str]) -> Pokemon: 156 builder = PokemonBuilder() 157 158 if PokemonFields.NUM in data: 159 builder.number(data[PokemonFields.NUM]) 160 if PokemonFields.NAME in data: 161 
builder.name(data[PokemonFields.NAME]) 162 if PokemonFields.TYPE in data: 163 for t in data[PokemonFields.TYPE].split("/"): 164 builder.add_type(t.strip()) 165 if PokemonFields.IMAGE in data: 166 builder.image(data[PokemonFields.IMAGE]) 167 168 for k, v in data.items(): 169 if k not in {PokemonFields.NUM, PokemonFields.NAME, PokemonFields.TYPE, PokemonFields.IMAGE}: 170 builder.add_attribute(k, v) 171 172 return builder.build()
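The crawler delegates object construction to models.pokemon.Pokemon and
models.pokemon_builder.PokemonBuilder, which are not part of this module.
The following is a minimal, hypothetical sketch of those two classes, inferred
only from the calls in ``_build_pokemon()`` and the ``to_dict()`` usage in the
docstring; the real models may well differ.

    # Hypothetical stand-ins for models.pokemon / models.pokemon_builder,
    # inferred from the builder calls in _build_pokemon(). Illustrative only.
    from __future__ import annotations

    from dataclasses import dataclass, field


    @dataclass
    class Pokemon:
        number: str | None = None
        name: str | None = None
        types: list[str] = field(default_factory=list)
        image: str | None = None
        attributes: dict[str, str] = field(default_factory=dict)

        def to_dict(self) -> dict:
            return {
                "number": self.number,
                "name": self.name,
                "types": self.types,
                "image": self.image,
                **self.attributes,
            }


    class PokemonBuilder:
        def __init__(self) -> None:
            self._p = Pokemon()

        def number(self, value: str) -> "PokemonBuilder":
            self._p.number = value
            return self

        def name(self, value: str) -> "PokemonBuilder":
            self._p.name = value
            return self

        def add_type(self, value: str) -> "PokemonBuilder":
            self._p.types.append(value)
            return self

        def image(self, value: str) -> "PokemonBuilder":
            self._p.image = value
            return self

        def add_attribute(self, key: str, value: str) -> "PokemonBuilder":
            self._p.attributes[key] = value
            return self

        def build(self) -> Pokemon:
            return self._p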
class PokemonFields:
    Label constants ("Nº", "Nome", "Tipo", "Imagem", "Coloração Shiny") used as
    keys for the data extracted from the source tables.
class PokemonCrawler:
    Crawler for a single pokemythology.net page: downloads the HTML and yields
    one Pokemon per table found on the page.
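For illustration, the snippet below runs the parsing stage offline on an inline
HTML fragment shaped like the label/value tables the parser expects. It assumes
the project's models are importable, and it calls the private ``_parse_tables``
directly, which is done here only to show the expected table layout.

    # Illustrative offline parse of a hand-written table fragment.
    from services.pokemon_crawler import PokemonCrawler

    html = """
    <table id="p001">
      <tr>
        <td><img src="/imagens/bulbasaur.png"></td>
        <td>Nº:</td><td>001</td>
      </tr>
      <tr><td>Nome:</td><td>Bulbasaur</td></tr>
      <tr><td>Tipo:</td><td>Grama / Veneno</td></tr>
      <tr><td>Coloração Shiny</td></tr>
      <tr><td><img src="/imagens/bulbasaur_shiny.png"></td></tr>
    </table>
    """

    crawler = PokemonCrawler("offline://example")  # URL only used for logging here
    for pokemon in crawler._parse_tables(html):  # private; for illustration only
        # Expect number, name, and types, plus the main image and the shiny
        # image URL (the latter routed through add_attribute).
        print(pokemon.to_dict())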
@staticmethod
def discover_pages(start_page: str) -> list[str]:
    Starting from the given list page (typically lista01.htm), collects the
    absolute URLs of all /conteudo/pokemon/*.htm links, de-duplicated and sorted.
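Because discover_pages issues a live GET, an offline check needs requests.get
stubbed out. A hedged sketch using the standard library's unittest.mock (the
fake HTML and expected URLs are made up for the example):

    # Exercising discover_pages offline by stubbing requests.get.
    from unittest.mock import MagicMock, patch

    from services.pokemon_crawler import PokemonCrawler

    FAKE_HTML = """
    <a href="/conteudo/pokemon/lista01.htm">1</a>
    <a href="/conteudo/pokemon/lista02.htm">2</a>
    <a href="/outra/pagina.htm">ignored</a>
    """

    fake_resp = MagicMock(text=FAKE_HTML)
    with patch("services.pokemon_crawler.requests.get", return_value=fake_resp):
        urls = PokemonCrawler.discover_pages(
            "https://pokemythology.net/conteudo/pokemon/lista01.htm"
        )

    # Only /conteudo/pokemon/*.htm links survive, absolutized against BASE_URL.
    assert urls == [
        "https://pokemythology.net/conteudo/pokemon/lista01.htm",
        "https://pokemythology.net/conteudo/pokemon/lista02.htm",
    ]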
def fetch_html(self) -> str:
    Downloads the HTML content of self.url with a custom user-agent, decoding
    it as latin1; logs and re-raises URLError/HTTPError on failure.
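Since fetch_html re-raises network errors, callers can layer retries on top
without touching the module. A minimal sketch; the attempt count and backoff
are arbitrary choices, not part of this module:

    # Simple retry wrapper around fetch_html() with linear backoff.
    import time
    from urllib.error import HTTPError, URLError

    from services.pokemon_crawler import PokemonCrawler


    def fetch_with_retries(crawler: PokemonCrawler, attempts: int = 3, delay: float = 2.0) -> str:
        for attempt in range(1, attempts + 1):
            try:
                return crawler.fetch_html()
            except (URLError, HTTPError):
                if attempt == attempts:
                    raise  # give up; fetch_html already logged the error
                time.sleep(delay * attempt)  # back off a little more each time
        raise RuntimeError("unreachable")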