intelli3text
intelli3text public API.
This package provides a high-level, opinionated pipeline for:
- Text ingestion (Web/PDF/DOCX/TXT)
- Cleaning & normalization
- Per-paragraph Language Identification (PT/EN/ES)
- Optional export (e.g., PDF reports)
Typical usage (Python):
from intelli3text import PipelineBuilder, Intelli3Config
cfg = Intelli3Config(
    cleaners=["ftfy", "clean_text", "pdf_breaks"],
    nlp_model_pref="md",
    export={"pdf": {"path": "report.pdf", "include_global_normalized": True}},
)
pipeline = PipelineBuilder(cfg).build()
result = pipeline.process("https://en.wikipedia.org/wiki/NLP")
print(result["language_global"], len(result["paragraphs"]))
Public Re-exports
- Intelli3Config: Configuration dataclass for the pipeline (cleaners, LID, spaCy model size, export, etc.)
- PipelineBuilder: Builder that wires up extractors, cleaners, LID, normalizer, and exporters into a ready-to-use Pipeline
"""
Public API of the intelli3text package.

The package offers a high-level, opinionated pipeline covering:

- Text ingestion (Web/PDF/DOCX/TXT)
- Cleaning & normalization
- Per-paragraph Language Identification (PT/EN/ES)
- Optional export (e.g., PDF reports)

Typical usage (Python):

    from intelli3text import PipelineBuilder, Intelli3Config

    cfg = Intelli3Config(
        cleaners=["ftfy", "clean_text", "pdf_breaks"],
        nlp_model_pref="md",
        export={"pdf": {"path": "report.pdf", "include_global_normalized": True}},
    )
    pipeline = PipelineBuilder(cfg).build()
    result = pipeline.process("https://en.wikipedia.org/wiki/NLP")
    print(result["language_global"], len(result["paragraphs"]))

Public Re-exports
-----------------
- Intelli3Config: Configuration dataclass for the pipeline
  (cleaners, LID, spaCy model size, export, etc.)
- PipelineBuilder: Builder that wires up extractors, cleaners, LID,
  normalizer, and exporters into a ready-to-use Pipeline
"""

from .config import Intelli3Config
from .builder import PipelineBuilder

# Expose the installed distribution version as ``__version__``.
# Three outcomes are possible:
#   * the distribution metadata is found      -> its real version string
#   * the package runs from a source tree     -> "0.0.0.dev0"
#   * anything else goes wrong (import/lookup) -> "0.0.0"
try:  # pragma: no cover - version retrieval is environment-dependent
    from importlib.metadata import PackageNotFoundError, version  # type: ignore
except Exception:  # pragma: no cover - defensive fallback
    __version__ = "0.0.0"
else:  # pragma: no cover
    try:
        __version__ = version("intelli3text")
    except PackageNotFoundError:
        # No installed distribution: running straight from a checkout.
        __version__ = "0.0.0.dev0"
    except Exception:
        # Any other metadata failure gets the same defensive fallback.
        __version__ = "0.0.0"

# Names that make up the public surface of the package.
__all__ = [
    "Intelli3Config",
    "PipelineBuilder",
    "__version__",
]
@dataclass
class
Intelli3Config:
@dataclass
class Intelli3Config:
    """Configuration object for the intelli3text pipeline.

    This dataclass centralizes the user-tunable parameters that control
    ingestion, cleaning, language identification (LID), spaCy model
    preferences, and exporters.

    Attributes:
        cleaners:
            Ordered list of cleaner names to apply (Chain of Responsibility).
            Defaults to ``["ftfy", "ocr_tilde_fix", "pdf_breaks",
            "pt_diacritics_repair", "clean_text", "strip_accents"]``.
        lid_primary:
            Primary language detector strategy. Supported values:
              - ``"fasttext"`` (default): uses fastText LID.
              - ``"cld3"``: optional, requires the extra dependency.
        lid_fallback:
            Optional fallback language detector (e.g., ``"cld3"``). Set to
            ``None`` (default) to disable fallback.
        languages_supported:
            Set of languages expected/considered by the pipeline. Defaults to
            a fresh ``set`` built from ``DEFAULT_LANGS`` (the package docs
            describe PT/EN/ES).
        nlp_model_pref:
            Preferred spaCy model size: ``"lg"`` (default), ``"md"``, or
            ``"sm"``.
        paragraph_min_chars:
            Minimum length (in characters) for a block to be considered a
            paragraph. Default: 30.
        lid_min_chars:
            Minimum character length for a text sample to be used by the LID
            model. Default: 60.
        lid_threshold:
            Confidence threshold for decisions based on LID output.
            Default: 0.65.
        lid_max_chars:
            Default: 2500. Presumably the upper bound on the sample length
            handed to the LID model -- TODO confirm against the LID code.
        lid_use_cld3:
            Default: ``True``. Presumably toggles CLD3 as an additional LID
            signal -- TODO confirm against the LID code.
        lid_cld3_weight:
            Default: 0.4. Presumably the weight given to the CLD3 signal when
            it is combined with the primary detector -- TODO confirm.
        export:
            Optional exporters configuration. For example:
            ``{"pdf": {"path": "report.pdf", "include_global_normalized": True}}``.
            Defaults to ``None``.

    Example:
        >>> cfg = Intelli3Config(
        ...     cleaners=["ftfy", "clean_text", "pdf_breaks"],
        ...     lid_primary="fasttext",
        ...     nlp_model_pref="md",
        ...     export={"pdf": {"path": "out.pdf", "include_global_normalized": True}},
        ... )
    """

    # Cleaning pipeline (ordered)
    cleaners: list[str] = field(default_factory=lambda: [
        "ftfy",
        "ocr_tilde_fix",
        "pdf_breaks",
        "pt_diacritics_repair",
        "clean_text",
        "strip_accents",
    ])

    # Language identification strategy
    lid_primary: str = "fasttext"        # "fasttext" | "cld3"
    lid_fallback: Optional[str] = None   # "cld3" | None

    # Supported/expected languages for reporting and heuristics
    languages_supported: Set[str] = field(default_factory=lambda: set(DEFAULT_LANGS))

    # spaCy model preference
    nlp_model_pref: str = "lg"  # "lg" | "md" | "sm"

    # Paragraph processing and LID sampling thresholds.
    # NOTE: ``lid_threshold`` used to be declared twice (0.60, then 0.65); the
    # later value was the effective default. It is now declared once, in its
    # original position, so positional construction is unaffected.
    paragraph_min_chars: int = 30
    lid_min_chars: int = 60
    lid_threshold: float = 0.65
    lid_max_chars: int = 2500
    lid_use_cld3: bool = True
    lid_cld3_weight: float = 0.4

    # Exporters configuration (e.g., PDF)
    export: Optional[Dict[str, Any]] = None
Configuration object for the intelli3text pipeline.
This dataclass centralizes all user-tunable parameters that control ingestion, cleaning, language identification (LID), spaCy model preferences, and exporters.
Attributes:
- cleaners: Ordered list of cleaner names to apply (Chain of Responsibility). Defaults to ["ftfy", "ocr_tilde_fix", "pdf_breaks", "pt_diacritics_repair", "clean_text", "strip_accents"].
- lid_primary: Primary language detector strategy. Supported values:
  - "fasttext" (default): uses fastText LID (auto-downloads lid.176.bin).
  - "cld3": optional, requires the extra dependency.
- lid_fallback: Optional fallback language detector (e.g., "cld3"). Set to None (default) to disable fallback.
- languages_supported: Set of languages expected/considered by the pipeline. This does not hard-filter LID output but is used by downstream heuristics/presenters. Defaults to {"pt", "en", "es"}.
- nlp_model_pref: Preferred spaCy model size: "lg" (default), "md", or "sm". The system falls back md→sm→blank if the requested size isn't available.
- paragraph_min_chars: Minimum length (in characters) for a block to be considered a paragraph after PDF/HTML heuristics. Helps filter headers/footers. Default: 30.
- lid_min_chars: Minimum character length for the cleaned text sample to be used by the LID model (shorter ones may yield unreliable predictions). Default: 60.
- lid_threshold: Reserved threshold for downstream decisions based on LID confidence. Not strictly enforced by detectors; consumers may use it to flag low confidence. Default: 0.65.
- export: Optional exporters configuration. For example: {"pdf": {"path": "report.pdf", "include_global_normalized": True}}. If None (default), no exporters are invoked.
Example:
>>> cfg = Intelli3Config(
...     cleaners=["ftfy", "clean_text", "pdf_breaks"],
...     lid_primary="fasttext",
...     nlp_model_pref="md",
...     export={"pdf": {"path": "out.pdf", "include_global_normalized": True}},
... )
Intelli3Config( cleaners: list[str] = <factory>, lid_primary: str = 'fasttext', lid_fallback: Optional[str] = None, languages_supported: Set[str] = <factory>, nlp_model_pref: str = 'lg', paragraph_min_chars: int = 30, lid_min_chars: int = 60, lid_threshold: float = 0.65, lid_max_chars: int = 2500, lid_use_cld3: bool = True, lid_cld3_weight: float = 0.4, export: Optional[Dict[str, Any]] = None)
class
PipelineBuilder:
class PipelineBuilder:
    """Fluent builder that assembles a :class:`Pipeline`.

    The constructor wires a default set of strategy components (extractors,
    cleaners, LID, normalizer, exporters) from an :class:`Intelli3Config`.
    The ``with_*`` methods replace individual parts and return the builder,
    so calls can be chained before :meth:`build`.

    Defaults set by the constructor:
        - Extractors: Web/PDF/DOCX/TXT
        - Cleaners: names from ``cfg.cleaners`` (via :class:`CleanerChain`)
        - LID primary: :class:`FastTextLID`
        - LID fallback: ``None``
        - Normalizer: :class:`SpacyNormalizer` built from the config
        - Exporters: PDF, when ``cfg.export['pdf']`` has a truthy ``path``

    Note:
        The constructor does not read ``cfg.lid_primary`` or
        ``cfg.lid_fallback``; call :meth:`with_lid` to select CLD3.

    Example:
        >>> builder = PipelineBuilder(Intelli3Config())
        >>> pipeline = builder.with_lid(primary="fasttext").build()
        >>> result = pipeline.process("https://example.com")
    """

    def __init__(self, cfg: Optional[Intelli3Config] = None) -> None:
        """Initialise the builder with default components.

        Args:
            cfg: Pipeline configuration; a default :class:`Intelli3Config`
                is created when a falsy value is given.
        """
        self.cfg: Intelli3Config = cfg if cfg else Intelli3Config()

        # Extractors (Strategy), keyed by source kind.
        self._extractors = dict(
            web=WebExtractor(),
            pdf=PDFExtractor(),
            docx=DocxExtractor(),
            text=TextExtractor(),
        )

        # Cleaners (Chain of Responsibility).
        self._cleaners: CleanerChain = CleanerChain.from_names(self.cfg.cleaners)

        # Language ID (Strategy): fastText by default; CLD3 is optional.
        self._lid_primary = FastTextLID()
        self._lid_fallback = None  # configurable through with_lid()

        # Normalizer (Strategy).
        self._normalizer = SpacyNormalizer(self.cfg)

        # Exporters (Strategy): only a PDF exporter is wired from the config,
        # and only when its options carry a truthy "path".
        self._exporters: Dict[str, Any] = {}
        export_cfg = self.cfg.export
        if export_cfg and "pdf" in export_cfg:
            pdf_options = export_cfg["pdf"]
            if pdf_options.get("path"):
                self._exporters["pdf"] = PDFExporter(**pdf_options)

    # -----------------------
    # Fluent customization API
    # -----------------------

    def with_cleaners(self, names: List[str]) -> "PipelineBuilder":
        """Replace the cleaner chain with one built from ``names``."""
        replacement = CleanerChain.from_names(names)
        self._cleaners = replacement
        return self

    def with_lid(self, primary: str, fallback: Optional[str] = None) -> "PipelineBuilder":
        """Select the language identification strategies.

        Args:
            primary: ``"cld3"`` requests CLD3 (requires the optional extra);
                any other value selects fastText.
            fallback: ``"cld3"`` requests a CLD3 fallback; any other value
                disables the fallback.

        Returns:
            This builder, for chaining.
        """

        def cld3_or_none() -> Any:
            # Best effort: any failure importing or constructing CLD3 yields None.
            try:
                from .lid.cld3_lid import CLD3LID
                return CLD3LID(self.cfg)
            except Exception:
                return None

        chosen = cld3_or_none() if primary == "cld3" else None
        # When CLD3 is unavailable (or was not requested), stay on fastText.
        self._lid_primary = FastTextLID() if chosen is None else chosen

        self._lid_fallback = cld3_or_none() if fallback == "cld3" else None
        return self

    def with_exporter(self, name: str, **kwargs: Any) -> "PipelineBuilder":
        """Add or override an exporter by name.

        Only ``"pdf"`` is recognised; any other name leaves the exporters
        untouched. ``kwargs`` are forwarded to the exporter constructor.
        """
        if name != "pdf":
            return self
        from .export.pdf_reportlab import PDFExporter as _PDFExporter
        self._exporters["pdf"] = _PDFExporter(**kwargs)
        return self

    # -------------
    # Finalization
    # -------------

    def build(self) -> Pipeline:
        """Create a :class:`Pipeline` from the current configuration and strategies."""
        components = {
            "cfg": self.cfg,
            "extractors": self._extractors,
            "cleaners": self._cleaners,
            "lid_primary": self._lid_primary,
            "lid_fallback": self._lid_fallback,
            "normalizer": self._normalizer,
            "exporters": self._exporters,
        }
        return Pipeline(**components)
Builder for the high-level processing Pipeline.
Wires strategy components (extractors, cleaners, LID, normalizer, exporters)
according to an Intelli3Config. Fluent API allows overriding parts
before build().
Defaults:
- Extractors: Web/PDF/DOCX/TXT
- Cleaners: names from cfg.cleaners (via CleanerChain)
- LID primary: FastTextLID (auto-download of lid.176.ftz)
- LID fallback: None (optional CLD3 if installed)
- Normalizer: SpacyNormalizer (size per config, with fallbacks)
- Exporters: PDF when configured in cfg.export['pdf'] with a valid path
Example:
>>> builder = PipelineBuilder(Intelli3Config())
>>> pipeline = builder.with_lid(primary="fasttext").build()
>>> result = pipeline.process("https://example.com")
PipelineBuilder(cfg: Optional[Intelli3Config] = None)
def __init__(self, cfg: Optional[Intelli3Config] = None) -> None:
    """Initialise the builder with default components.

    Args:
        cfg: Pipeline configuration; a default :class:`Intelli3Config`
            is created when a falsy value is given.
    """
    self.cfg: Intelli3Config = cfg if cfg else Intelli3Config()

    # Extractors (Strategy), keyed by source kind.
    self._extractors = dict(
        web=WebExtractor(),
        pdf=PDFExtractor(),
        docx=DocxExtractor(),
        text=TextExtractor(),
    )

    # Cleaners (Chain of Responsibility).
    self._cleaners: CleanerChain = CleanerChain.from_names(self.cfg.cleaners)

    # Language ID (Strategy): fastText by default; CLD3 is optional.
    self._lid_primary = FastTextLID()
    self._lid_fallback = None  # configurable through with_lid()

    # Normalizer (Strategy).
    self._normalizer = SpacyNormalizer(self.cfg)

    # Exporters (Strategy): only a PDF exporter is wired from the config,
    # and only when its options carry a truthy "path".
    self._exporters: Dict[str, Any] = {}
    export_cfg = self.cfg.export
    if export_cfg and "pdf" in export_cfg:
        pdf_options = export_cfg["pdf"]
        if pdf_options.get("path"):
            self._exporters["pdf"] = PDFExporter(**pdf_options)
cfg: Intelli3Config
def with_cleaners(self, names: List[str]) -> "PipelineBuilder":
    """Replace the cleaner chain with one built from ``names``.

    Returns:
        This builder, for chaining.
    """
    replacement = CleanerChain.from_names(names)
    self._cleaners = replacement
    return self
Override the cleaners chain by names.
def with_lid(self, primary: str, fallback: Optional[str] = None) -> "PipelineBuilder":
    """Select the language identification strategies.

    Args:
        primary: ``"cld3"`` requests CLD3 (requires the optional extra);
            any other value selects fastText.
        fallback: ``"cld3"`` requests a CLD3 fallback; any other value
            disables the fallback.

    Returns:
        This builder, for chaining.
    """

    def cld3_or_none():
        # Best effort: any failure importing or constructing CLD3 yields None.
        try:
            from .lid.cld3_lid import CLD3LID
            return CLD3LID(self.cfg)
        except Exception:
            return None

    chosen = cld3_or_none() if primary == "cld3" else None
    # When CLD3 is unavailable (or was not requested), stay on fastText.
    self._lid_primary = FastTextLID() if chosen is None else chosen

    self._lid_fallback = cld3_or_none() if fallback == "cld3" else None
    return self
Configure language identification strategies.
Arguments:
- primary: "fasttext" (default) or "cld3" (requires optional extra).
- fallback: optional fallback ("cld3" or None).
def with_exporter(self, name: str, **kwargs: Any) -> "PipelineBuilder":
    """Add or override an exporter by name.

    Only ``"pdf"`` is recognised; any other name leaves the exporters
    untouched. ``kwargs`` are forwarded to the exporter constructor.

    Returns:
        This builder, for chaining.
    """
    if name != "pdf":
        return self
    from .export.pdf_reportlab import PDFExporter as _PDFExporter
    self._exporters["pdf"] = _PDFExporter(**kwargs)
    return self
Add or override an exporter by name.
def
build(self) -> intelli3text.pipeline.Pipeline:
def build(self) -> Pipeline:
    """Create a :class:`Pipeline` from the current configuration and strategies."""
    components = {
        "cfg": self.cfg,
        "extractors": self._extractors,
        "cleaners": self._cleaners,
        "lid_primary": self._lid_primary,
        "lid_fallback": self._lid_fallback,
        "normalizer": self._normalizer,
        "exporters": self._exporters,
    }
    return Pipeline(**components)
Create a Pipeline instance with the current configuration and strategies.
__version__ =
'0.2.6'