intelli3text

intelli3text public API.

This package provides a high-level, opinionated pipeline for:

  • Text ingestion (Web/PDF/DOCX/TXT)
  • Cleaning & normalization
  • Per-paragraph Language Identification (PT/EN/ES)
  • Optional export (e.g., PDF reports)

Typical usage (Python):

from intelli3text import PipelineBuilder, Intelli3Config

cfg = Intelli3Config(
    cleaners=["ftfy", "clean_text", "pdf_breaks"],
    nlp_model_pref="md",
    export={"pdf": {"path": "report.pdf", "include_global_normalized": True}},
)
pipeline = PipelineBuilder(cfg).build()
result = pipeline.process("https://en.wikipedia.org/wiki/NLP")
print(result["language_global"], len(result["paragraphs"]))

Public Re-exports

  • Intelli3Config: Configuration dataclass for the pipeline (cleaners, LID, spaCy model size, export, etc.)
  • PipelineBuilder: Builder that wires up extractors, cleaners, LID, normalizer, and exporters into a ready-to-use Pipeline
 1"""
 2intelli3text public API.
 3
 4This package provides a high-level, opinionated pipeline for:
 5- Text ingestion (Web/PDF/DOCX/TXT)
 6- Cleaning & normalization
 7- Per-paragraph Language Identification (PT/EN/ES)
 8- Optional export (e.g., PDF reports)
 9
10Typical usage (Python):
11
12    from intelli3text import PipelineBuilder, Intelli3Config
13
14    cfg = Intelli3Config(
15        cleaners=["ftfy", "clean_text", "pdf_breaks"],
16        nlp_model_pref="md",
17        export={"pdf": {"path": "report.pdf", "include_global_normalized": True}},
18    )
19    pipeline = PipelineBuilder(cfg).build()
20    result = pipeline.process("https://en.wikipedia.org/wiki/NLP")
21    print(result["language_global"], len(result["paragraphs"]))
22
23Public Re-exports
24-----------------
25- Intelli3Config: Configuration dataclass for the pipeline (cleaners, LID, spaCy model size, export, etc.)
26- PipelineBuilder: Builder that wires up extractors, cleaners, LID, normalizer, and exporters into a ready-to-use Pipeline
27"""
28
29from .config import Intelli3Config
30from .builder import PipelineBuilder
31
# Package version, exposed for documentation tools and for users who want to
# introspect what is installed. ``importlib.metadata`` is in the standard
# library from Python 3.8 onwards.
try:  # pragma: no cover - version retrieval is environment-dependent
    from importlib.metadata import PackageNotFoundError, version  # type: ignore
except Exception:  # defensive fallback when the metadata API is unavailable
    __version__ = "0.0.0"
else:
    try:
        __version__ = version("intelli3text")
    except PackageNotFoundError:
        # Running from a source tree without an installed distribution.
        __version__ = "0.0.0.dev0"
    except Exception:  # any other lookup failure gets the defensive fallback
        __version__ = "0.0.0"
43
# Explicitly define the public surface of the package: these are the only
# names exported by ``from intelli3text import *``.
__all__ = [
    "Intelli3Config",
    "PipelineBuilder",
    "__version__",
]
@dataclass
class Intelli3Config:
@dataclass
class Intelli3Config:
    """Configuration object for the intelli3text pipeline.

    This dataclass centralizes all user-tunable parameters that control
    ingestion, cleaning, language identification (LID), spaCy model
    preferences, and exporters.

    Attributes:
        cleaners:
            Ordered list of cleaner names to apply (Chain of Responsibility).
            Defaults to ``["ftfy", "ocr_tilde_fix", "pdf_breaks",
            "pt_diacritics_repair", "clean_text", "strip_accents"]``.
        lid_primary:
            Primary language detector strategy. Supported values:
            - ``"fasttext"`` (default): uses the fastText LID model.
            - ``"cld3"``: optional, requires the extra dependency.
        lid_fallback:
            Optional fallback language detector (e.g., ``"cld3"``). Set to
            ``None`` (default) to disable fallback.
        languages_supported:
            Set of languages expected/considered by the pipeline. This does not
            hard-filter LID output but is used by downstream heuristics/presenters.
            Defaults to a fresh ``set`` built from ``DEFAULT_LANGS``.
        nlp_model_pref:
            Preferred spaCy model size: ``"lg"`` (default), ``"md"``, or ``"sm"``.
            The system falls back md→sm→blank if the requested size isn't available.
        paragraph_min_chars:
            Minimum length (in characters) for a block to be considered a paragraph
            after PDF/HTML heuristics. Helps filter headers/footers. Default: 30.
        lid_min_chars:
            Minimum character length for the cleaned text sample to be used by the
            LID model (shorter ones may yield unreliable predictions). Default: 60.
        lid_threshold:
            Reserved threshold for downstream decisions based on LID confidence.
            Not strictly enforced by detectors; consumers may use it to flag low
            confidence. Default: 0.65.
        lid_max_chars:
            Upper character bound related to LID sampling. Default: 2500.
            NOTE(review): presumably the longest sample handed to the detector;
            confirm against the LID implementation.
        lid_use_cld3:
            Boolean switch related to CLD3 usage during LID. Default: ``True``.
            NOTE(review): exact effect is not visible here; confirm with the
            component that reads it.
        lid_cld3_weight:
            Weight associated with CLD3 during LID. Default: 0.4.
            NOTE(review): presumably the CLD3 share when detector scores are
            combined; confirm with the component that reads it.
        export:
            Optional exporters configuration. For example:
            ``{"pdf": {"path": "report.pdf", "include_global_normalized": True}}``.
            If ``None`` (default), no exporters are invoked.

    Example:
        >>> cfg = Intelli3Config(
        ...     cleaners=["ftfy", "clean_text", "pdf_breaks"],
        ...     lid_primary="fasttext",
        ...     nlp_model_pref="md",
        ...     export={"pdf": {"path": "out.pdf", "include_global_normalized": True}},
        ... )
    """

    # Cleaning pipeline (ordered)
    cleaners: list[str] = field(default_factory=lambda: [
        "ftfy",
        "ocr_tilde_fix",
        "pdf_breaks",
        "pt_diacritics_repair",
        "clean_text",
        "strip_accents",
    ])

    # Language identification strategy
    lid_primary: str = "fasttext"                 # "fasttext" | "cld3"
    lid_fallback: Optional[str] = None            # "cld3" | None

    # Supported/expected languages for reporting and heuristics
    languages_supported: Set[str] = field(default_factory=lambda: set(DEFAULT_LANGS))

    # spaCy model preference: large → medium → small (with blank fallback)
    nlp_model_pref: str = "lg"                    # "lg" | "md" | "sm"

    # Paragraph processing and LID sampling thresholds.
    # ``lid_threshold`` used to be declared twice (0.60, then 0.65). The later
    # declaration silently won, so 0.65 is kept as the single effective default
    # and the field stays in its original position for positional callers.
    paragraph_min_chars: int = 30
    lid_min_chars: int = 60
    lid_threshold: float = 0.65
    lid_max_chars: int = 2500
    lid_use_cld3: bool = True
    lid_cld3_weight: float = 0.4

    # Exporters configuration (e.g., PDF)
    export: Optional[Dict[str, Any]] = None

Configuration object for the intelli3text pipeline.

This dataclass centralizes all user-tunable parameters that control ingestion, cleaning, language identification (LID), spaCy model preferences, and exporters.

Attributes:
  • cleaners: Ordered list of cleaner names to apply (Chain of Responsibility). Defaults to ["ftfy", "ocr_tilde_fix", "pdf_breaks", "pt_diacritics_repair", "clean_text", "strip_accents"].
  • lid_primary: Primary language detector strategy. Supported values:
    • "fasttext" (default): uses fastText LID (auto-downloads lid.176.bin).
    • "cld3": optional, requires the extra dependency.
  • lid_fallback: Optional fallback language detector (e.g., "cld3"). Set to None (default) to disable fallback.
  • languages_supported: Set of languages expected/considered by the pipeline. This does not hard-filter LID output but is used by downstream heuristics/presenters. Defaults to {"pt", "en", "es"}.
  • nlp_model_pref: Preferred spaCy model size: "lg" (default), "md", or "sm". The system falls back md→sm→blank if the requested size isn't available.
  • paragraph_min_chars: Minimum length (in characters) for a block to be considered a paragraph after PDF/HTML heuristics. Helps filter headers/footers. Default: 30.
  • lid_min_chars: Minimum character length for the cleaned text sample to be used by the LID model (shorter ones may yield unreliable predictions). Default: 60.
  • lid_threshold: Reserved threshold for downstream decisions based on LID confidence. Not strictly enforced by detectors; consumers may use it to flag low confidence. Default: 0.65.
  • export: Optional exporters configuration. For example: {"pdf": {"path": "report.pdf", "include_global_normalized": True}}. If None (default), no exporters are invoked.
Example:
>>> cfg = Intelli3Config(
...     cleaners=["ftfy", "clean_text", "pdf_breaks"],
...     lid_primary="fasttext",
...     nlp_model_pref="md",
...     export={"pdf": {"path": "out.pdf", "include_global_normalized": True}},
... )
Intelli3Config( cleaners: list[str] = <factory>, lid_primary: str = 'fasttext', lid_fallback: Optional[str] = None, languages_supported: Set[str] = <factory>, nlp_model_pref: str = 'lg', paragraph_min_chars: int = 30, lid_min_chars: int = 60, lid_threshold: float = 0.65, lid_max_chars: int = 2500, lid_use_cld3: bool = True, lid_cld3_weight: float = 0.4, export: Optional[Dict[str, Any]] = None)
cleaners: list[str]
lid_primary: str = 'fasttext'
lid_fallback: Optional[str] = None
languages_supported: Set[str]
nlp_model_pref: str = 'lg'
paragraph_min_chars: int = 30
lid_min_chars: int = 60
lid_threshold: float = 0.65
lid_max_chars: int = 2500
lid_use_cld3: bool = True
lid_cld3_weight: float = 0.4
export: Optional[Dict[str, Any]] = None
class PipelineBuilder:
 18class PipelineBuilder:
 19    """Builder for the high-level processing :class:`Pipeline`.
 20
 21    Wires strategy components (extractors, cleaners, LID, normalizer, exporters)
 22    according to an :class:`Intelli3Config`. Fluent API allows overriding parts
 23    before :meth:`build`.
 24
 25    Defaults:
 26        - Extractors: Web/PDF/DOCX/TXT
 27        - Cleaners: names from ``cfg.cleaners`` (via :class:`CleanerChain`)
 28        - LID primary: :class:`FastTextLID` (auto-download of ``lid.176.ftz``)
 29        - LID fallback: ``None`` (optional CLD3 if installed)
 30        - Normalizer: :class:`SpacyNormalizer` (size per config, with fallbacks)
 31        - Exporters: PDF when configured in ``cfg.export['pdf']`` with a valid ``path``
 32
 33    Example:
 34        >>> builder = PipelineBuilder(Intelli3Config())
 35        >>> pipeline = builder.with_lid(primary="fasttext").build()
 36        >>> result = pipeline.process("https://example.com")
 37    """
 38
 39    def __init__(self, cfg: Optional[Intelli3Config] = None) -> None:
 40        self.cfg: Intelli3Config = cfg or Intelli3Config()
 41
 42        # Extractors (Strategy)
 43        self._extractors = {
 44            "web": WebExtractor(),
 45            "pdf": PDFExtractor(),
 46            "docx": DocxExtractor(),
 47            "text": TextExtractor(),
 48        }
 49
 50        # Cleaners (Chain of Responsibility)
 51        self._cleaners: CleanerChain = CleanerChain.from_names(self.cfg.cleaners)
 52
 53        # Language ID (Strategy): fastText por padrão; CLD3 é opcional
 54        self._lid_primary = FastTextLID()
 55        self._lid_fallback = None  # configurável via with_lid()
 56
 57        # Normalizer (Strategy)
 58        self._normalizer = SpacyNormalizer(self.cfg)
 59
 60        # Exporters (Strategy)
 61        self._exporters: Dict[str, Any] = {}
 62        if self.cfg.export and "pdf" in self.cfg.export and self.cfg.export["pdf"].get("path"):
 63            self._exporters["pdf"] = PDFExporter(**self.cfg.export["pdf"])
 64
 65    # -----------------------
 66    # Fluent customization API
 67    # -----------------------
 68
 69    def with_cleaners(self, names: List[str]) -> "PipelineBuilder":
 70        """Override the cleaners chain by names."""
 71        self._cleaners = CleanerChain.from_names(names)
 72        return self
 73
 74    def with_lid(self, primary: str, fallback: Optional[str] = None) -> "PipelineBuilder":
 75        """Configure language identification strategies.
 76
 77        Args:
 78            primary: "fasttext" (default) or "cld3" (requires optional extra).
 79            fallback: optional fallback ("cld3" or None).
 80        """
 81        if primary == "cld3":
 82            try:
 83                from .lid.cld3_lid import CLD3LID
 84                self._lid_primary = CLD3LID(self.cfg)
 85            except Exception:
 86                # Se CLD3 não estiver disponível, fica no fastText
 87                self._lid_primary = FastTextLID()
 88        else:
 89            self._lid_primary = FastTextLID()
 90
 91        if fallback == "cld3":
 92            try:
 93                from .lid.cld3_lid import CLD3LID
 94                self._lid_fallback = CLD3LID(self.cfg)
 95            except Exception:
 96                self._lid_fallback = None
 97        else:
 98            self._lid_fallback = None
 99
100        return self
101
102    def with_exporter(self, name: str, **kwargs: Any) -> "PipelineBuilder":
103        """Add or override an exporter by name."""
104        if name == "pdf":
105            from .export.pdf_reportlab import PDFExporter as _PDFExporter
106            self._exporters["pdf"] = _PDFExporter(**kwargs)
107        return self
108
109    # -------------
110    # Finalization
111    # -------------
112
113    def build(self) -> Pipeline:
114        """Create a :class:`Pipeline` instance with the current configuration and strategies."""
115        return Pipeline(
116            cfg=self.cfg,
117            extractors=self._extractors,
118            cleaners=self._cleaners,
119            lid_primary=self._lid_primary,
120            lid_fallback=self._lid_fallback,
121            normalizer=self._normalizer,
122            exporters=self._exporters,
123        )

Builder for the high-level processing Pipeline.

Wires strategy components (extractors, cleaners, LID, normalizer, exporters) according to an Intelli3Config. Fluent API allows overriding parts before build().

Defaults:
  • Extractors: Web/PDF/DOCX/TXT
  • Cleaners: names from cfg.cleaners (via CleanerChain)
  • LID primary: FastTextLID (auto-download of lid.176.ftz)
  • LID fallback: None (optional CLD3 if installed)
  • Normalizer: SpacyNormalizer (size per config, with fallbacks)
  • Exporters: PDF when configured in cfg.export['pdf'] with a valid path
Example:
>>> builder = PipelineBuilder(Intelli3Config())
>>> pipeline = builder.with_lid(primary="fasttext").build()
>>> result = pipeline.process("https://example.com")
PipelineBuilder(cfg: Optional[Intelli3Config] = None)
39    def __init__(self, cfg: Optional[Intelli3Config] = None) -> None:
40        self.cfg: Intelli3Config = cfg or Intelli3Config()
41
42        # Extractors (Strategy)
43        self._extractors = {
44            "web": WebExtractor(),
45            "pdf": PDFExtractor(),
46            "docx": DocxExtractor(),
47            "text": TextExtractor(),
48        }
49
50        # Cleaners (Chain of Responsibility)
51        self._cleaners: CleanerChain = CleanerChain.from_names(self.cfg.cleaners)
52
53        # Language ID (Strategy): fastText por padrão; CLD3 é opcional
54        self._lid_primary = FastTextLID()
55        self._lid_fallback = None  # configurável via with_lid()
56
57        # Normalizer (Strategy)
58        self._normalizer = SpacyNormalizer(self.cfg)
59
60        # Exporters (Strategy)
61        self._exporters: Dict[str, Any] = {}
62        if self.cfg.export and "pdf" in self.cfg.export and self.cfg.export["pdf"].get("path"):
63            self._exporters["pdf"] = PDFExporter(**self.cfg.export["pdf"])
def with_cleaners(self, names: List[str]) -> PipelineBuilder:
69    def with_cleaners(self, names: List[str]) -> "PipelineBuilder":
70        """Override the cleaners chain by names."""
71        self._cleaners = CleanerChain.from_names(names)
72        return self

Override the cleaners chain by names.

def with_lid( self, primary: str, fallback: Optional[str] = None) -> PipelineBuilder:
 74    def with_lid(self, primary: str, fallback: Optional[str] = None) -> "PipelineBuilder":
 75        """Configure language identification strategies.
 76
 77        Args:
 78            primary: "fasttext" (default) or "cld3" (requires optional extra).
 79            fallback: optional fallback ("cld3" or None).
 80        """
 81        if primary == "cld3":
 82            try:
 83                from .lid.cld3_lid import CLD3LID
 84                self._lid_primary = CLD3LID(self.cfg)
 85            except Exception:
 86                # Se CLD3 não estiver disponível, fica no fastText
 87                self._lid_primary = FastTextLID()
 88        else:
 89            self._lid_primary = FastTextLID()
 90
 91        if fallback == "cld3":
 92            try:
 93                from .lid.cld3_lid import CLD3LID
 94                self._lid_fallback = CLD3LID(self.cfg)
 95            except Exception:
 96                self._lid_fallback = None
 97        else:
 98            self._lid_fallback = None
 99
100        return self

Configure language identification strategies.

Arguments:
  • primary: "fasttext" (default) or "cld3" (requires optional extra).
  • fallback: optional fallback ("cld3" or None).
def with_exporter(self, name: str, **kwargs: Any) -> PipelineBuilder:
102    def with_exporter(self, name: str, **kwargs: Any) -> "PipelineBuilder":
103        """Add or override an exporter by name."""
104        if name == "pdf":
105            from .export.pdf_reportlab import PDFExporter as _PDFExporter
106            self._exporters["pdf"] = _PDFExporter(**kwargs)
107        return self

Add or override an exporter by name.

def build(self) -> intelli3text.pipeline.Pipeline:
113    def build(self) -> Pipeline:
114        """Create a :class:`Pipeline` instance with the current configuration and strategies."""
115        return Pipeline(
116            cfg=self.cfg,
117            extractors=self._extractors,
118            cleaners=self._cleaners,
119            lid_primary=self._lid_primary,
120            lid_fallback=self._lid_fallback,
121            normalizer=self._normalizer,
122            exporters=self._exporters,
123        )

Create a Pipeline instance with the current configuration and strategies.

__version__ = '0.2.6'