"""Custom cleaners example.

This script shows how to plug custom cleaners into the pipeline.
It is safe to import (pdoc won't execute it), and only runs when executed as a script.

Run:
    python -m examples.custom_cleaners
"""
 9
10from intelli3text import PipelineBuilder, Intelli3Config
11
def main() -> None:
    """Run the custom-cleaners example pipeline end to end.

    Builds a pipeline from an ``Intelli3Config`` with a custom cleaner list,
    processes a web source, and prints the detected global language plus a
    snippet of the first cleaned paragraph. Prints a message and returns
    early if no paragraphs could be extracted.
    """
    cfg = Intelli3Config(
        cleaners=["ftfy", "clean_text", "pdf_breaks"],  # extend/replace with your custom cleaner names
        nlp_model_pref="md",
        export=None,  # or {"pdf": {"path": "out.pdf", "include_global_normalized": True}}
    )

    pipeline = PipelineBuilder(cfg).build()
    # Use a robust source. If the URL fails or yields no paragraphs, we handle it below.
    source = "https://pt.wikipedia.org/wiki/Howard_Gardner"
    res = pipeline.process(source)

    print("Global language:", res.get("language_global"))
    paras = res.get("paragraphs", [])

    if not paras:
        print("No paragraphs were extracted. This can happen if the URL failed to fetch or is blocked.")
        return

    # Safe preview: guard against a missing/None "cleaned" field before slicing.
    first = paras[0]
    print("First cleaned paragraph snippet:", (first.get("cleaned") or "")[:200])


if __name__ == "__main__":
    main()
# NOTE(review): everything below is a corrupted, truncated duplicate of the
# main() defined earlier in this file (it re-defines and shadows that function).
# Reconstructed here as a clean, behaviorally equivalent definition so the file
# stays valid; the whole duplicate should simply be deleted in a follow-up.
def main() -> None:
    """Run the custom-cleaners example pipeline end to end (duplicate definition).

    Builds a pipeline from an ``Intelli3Config`` with a custom cleaner list,
    processes a web source, and prints the detected global language plus a
    snippet of the first cleaned paragraph. Prints a message and returns
    early if no paragraphs could be extracted.
    """
    cfg = Intelli3Config(
        cleaners=["ftfy", "clean_text", "pdf_breaks"],  # extend/replace with your custom cleaner names
        nlp_model_pref="md",
        export=None,  # or {"pdf": {"path": "out.pdf", "include_global_normalized": True}}
    )

    pipeline = PipelineBuilder(cfg).build()
    # Use a robust source. If the URL fails or yields no paragraphs, we handle it below.
    source = "https://pt.wikipedia.org/wiki/Howard_Gardner"
    res = pipeline.process(source)

    print("Global language:", res.get("language_global"))
    paras = res.get("paragraphs", [])

    if not paras:
        print("No paragraphs were extracted. This can happen if the URL failed to fetch or is blocked.")
        return

    # Safe preview: guard against a missing/None "cleaned" field before slicing.
    first = paras[0]
    print("First cleaned paragraph snippet:", (first.get("cleaned") or "")[:200])