"""Custom cleaners example.

This script shows how to plug custom cleaners into the pipeline.
It is safe to import (pdoc won't execute it), and only runs when executed as a script.

Run:
    python -m examples.custom_cleaners
"""

from intelli3text import PipelineBuilder, Intelli3Config


def main() -> None:
    """Build a pipeline with a custom cleaner chain and process one URL.

    Configures the pipeline with named cleaners, fetches a sample Wikipedia
    page, prints the detected global language, and previews the first cleaned
    paragraph. Handles the empty-result case (fetch failure / blocked URL)
    gracefully instead of raising.
    """
    cfg = Intelli3Config(
        cleaners=["ftfy", "clean_text", "pdf_breaks"],  # extend/replace with your custom cleaner names
        nlp_model_pref="md",
        export=None,  # or {"pdf": {"path": "out.pdf", "include_global_normalized": True}}
    )

    pipeline = PipelineBuilder(cfg).build()
    # Use a robust source. If the URL fails or yields no paragraphs, we handle it below.
    source = "https://pt.wikipedia.org/wiki/Howard_Gardner"
    res = pipeline.process(source)

    print("Global language:", res.get("language_global"))
    paras = res.get("paragraphs", [])

    if not paras:
        print("No paragraphs were extracted. This can happen if the URL failed to fetch or is blocked.")
        return

    # Safe preview
    first = paras[0]
    print("First cleaned paragraph snippet:", (first.get("cleaned") or "")[:200])


if __name__ == "__main__":
    main()