# Source code for urduhack.preprocessing.character

# coding: utf8
"""
Urduhack Character preprocess functions
"""

from .regexes import _SPACE_AFTER_ALL_PUNCTUATIONS_RE, _SPACE_BEFORE_ALL_PUNCTUATIONS_RE
from .regexes import _SPACE_AFTER_DIGITS_RE, _SPACE_BEFORE_DIGITS_RE
from .regexes import _SPACE_BEFORE_ENG_CHAR_RE, _SPACE_AFTER_ENG_CHAR_RE


def digits_space(text: str) -> str:
    """
    Put spaces before and after numeric and Urdu digits.

    Each match of ``_SPACE_BEFORE_DIGITS_RE`` and then each match of
    ``_SPACE_AFTER_DIGITS_RE`` is replaced with a single space. Both
    patterns are imported from the sibling ``regexes`` module, which decides
    exactly where a space is inserted.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after both substitutions have been applied.

    Examples:
        >>> from urduhack.preprocessing import digits_space
        >>> text = "20فیصد"
        >>> normalized_text = digits_space(text)
        >>> normalized_text
        20 فیصد
    """
    # Order matters: the "before" pattern runs first, then the "after"
    # pattern runs on the already-modified text.
    for pattern in (_SPACE_BEFORE_DIGITS_RE, _SPACE_AFTER_DIGITS_RE):
        text = pattern.sub(' ', text)
    return text
def english_characters_space(text: str) -> str:
    """
    Separate English words from the surrounding Urdu text with spaces.

    Each match of ``_SPACE_BEFORE_ENG_CHAR_RE`` is replaced with a single
    space, and the same is then done for ``_SPACE_AFTER_ENG_CHAR_RE`` on the
    result. Both patterns are imported from the sibling ``regexes`` module.
    This is one of the normalization steps for Urdu data.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after both substitutions have been applied.

    Examples:
        >>> from urduhack.preprocessing import english_characters_space
        >>> text = "خاتون Aliyaنے بچوںUzma and Aliyaکے قتل کا اعترافConfession کیا ہے۔"
        >>> normalized_text = english_characters_space(text)
        >>> normalized_text
        خاتون Aliya نے بچوں Uzma and Aliya کے قتل کا اعتراف Confession کیا ہے۔
    """
    # First pass handles the boundary in front of English characters ...
    spaced_before = _SPACE_BEFORE_ENG_CHAR_RE.sub(' ', text)
    # ... second pass handles the boundary behind them.
    return _SPACE_AFTER_ENG_CHAR_RE.sub(' ', spaced_before)
def all_punctuations_space(text: str) -> str:
    """
    Normalize spacing around punctuation used in ``urdu`` writing.

    Two substitutions are applied in sequence: every match of
    ``_SPACE_BEFORE_ALL_PUNCTUATIONS_RE`` is replaced with a single space,
    then every match of ``_SPACE_AFTER_ALL_PUNCTUATIONS_RE`` in the result is
    replaced with a single space. Which punctuation marks are covered, and
    where exactly the space lands, is determined by those patterns in the
    sibling ``regexes`` module.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after both substitutions have been applied.
    """
    # The "after" substitution is applied to the output of the "before"
    # substitution, same order as the two names suggest.
    return _SPACE_AFTER_ALL_PUNCTUATIONS_RE.sub(
        ' ',
        _SPACE_BEFORE_ALL_PUNCTUATIONS_RE.sub(' ', text),
    )
def preprocess(text: str) -> str:
    """
    Run every spacing step of this module over ``text``.

    The input is passed, in order, through :func:`digits_space`,
    :func:`all_punctuations_space` and :func:`english_characters_space`;
    each step receives the previous step's output. This function only
    adjusts spacing -- it performs no other transformation of the text
    itself.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after all three spacing steps.

    Raises:
        TypeError: If ``text`` is not a ``str``.

    Examples:
        >>> from urduhack.preprocessing import preprocess
        >>> text = "اَباُوگل پاکستان ﻤﯿﮟ 20 سال ﺳﮯ ، وسائل کی کوئی کمی نہیں ﮨﮯ۔"
        >>> normalized_text = preprocess(text)
        >>> preprocess(20)
        Traceback (most recent call last):
            ...
        TypeError: text must be str type.
    """
    # Reject non-string input up front, before any regex work is attempted.
    if not isinstance(text, str):
        raise TypeError("text must be str type.")

    # Order is significant: digits, then punctuation, then English characters.
    pipeline = (digits_space, all_punctuations_space, english_characters_space)
    for step in pipeline:
        text = step(text)
    return text