Source code for urduhack.preprocessing.character

# coding: utf8
"""
Urduhack Character preprocess functions
"""

from .regexes import _SPACE_AFTER_ALL_PUNCTUATIONS_RE, _SPACE_BEFORE_ALL_PUNCTUATIONS_RE
from .regexes import _SPACE_AFTER_DIGITS_RE, _SPACE_BEFORE_DIGITS_RE
from .regexes import _SPACE_BEFORE_ENG_CHAR_RE, _SPACE_AFTER_ENG_CHAR_RE


[docs]def digits_space(text: str) -> str:
    """
    Add spaces before|after numeric and urdu digits

    Args:
        text (str): ``Urdu`` text
    Returns:
        str: Returns a ``str`` object containing normalized text.
    Examples:
        >>> from urduhack.preprocessing import digits_space
        >>> text = "20فیصد"
        >>> normalized_text = digits_space(text)
        >>> normalized_text
        20 فیصد
    """
    text = _SPACE_BEFORE_DIGITS_RE.sub(' ', text)
    text = _SPACE_AFTER_DIGITS_RE.sub(' ', text)

    return text


[docs]def english_characters_space(text: str) -> str:
    """
    Functionality to add spaces before and after English words in the given Urdu text. It is an important step in
    normalization of the Urdu data.

    this function returns a :py:class:`String` object which contains the original text with spaces before & after
    English words.

    Args:
        text (str): ``Urdu`` text
    Returns:
        str: Returns a ``str`` object containing normalized text.
    Examples:
        >>> from urduhack.preprocessing import english_characters_space
        >>> text = "خاتون Aliyaنے بچوںUzma and Aliyaکے قتل کا اعترافConfession کیا ہے۔"
        >>> normalized_text = english_characters_space(text)
        >>> normalized_text
        خاتون Aliya نے بچوں Uzma and Aliya کے قتل کا اعتراف Confession کیا ہے۔
    """
    text = _SPACE_BEFORE_ENG_CHAR_RE.sub(' ', text)
    text = _SPACE_AFTER_ENG_CHAR_RE.sub(' ', text)

    return text


[docs]def all_punctuations_space(text: str) -> str:
    """
    Add spaces after punctuations used in ``urdu`` writing

    Args:
        text (str): ``Urdu`` text
    Returns:
        str: Returns a ``str`` object containing normalized text.
    """
    text = _SPACE_BEFORE_ALL_PUNCTUATIONS_RE.sub(' ', text)
    text = _SPACE_AFTER_ALL_PUNCTUATIONS_RE.sub(' ', text)
    return text


[docs]def preprocess(text: str) -> str:
    """
    To preprocess some text, all you need to do pass ``unicode`` text. It will return a ``str``
    with proper spaces after digits and punctuations.

    Args:
        text (str): ``Urdu`` text
    Returns:
        str: urdu text
    Raises:
        TypeError: If text param is not not str Type.
    Examples:
        >>> from urduhack.preprocessing import preprocess
        >>> text = "اَباُوگل پاکستان ﻤﯿﮟ 20 سال ﺳﮯ ، وسائل کی کوئی کمی نہیں ﮨﮯ۔"
        >>> normalized_text = preprocess(text)
        >>> # The text now contains proper spaces after digits and punctuations,
        >>> # normalized characters and no diacritics!
        >>> normalized_text
        اباوگل پاکستان ﻤﯿﮟ 20 سال ﺳﮯ ، وسائل کی کوئی کمی نہیں ﮨﮯ ۔
    """
    if not isinstance(text, str):
        raise TypeError("text must be str type.")

    text = digits_space(text)
    text = all_punctuations_space(text)
    text = english_characters_space(text)
    return text