# Source code for app.components.text_handling

"""Text handling utilities for cleaning and normalizing strings.

Utilities for:
- Removing accent marks from characters
- Replacing special characters with specified replacements
- Combined accent and special character handling
- Simplified text cleaning interface
"""

from typing import Optional, Dict
import re
import unidecode

def remove_accent_characters(text: str) -> str:
    """Return ``text`` with accented characters swapped for plain ones.

    The conversion is delegated entirely to ``unidecode.unidecode``.

    :param text: String that may contain accented characters.
    :returns: The value produced by ``unidecode.unidecode(text)``.
    """
    unaccented: str = unidecode.unidecode(text)
    return unaccented
def replace_special_characters(
    text: str,
    replacewith: str = '.',
    dict_and_re: bool = False,
    replacement_dict: Optional[Dict[str, str]] = None,
    stripresult: bool = True,
    remove_duplicates: bool = False,
    make_lowercase: bool = True,
    allow_numbers: bool = True,
    allow_space: bool = False,
    mask_first_digit: Optional[str] = None,
) -> str:
    """Replace special characters in a string with specified replacements.

    Processing order: dictionary substitutions (if any), special-character
    replacement, stripping, duplicate collapsing, lowercasing, digit masking.

    :param text: Input string containing special characters.
    :param replacewith: Literal string inserted in place of every special
        character. It is never interpreted as a regex replacement template.
    :param dict_and_re: When a ``replacement_dict`` is given, also run the
        regex-based replacement afterwards. When false (and a dictionary is
        given) every character for which ``str.isalnum()`` is false is
        replaced instead; ``allow_numbers`` and ``allow_space`` are not
        consulted on that path.
    :param replacement_dict: Mapping of specific substrings to replacements.
        Keys are applied longest first. An empty mapping is treated like
        ``None``.
    :param stripresult: Strip whitespace and replacement characters from both
        ends of the result.
    :param remove_duplicates: Collapse consecutive replacement strings into
        one.
    :param make_lowercase: Convert result to lowercase.
    :param allow_numbers: Keep ASCII digits (regex path only).
    :param allow_space: Keep spaces (regex path only).
    :param mask_first_digit: String to prefix when the first character of the
        result is a digit. An empty result is returned unchanged.
    :returns: String with special characters replaced.
    """
    # Build the negated character class from the enabled character groups.
    allowed = 'a-zA-Z'
    if allow_numbers:
        allowed += '0-9'
    if allow_space:
        allowed += ' '
    regex_pat = f'[^{allowed}]'

    def _regex_replace(value: str) -> str:
        # A callable replacement keeps ``replacewith`` literal; passing the
        # string itself would make re.sub process backslash escapes in it.
        return re.sub(regex_pat, lambda _match: replacewith, value)

    ret: str
    if not replacement_dict:
        ret = _regex_replace(text)
    else:
        # Longest keys first so that overlapping patterns are handled
        # correctly (e.g. '&&' is substituted before '&').
        for key in sorted(replacement_dict, key=len, reverse=True):
            text = text.replace(key, replacement_dict[key])
        if dict_and_re:
            ret = _regex_replace(text)
        else:
            ret = ''.join(
                character if character.isalnum() else replacewith
                for character in text
            )

    if stripresult:
        # Whitespace and replacement characters may be interleaved at the
        # ends, so alternate both strips until nothing changes any more.
        previous: Optional[str] = None
        while previous != ret:
            previous = ret
            ret = ret.strip().strip(replacewith)

    # An empty ``replacewith`` has nothing to collapse (and '' is contained
    # in every string, so the loop below must not run for it).
    if remove_duplicates and replacewith:
        doubled = replacewith + replacewith
        while doubled in ret:
            ret = ret.replace(doubled, replacewith)

    if make_lowercase:
        ret = ret.lower()

    # Slice instead of index: the result may be empty after stripping.
    if mask_first_digit and ret[:1].isdigit():
        ret = mask_first_digit + ret

    return ret
def replace_accent_and_special_characters(text: str, **kwargs) -> str:
    """Remove accents from ``text`` and then replace its special characters.

    :param text: Input string containing accented and special characters.
    :param kwargs: Forwarded unchanged to ``replace_special_characters``.
    :returns: The string returned by ``replace_special_characters`` for the
        unaccented text.
    """
    unaccented = remove_accent_characters(text)
    return replace_special_characters(unaccented, **kwargs)
def clean_text(text: str) -> str:
    """Clean ``text`` using the default accent and special-character handling.

    Shorthand for calling ``replace_accent_and_special_characters`` with no
    keyword overrides.

    :param text: Input string to clean.
    :returns: Cleaned string produced with all default options.
    """
    cleaned = replace_accent_and_special_characters(text)
    return cleaned
def sanitize_for_database_use(text: str) -> str:
    """Sanitize a string for use in a database column name.

    Accents are removed first; the text is then passed through
    ``replace_special_characters`` with ``'_'`` as the replacement, digits
    disallowed, and ``'c'`` as the first-digit mask. All other options keep
    their defaults.

    :param text: Input string to sanitize.
    :returns: Sanitized string.
    """
    # NOTE(review): allow_numbers=False replaces every digit with '_', so the
    # mask_first_digit prefix looks unreachable here -- confirm which of the
    # two options reflects the intended column-name format.
    options = {
        'replacewith': '_',
        'allow_numbers': False,
        'mask_first_digit': 'c',
    }
    return replace_accent_and_special_characters(text, **options)