"""Text handling utilities for cleaning and normalizing strings.
Provides utilities for:
- Removing accent marks from characters
- Replacing special characters with specified replacements
- Combined accent and special character handling
- Simplified text cleaning interface
- Sanitizing strings for use as database column names
"""
from typing import Optional, Dict
import re
import unidecode
[docs]
def remove_accent_characters(text: str) -> str:
    """Return ``text`` transliterated by ``unidecode.unidecode``.

    The conversion is delegated entirely to the ``unidecode`` package, which
    is used here to swap accented characters for unaccented equivalents.

    :param text: Input string that may contain accented characters.
    :returns: The transliterated string produced by ``unidecode.unidecode``.
    """
    transliterated: str = unidecode.unidecode(text)
    return transliterated
[docs]
def replace_special_characters(
    text: str,
    replacewith: str = '.',
    dict_and_re: bool = False,
    replacement_dict: Optional[Dict[str, str]] = None,
    stripresult: bool = True,
    remove_duplicates: bool = False,
    make_lowercase: bool = True,
    allow_numbers: bool = True,
    allow_space: bool = False,
    mask_first_digit: Optional[str] = None
) -> str:
    """Replace special characters in a string with a replacement string.

    Processing order: dictionary replacements (if any), character
    replacement, stripping, duplicate collapsing, lowercasing, and finally
    digit masking.

    When ``replacement_dict`` is empty or ``None``, or when ``dict_and_re`` is
    true, every character outside ASCII letters (plus digits and/or the space
    character, depending on ``allow_numbers`` / ``allow_space``) is replaced.
    When a ``replacement_dict`` is given and ``dict_and_re`` is false, the
    fallback instead keeps every character for which ``str.isalnum()`` is
    true; ``allow_numbers`` and ``allow_space`` are not consulted on that path.

    :param text: Input string containing special characters.
    :param replacewith: String substituted for each disallowed character. It
        is always inserted literally (backslashes are not interpreted).
    :param dict_and_re: Apply the regex replacement after the dictionary
        replacements instead of the ``str.isalnum()`` fallback.
    :param replacement_dict: Mapping of substrings to their replacements,
        applied longest key first.
    :param stripresult: Strip whitespace and the characters of
        ``replacewith`` from both ends of the result.
    :param remove_duplicates: Collapse consecutive occurrences of
        ``replacewith`` into one.
    :param make_lowercase: Convert the result to lowercase.
    :param allow_numbers: Keep ASCII digits (regex path only).
    :param allow_space: Keep the space character (regex path only).
    :param mask_first_digit: Prefix added when the result starts with a
        digit; ``None`` or an empty string disables masking.
    :returns: The cleaned string; may be empty.
    """
    allowed = 'a-zA-Z'
    if allow_numbers:
        allowed += '0-9'
    if allow_space:
        allowed += ' '
    regex_pat = f'[^{allowed}]'

    def _regex_replace(value: str) -> str:
        # A callable replacement keeps ``replacewith`` literal, so a backslash
        # in it is neither expanded as a template escape nor an error. This
        # matches how the non-regex path and the strip step treat it.
        return re.sub(regex_pat, lambda _match: replacewith, value)

    ret: str
    if not replacement_dict:
        ret = _regex_replace(text)
    else:
        # Longest keys first so overlapping patterns prefer the longer match.
        for key in sorted(replacement_dict, key=len, reverse=True):
            text = text.replace(key, replacement_dict[key])
        if dict_and_re:
            ret = _regex_replace(text)
        else:
            ret = ''.join(
                character if character.isalnum() else replacewith
                for character in text
            )

    if stripresult:
        # Alternate the two strips until stable: removing replacement
        # characters can expose whitespace at the ends, and vice versa.
        previous: Optional[str] = None
        while previous != ret:
            previous = ret
            ret = ret.strip().strip(replacewith)

    # An empty ``replacewith`` has nothing to collapse (and '' is "in" every
    # string, so the loop below must not run for it).
    if remove_duplicates and replacewith:
        doubled = replacewith * 2
        while doubled in ret:
            ret = ret.replace(doubled, replacewith)

    if make_lowercase:
        ret = ret.lower()

    # Guard against an empty result before inspecting the first character.
    if mask_first_digit and ret and ret[0].isdigit():
        ret = mask_first_digit + ret
    return ret
[docs]
def replace_accent_and_special_characters(
    text: str,
    **kwargs
) -> str:
    """Strip accents from ``text`` and then replace its special characters.

    The text is first passed through ``remove_accent_characters`` and the
    result is handed to ``replace_special_characters``.

    :param text: Input string containing accented and special characters.
    :param kwargs: Keyword arguments forwarded unchanged to
        ``replace_special_characters``.
    :returns: Cleaned string.
    """
    without_accents = remove_accent_characters(text)
    return replace_special_characters(without_accents, **kwargs)
[docs]
def clean_text(text: str) -> str:
    """Clean ``text`` using the default accent and special-character handling.

    Convenience wrapper that calls ``replace_accent_and_special_characters``
    with no extra options.

    :param text: Input string to clean.
    :returns: Cleaned string with default handling.
    """
    cleaned: str = replace_accent_and_special_characters(text)
    return cleaned
[docs]
def sanitize_for_database_use(text: str) -> str:
    """Sanitize a string for use in a database column name.

    Accents are removed first, then ``replace_special_characters`` is called
    with ``'_'`` as the replacement, digits disallowed, and ``'c'`` as the
    prefix for results that start with a digit.

    NOTE(review): because ``allow_numbers=False`` replaces digits as well,
    the ``'c'`` prefix does not appear to be reachable with these settings;
    confirm whether digits were meant to be kept.

    :param text: Input string to sanitize.
    :returns: Sanitized string.
    """
    unaccented = remove_accent_characters(text)
    return replace_special_characters(
        unaccented,
        replacewith='_',
        allow_numbers=False,
        mask_first_digit='c',
    )