diff --git a/src/encoder/records/text.py b/src/encoder/records/text.py index 73d0fc5..7c3c71c 100644 --- a/src/encoder/records/text.py +++ b/src/encoder/records/text.py @@ -1,14 +1,32 @@ import base64 import datetime +import re import struct from html.entities import codepoint2name from typing import Self +from xml.sax.saxutils import escape as xml_escape from .constants import DICTIONARY from .datatypes import Decimal, MultiByteInt31 from .record import record +# Regex to match invalid XML 1.0 characters (control chars except tab, newline, carriage return) +_INVALID_XML_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]') + + +def _sanitize_xml_text(value: str) -> str: + """Sanitize a string for safe inclusion in XML. + + 1. Removes invalid XML 1.0 control characters + 2. Escapes XML special characters (<, >, &, ", ') + """ + # Remove invalid control characters + clean = _INVALID_XML_CHARS.sub('', value) + # Escape XML special characters + return xml_escape(clean, entities={'"': '&quot;', "'": '&apos;'}) + + class Text(record): ... @@ -150,7 +168,7 @@ def to_bytes(self) -> bytes: return bytes def __str__(self): - return self.value + return _sanitize_xml_text(self.value) @classmethod def parse(cls, fp) -> Self: @@ -170,7 +188,7 @@ def to_bytes(self) -> bytes: return bytes def __str__(self): - return self.value + return _sanitize_xml_text(self.value) @classmethod def parse(cls, fp) -> Self: @@ -190,7 +208,7 @@ def to_bytes(self) -> bytes: return bytes def __str__(self): - return self.value + return _sanitize_xml_text(self.value) @classmethod def parse(cls, fp) -> Self: @@ -332,11 +350,7 @@ def __init__(self, value: str): self.value = value def __str__(self): - # TODO: check if having unexcaped value is a problem? - # removed the return excape(self.value) because str() was used - # in the print_records function, so it printed stuff like - # amp(value) and stuff. 
- return self.value + return _sanitize_xml_text(self.value) def to_bytes(self) -> bytes: data = self.value.encode("utf-8") @@ -360,7 +374,7 @@ def __init__(self, value: str): self.value = value def __str__(self): - return self.value + return _sanitize_xml_text(self.value) def to_bytes(self) -> bytes: data = self.value.encode("utf-8") @@ -384,7 +398,7 @@ def __init__(self, value: str): self.value = value def __str__(self): - return self.value + return _sanitize_xml_text(self.value) def to_bytes(self) -> bytes: data = self.value.encode("utf-8")