From 98decc932d620305e3cc19c65d36cbf62f707176 Mon Sep 17 00:00:00 2001 From: "larry.spohn" Date: Fri, 16 Jan 2026 09:10:44 -0500 Subject: [PATCH] Fix XML parsing errors caused by invalid characters in AD attributes Text records now sanitize values before XML serialization by: - Removing invalid XML 1.0 control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F) - Escaping XML special characters (<, >, &, ", ') This fixes "not well-formed (invalid token)" errors when querying AD environments where user attributes contain control characters. Co-Authored-By: Claude Opus 4.5 --- src/encoder/records/text.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/encoder/records/text.py b/src/encoder/records/text.py index 73d0fc5..7c3c71c 100644 --- a/src/encoder/records/text.py +++ b/src/encoder/records/text.py @@ -1,14 +1,32 @@ import base64 import datetime +import re import struct from html.entities import codepoint2name from typing import Self +from xml.sax.saxutils import escape as xml_escape from .constants import DICTIONARY from .datatypes import Decimal, MultiByteInt31 from .record import record +# Regex to match invalid XML 1.0 characters (control chars except tab, newline, carriage return) +_INVALID_XML_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]') + + +def _sanitize_xml_text(value: str) -> str: + """Sanitize a string for safe inclusion in XML. + + 1. Removes invalid XML 1.0 control characters + 2. Escapes XML special characters (<, >, &, ", ') + """ + # Remove invalid control characters + clean = _INVALID_XML_CHARS.sub('', value) + # Escape XML special characters + return xml_escape(clean, entities={'"': '&quot;', "'": '&apos;'}) + + class Text(record): ...
@@ -150,7 +168,7 @@ def to_bytes(self) -> bytes: return bytes def __str__(self): - return self.value + return _sanitize_xml_text(self.value) @classmethod def parse(cls, fp) -> Self: @@ -170,7 +188,7 @@ def to_bytes(self) -> bytes: return bytes def __str__(self): - return self.value + return _sanitize_xml_text(self.value) @classmethod def parse(cls, fp) -> Self: @@ -190,7 +208,7 @@ def to_bytes(self) -> bytes: return bytes def __str__(self): - return self.value + return _sanitize_xml_text(self.value) @classmethod def parse(cls, fp) -> Self: @@ -332,11 +350,7 @@ def __init__(self, value: str): self.value = value def __str__(self): - # TODO: check if having unexcaped value is a problem? - # removed the return excape(self.value) because str() was used - # in the print_records function, so it printed stuff like - # amp(value) and stuff. - return self.value + return _sanitize_xml_text(self.value) def to_bytes(self) -> bytes: data = self.value.encode("utf-8") @@ -360,7 +374,7 @@ def __init__(self, value: str): self.value = value def __str__(self): - return self.value + return _sanitize_xml_text(self.value) def to_bytes(self) -> bytes: data = self.value.encode("utf-8") @@ -384,7 +398,7 @@ def __init__(self, value: str): self.value = value def __str__(self): - return self.value + return _sanitize_xml_text(self.value) def to_bytes(self) -> bytes: data = self.value.encode("utf-8")