diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 0000000..28e2561 --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,26 @@ +name: Go + +on: + push: + branches: [ main ] + tags: ['v*'] + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v5 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Build + run: go build ./... + + - name: Test + run: go test -v ./... diff --git a/README.md b/README.md new file mode 100644 index 0000000..2c7ef23 --- /dev/null +++ b/README.md @@ -0,0 +1,167 @@ +[![Go Reference](https://pkg.go.dev/badge/github.com/imflog/xmlcompare.svg)](https://pkg.go.dev/github.com/imflog/xmlcompare) +[![CI](https://github.com/ImFlog/xml-compare/actions/workflows/go.yml/badge.svg?branch=main)](https://github.com/ImFlog/xml-compare/actions/workflows/go.yml) + +# xmlcompare + +A tiny, focused Go library to compare two XML documents for structural equality. + +It is designed for tests and validation code where you want to assert that two +XML snippets are the same regardless of child elements order, attribute +order, or incidental whitespace differences. + +Key properties: + +- Order-independent comparison of child elements +- Attribute order does not matter; names and values must match +- Text nodes are compared with whitespace normalization +- Namespace-aware tag matching (qualified name = `prefix:local` semantics) +- Helpful mismatch messages printed to stdout describing the first difference + +## Installation + +```bash +go get github.com/imflog/xmlcompare +``` + +## Quick start + +```go +package main + +import ( + "fmt" + xmlcmp "github.com/imflog/xmlcompare" +) + +func main() { + a := `hello world` + b := `hello world` + + equal, err := xmlcmp.Equal(a, b) + if err != nil { + panic(err) + } + fmt.Println(equal) // true +} +``` + +## API + +```go +func Equal(actual, expected string) (bool, error) +``` + +Parses both XML strings and performs an order‑independent, namespace-aware +comparison. It returns: + +- `true, nil` when documents are considered equal +- `false, nil` when a difference is found +- `false, err` if either XML cannot be parsed + +On the first difference, a human‑readable explanation is printed to stdout to +aid debugging (see examples below). This is convenient in tests because your +test logs will show exactly what differed. + +## What “equal” means here + +This library purposefully defines equality in a practical testing‑friendly way: + +- Element order is ignored. Siblings are matched by qualified tag name and + then paired using a similarity heuristic (attributes and child tags) to make + diagnostics meaningful. +- Attributes are compared by qualified name and value, ignoring attribute + order. Missing, extra, or different values are reported. +- Text content is compared after whitespace normalization (collapsing runs of + whitespace to a single space and trimming ends). This avoids false negatives + from indentation and formatting. +- Namespaces matter. The qualified element name must match, both prefix + (namespace) and local tag must be the same for a match. + +## Examples + +Order and whitespace insensitivity: + +```go +ok, err := xmlcmp.Equal( +` hello world `, +`hello world`, +) +// ok == true, err == nil +``` + +Mismatch examples (messages go to stdout): + +```go +ok, _ := xmlcmp.Equal(``, ``) +// Output (example): +// Missing child at /root/b +// ok == false + +ok, _ = xmlcmp.Equal(``, ``) +// Output: +// Attribute value differs at /root: @id actual="123" expected="999" +// ok == false + +ok, _ = xmlcmp.Equal( +``, +``, +) +// Output: +// XML mismatch at /ns1:root: different tags: actual= expected= +// ok == false +``` + +Parsing errors: + +```go +ok, err := xmlcmp.Equal(``, ``) +// ok == false, err != nil (invalid XML) +``` + +## Behavior details + +- Child matching strategy: For each actual child, candidates in the expected + document with the same qualified tag are considered. If there is a single + candidate, it is selected. If multiple candidates exist, a similarity score + based on attributes, child tag sets, and direct text is used to pick the best + match before recursing. +- Attributes: comparison is exact on both name and value. Attribute order is + irrelevant. Namespace declaration attributes (e.g., `xmlns` / `xmlns:prefix`) + are currently treated like regular attributes during equality checks. +- Text normalization: `strings.Fields` is used to collapse runs of whitespace + into single spaces and trim at both ends before comparison. + +## Limitations and notes + +- Only element and text nodes are considered. Comments, processing instructions, + and CDATA are not explicitly handled and may affect parsing depending on your + inputs. +- Namespace comparison currently requires that the element’s qualified name + (prefix plus local) matches between actual and expected; it does not attempt to + canonicalize or resolve different prefixes that bind to the same URI. + This is something we will work on in the future. +- The function prints the first detected mismatch to stdout for simplicity, as it is + intended for use in tests. This could be improved in the future to return a + structured result. + +## Testing + +The repository includes unit tests illustrating typical success and failure +cases. Run: + +```bash +go test ./... +``` + +## Version compatibility + +- Go: tested with Go 1.25+ +- XML parser: [`github.com/beevik/etree`](https://github.com/beevik/etree) + +## Contributing + +Contributions and ideas are welcome—please open an issue to discuss. + +## License + +This project is licensed under the MIT License. See `LICENSE` for details. diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..7c1d2a6 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/imflog/xmlcompare + +go 1.25.0 + +require github.com/beevik/etree v1.6.0 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..5e553d1 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/beevik/etree v1.6.0 h1:u8Kwy8pp9D9XeITj2Z0XtA5qqZEmtJtuXZRQi+j03eE= +github.com/beevik/etree v1.6.0/go.mod h1:bh4zJxiIr62SOf9pRzN7UUYaEDa9HEKafK25+sLc0Gc= diff --git a/xmlcompare.go b/xmlcompare.go new file mode 100644 index 0000000..c922976 --- /dev/null +++ b/xmlcompare.go @@ -0,0 +1,267 @@ +package xmlcompare + +import ( + "fmt" + "strings" + + "github.com/beevik/etree" +) + +// Equal compares two XML strings for structural equality, ignoring element order. +// It returns true if they are equivalent, false otherwise. +func Equal(actual, expected string) (bool, error) { + actualDoc := etree.NewDocument() + if err := actualDoc.ReadFromString(actual); err != nil { + fmt.Printf("error reading actual XML: %v", err) + return false, err + } + expectedDoc := etree.NewDocument() + if err := expectedDoc.ReadFromString(expected); err != nil { + fmt.Printf("error reading expected XML: %v", err) + return false, err + } + return compareElements(actualDoc.Root(), expectedDoc.Root()), nil +} + +// compareElements compares two etree.Element trees, ignoring element order. +// When a mismatch occurs, it prints an actionable diff and, when relevant, +// includes the exact unexpected XML subtree found in the actual document +// but not present in the expected one. +func compareElements(actual, expected *etree.Element) bool { + return compareElementsWithPath(actual, expected, "/"+actual.Tag) +} + +// compareElementsWithPath performs a detailed, order-independent comparison. +// On the first difference it prints an explicit explanation. If the actual XML +// contains a subtree that is not present in the expected XML, that exact +// unexpected XML snippet is printed to aid debugging. +func compareElementsWithPath(actual, expected *etree.Element, path string) bool { + // Tag + // Compare both namespace prefix (Space) and local tag (Tag) + if actual.Space != expected.Space || actual.Tag != expected.Tag { + aName := actual.Tag + eName := expected.Tag + if actual.Space != "" { + aName = actual.Space + ":" + aName + } + if expected.Space != "" { + eName = expected.Space + ":" + eName + } + fmt.Printf("XML mismatch at %s: different tags: actual=<%s> expected=<%s>\n", path, aName, eName) + return false + } + + // Attributes (order-independent) + if !compareAttributes(actual, expected, path) { + return false + } + + // Children (order-independent) with smart pairing + actualChildren := actual.ChildElements() + expectedChildren := expected.ChildElements() + used := make([]bool, len(expectedChildren)) + + for _, actualChild := range actualChildren { + // 1) Candidates with the same qualified tag name (namespace + local) + aLocal := actualChild.Tag + sameTagIdx := sameTagCandidates(aLocal, actualChild.Space, expectedChildren, used) + if len(sameTagIdx) == 0 { + fmt.Printf("Unexpected child at %s/%s\n", path, aLocal) + return false + } + + // 2) If only one candidate, recurse into it + if len(sameTagIdx) == 1 { + j := sameTagIdx[0] + childPath := path + "/" + aLocal + if compareElementsWithPath(actualChild, expectedChildren[j], childPath) { + used[j] = true + continue + } + return false + } + + // 3) Choose the most similar expected child and recurse + bestIdx := chooseMostSimilar(actualChild, expectedChildren, sameTagIdx) + if bestIdx >= 0 { + childPath := path + "/" + aLocal + if compareElementsWithPath(actualChild, expectedChildren[bestIdx], childPath) { + used[bestIdx] = true + continue + } + return false // mismatch already reported + } + + // Fallback (should not happen): report unexpected child + fmt.Printf("Unexpected child at %s/%s\n", path, aLocal) + return false + } + + // Any expected children that remain unmatched are missing in actual + for j, ec := range expectedChildren { + if !used[j] { + fmt.Printf("Missing child at %s/%s\n", path, ec.Tag) + return false + } + } + + // Text content (normalized) + tAct := normalizeXMLText(actual.Text()) + tExp := normalizeXMLText(expected.Text()) + if tAct != tExp { + fmt.Printf("Text differs at %s: actual=%q expected=%q\n", path, tAct, tExp) + return false + } + + return true +} + +// compareAttributes checks attributes ignoring order. +func compareAttributes(actual, expected *etree.Element, path string) bool { + expAttrMap := make(map[string]string, len(expected.Attr)) + for _, attr := range expected.Attr { + expAttrMap[attr.Key] = attr.Value + } + + // Report any unexpected or differing attribute in actual + for _, attr := range actual.Attr { + if v, ok := expAttrMap[attr.Key]; !ok { + fmt.Printf("Unexpected attribute at %s: @%s=\"%s\"\n", path, attr.Key, attr.Value) + return false + } else if v != attr.Value { + fmt.Printf("Attribute value differs at %s: @%s actual=\"%s\" expected=\"%s\"\n", path, attr.Key, attr.Value, v) + return false + } + } + + // Missing checks: attributes in actual but not in expected + actAttrMap := make(map[string]string, len(actual.Attr)) + for _, attr := range actual.Attr { + actAttrMap[attr.Key] = attr.Value + } + for _, attr := range expected.Attr { + if _, ok := actAttrMap[attr.Key]; !ok { + fmt.Printf("Missing attribute at %s: expected @%s=\"%s\"\n", path, attr.Key, attr.Value) + return false + } + } + return true +} + +// attrFullKey formats an attribute's qualified name, including prefix when present. +func attrFullKey(a etree.Attr) string { + if a.Space != "" { + return a.Space + ":" + a.Key + } + return a.Key +} + +// isXMLNSAttr returns true if the attribute is a namespace declaration (xmlns or xmlns:prefix). +func isXMLNSAttr(a etree.Attr) bool { + return a.Space == "xmlns" || a.Key == "xmlns" +} + +// sameTagCandidates returns indices of children having the given local tag and not used. +func sameTagCandidates(local string, space string, candidates []*etree.Element, used []bool) []int { + idx := make([]int, 0) + for j, ec := range candidates { + if used[j] { + continue + } + if ec.Tag == local && ec.Space == space { + idx = append(idx, j) + } + } + return idx +} + +// chooseMostSimilar picks the candidate with the highest heuristic similarity. +func chooseMostSimilar(a *etree.Element, children []*etree.Element, idxs []int) int { + bestIdx, bestScore := -1, -1 + for _, j := range idxs { + if score := similarityScore(a, children[j]); score > bestScore { + bestScore, bestIdx = score, j + } + } + return bestIdx +} + +// normalizeXMLText trims and collapses whitespace so that formatting/indentation +// differences don't cause false negatives when comparing equivalent XML. +func normalizeXMLText(s string) string { + // Collapse all runs of whitespace to a single space and trim ends. + // If the result is just an empty string, return empty. + collapsed := strings.Join(strings.Fields(s), " ") + return collapsed +} + +// similarityScore computes a heuristic similarity between two elements to help +// choose the best candidate for detailed comparison. +func similarityScore(a, b *etree.Element) int { + score := 0 + + // Attribute key/value matches (ignore xmlns, use qualified keys) + mb := make(map[string]string) + for _, x := range b.Attr { + if isXMLNSAttr(x) { + continue + } + mb[attrFullKey(x)] = x.Value + } + for _, x := range a.Attr { + if isXMLNSAttr(x) { + continue + } + if v, ok := mb[attrFullKey(x)]; ok && v == x.Value { + score += 3 + } + } + + // Overlap of child tag local names (ignoring order and multiplicity) + setA := childTagSet(a) + setB := childTagSet(b) + for name := range setA { + if setB[name] { + score += 1 + } + } + + // Text equality bonus + if normalizeXMLText(a.Text()) != "" && normalizeXMLText(a.Text()) == normalizeXMLText(b.Text()) { + score += 1 + } + + // Direct child text matches by tag name (helps list-like structures generically) + am := directChildTextMap(a) + bm := directChildTextMap(b) + // weigh exact text matches per tag + for name, va := range am { + if vb, ok := bm[name]; ok && va == vb && va != "" { + score += 4 + } + } + + return score +} + +// directChildTextMap returns a map of local child tag name -> normalized text (first occurrence). +func directChildTextMap(el *etree.Element) map[string]string { + m := make(map[string]string) + for _, c := range el.ChildElements() { + name := c.Tag + if _, exists := m[name]; exists { + continue + } + txt := normalizeXMLText(c.Text()) + m[name] = txt + } + return m +} + +func childTagSet(el *etree.Element) map[string]bool { + m := make(map[string]bool) + for _, c := range el.ChildElements() { + m[c.Tag] = true + } + return m +} diff --git a/xmlcompare_test.go b/xmlcompare_test.go new file mode 100644 index 0000000..722a253 --- /dev/null +++ b/xmlcompare_test.go @@ -0,0 +1,122 @@ +package xmlcompare + +import "testing" + +func TestEqual_PassingCase_WhitespaceAndOrderInsensitive(t *testing.T) { + a := ` + + hello world + + text + ` + // Same content but different order, different whitespace, and attribute order flipped + b := ` + + + text + + + hello world + ` + + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !ok { + t.Fatalf("expected XMLs to be considered equal") + } +} + +func TestEqual_Error_WrongNamespace(t *testing.T) { + a := `` + b := `` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("expected mismatch because of different namespaces") + } +} + +func TestEqual_Error_MissingAttribute(t *testing.T) { + a := `` + b := `` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("expected mismatch because expected has no @id, actual does") + } +} + +func TestEqual_Error_WrongAttributeValue(t *testing.T) { + a := `` + b := `` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("expected mismatch because attribute values differ") + } +} + +func TestEqual_Error_AdditionalChildTag(t *testing.T) { + a := `` + b := `` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("expected mismatch because of additional child ") + } +} + +func TestEqual_Error_MissingChildTag(t *testing.T) { + a := `` + b := `` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("expected mismatch because actual is missing child ") + } +} + +func TestEqual_TextDifference(t *testing.T) { + a := `hello` + b := `world` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("expected mismatch because text differs") + } +} + +func TestEqual_TextWhitespaceIsNormalized(t *testing.T) { + a := ` hello world ` + b := `hello world` + ok, err := Equal(a, b) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !ok { + t.Fatalf("expected equality because whitespace should be normalized") + } +} + +func TestEqual_ParsingError(t *testing.T) { + a := `` // invalid XML + b := `` + ok, err := Equal(a, b) + if err == nil || ok { + t.Fatalf("expected parsing error and false, got ok=%v err=%v", ok, err) + } +}