# MXMessage/generate-common.py (GoPlasmatic/MXMessage)
import os
import re
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
import argparse

@dataclass
class StructMatch:
    """One struct/enum definition located in a scanned Rust source file."""
    # Name of the .rs file (as listed in the scanned directory) holding the definition.
    filename: str
    # Source text of the definition: leading comment, derive attribute and body,
    # followed by the type's validate() impl block when scan_rust_files found one.
    content: str
    # Start/end offsets of the definition's regex match within the file's text
    # (the impl block, if any, is not included in this span).
    start_pos: int
    end_pos: int
    # Filled in by scan_rust_files after every file is read: the number of
    # `pub field: Type` declarations naming this type, summed over all scanned files.
    usage_count: int = 0

def scan_rust_files(directory: str) -> tuple[defaultdict, dict, list]:
    """
    Scan all .rs files (except mod.rs) in ``directory`` for struct and enum definitions.

    A definition is recognised when it has the shape::

        // TypeName: free-text description
        // (optional further comment lines)
        #[derive(...)]
        pub struct|enum TypeName { ... }

    where the body contains no ``}``. If the same file also contains an
    ``impl TypeName { ... pub fn validate(&self) -> Result<(), ValidationError> ... }``
    block, its text is appended to the recorded content.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A 3-tuple ``(type_locations, file_contents, lowercase_matches)``:

        * ``type_locations`` maps a type name to the list of ``StructMatch``
          objects describing each place it is defined. Every match's
          ``usage_count`` holds the number of ``pub field: Type`` declarations
          naming the type, summed over all scanned files.
        * ``file_contents`` maps file name to the text of every file that was
          read successfully.
        * ``lowercase_matches`` is always an empty list here; it is kept so the
          return shape stays stable for callers.

        All three are empty when ``directory`` is not a directory.
    """
    type_locations = defaultdict(list)
    file_contents = {}
    lowercase_matches = []

    # Pattern to find struct/enum definitions with their comments
    struct_pattern = re.compile(
        r'(// (\w+): [^\n]*\n'  # Capture comment with type name
        r'(?://[^\n]*\n)*'      # Optional additional comment lines
        r'#\[derive[^\]]*\]\n'  # Derive attribute
        r'pub (?:struct|enum) \2 \{[^}]*\})',  # Struct or enum definition
        re.MULTILINE | re.DOTALL
    )

    # Pattern to find impl blocks for validation
    impl_pattern = re.compile(
        r'(impl (\w+) \{[^}]*pub fn validate\(&self\) -> Result<\(\), ValidationError> \{.*?\n\})',
        re.MULTILINE | re.DOTALL
    )

    # Usage pattern - look for type names in field definitions
    usage_pattern = re.compile(r'pub \w+: (?:Option<)?(?:Vec<)?(\w+)(?:>)?(?:>)?,?')

    dir_path = Path(directory)
    if not dir_path.is_dir():
        print(f"Directory {directory} does not exist")
        return type_locations, file_contents, lowercase_matches

    # Get all .rs files except mod.rs. os.listdir order is arbitrary, so sort:
    # the first definition seen is the one later copied into mod.rs, and that
    # choice should not depend on the filesystem.
    rust_files = sorted(
        f for f in os.listdir(dir_path)
        if f.endswith('.rs') and f != 'mod.rs'
    )

    print(f"Found {len(rust_files)} .rs files: {', '.join(rust_files)}")

    # Read every file up front; unreadable files are reported and skipped.
    for filename in rust_files:
        file_path = dir_path / filename
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                file_contents[filename] = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading {filename}: {e}")

    # Find struct definitions and their impl blocks. Iterate over what was
    # actually read, so a file that failed above cannot raise KeyError here.
    for filename, content in file_contents.items():
        struct_matches = list(struct_pattern.finditer(content))

        print(f"Found {len(struct_matches)} type definitions in {filename}")

        # Map type name -> text of its validate() impl block (last one wins).
        impl_map = {
            impl_match.group(2): impl_match.group(1)
            for impl_match in impl_pattern.finditer(content)
        }

        for struct_match in struct_matches:
            type_name = struct_match.group(2)
            print(f"  - {type_name}")

            # Combine struct definition with its impl block if it exists
            complete_content = struct_match.group(1)
            if type_name in impl_map:
                complete_content += "\n\n" + impl_map[type_name]

            type_locations[type_name].append(StructMatch(
                filename=filename,
                content=complete_content,
                start_pos=struct_match.start(),
                end_pos=struct_match.end()
            ))

    # Tally field-type usages once across all files instead of re-running the
    # regex over every file for every type.
    usage_totals = defaultdict(int)
    for content in file_contents.values():
        for used_type in usage_pattern.findall(content):
            usage_totals[used_type] += 1

    # Every definition of a type carries the same cross-file total.
    for type_name, matches in type_locations.items():
        total_usage = usage_totals.get(type_name, 0)
        for struct_match in matches:
            struct_match.usage_count = total_usage

    return type_locations, file_contents, lowercase_matches

def generate_mod_file(duplicate_types: dict, output_file: str):
    """Create mod.rs, or extend an existing one, with the shared type definitions.

    Args:
        duplicate_types: Maps a type name to its list of matches; the
            ``content`` of the first match is what gets written.
        output_file: Path of the mod.rs file to (re)write.

    Types that ``read_existing_mod`` already reports for ``output_file`` are
    skipped. New types are appended in name order and the whole file is
    rewritten in a single write.
    """
    known_types, mod_text = read_existing_mod(output_file)

    if not mod_text:
        # No usable mod.rs yet: start from the licence banner and the imports
        # the moved definitions rely on.
        mod_text = """// Plasmatic MX Message Parsing Library
// https://github.com/GoPlasmatic/MXMessage
//
// Copyright (c) 2025 Plasmatic
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// You may obtain a copy of this library at
// https://github.com/GoPlasmatic/MXMessage

use crate::error::*;
use regex::Regex;
use serde::{Deserialize, Serialize};

"""

    # Assemble the complete file text in memory before touching the disk.
    pieces = [mod_text.rstrip('\n')]
    pieces.extend(
        duplicate_types[name][0].content.rstrip('\n')
        for name in sorted(duplicate_types)
        if name not in known_types
    )
    pieces.append('\n')

    with open(output_file, 'w', encoding='utf-8') as out:
        out.write('\n'.join(pieces))

def remove_duplicates_from_files(duplicate_types: dict, file_contents: dict, dir_path: Path):
    """Strip the given type definitions out of their source files and save the files.

    Args:
        duplicate_types: Maps a type name to the matches (objects with a
            ``filename`` attribute) that say which files define it.
        file_contents: Maps file name to file text; updated in place.
        dir_path: Directory the files are written back into.

    For every type, the commented ``#[derive]`` struct/enum definition and an
    immediately following ``validate()`` impl block (when present) are deleted
    from each listed file. Afterwards every entry of ``file_contents`` other
    than ``mod.rs`` is written to disk.
    """
    for type_name, matches in duplicate_types.items():
        # One pattern per type covering definition + optional impl block.
        removal_re = re.compile(
            rf'// {type_name}: [^\n]*\n'  # Comment with type name
            r'(?://[^\n]*\n)*'            # Optional additional comment lines
            rf'#\[derive[^\]]*\]\n'       # Derive attribute
            rf'pub (?:struct|enum) {type_name} \{{[^}}]*\}}\n+'  # Struct or enum definition
            rf'(?:impl {type_name} \{{[^}}]*pub fn validate\(&self\) -> Result<\(\), ValidationError> \{{.*?\n\}}\n*)?',  # Optional impl block
            re.MULTILINE | re.DOTALL
        )

        for entry in matches:
            target = entry.filename
            if target not in file_contents:
                continue
            file_contents[target] = removal_re.sub('', file_contents[target])

        print(f"Removed {type_name} from {len(matches)} files")

    # Persist everything except mod.rs, which is maintained separately.
    for target, text in file_contents.items():
        if target == 'mod.rs':
            continue
        with open(dir_path / target, 'w', encoding='utf-8') as handle:
            handle.write(text)

def print_summary(type_locations: defaultdict, lowercase_matches: list, typecount: int):
    """Print a report of the scanned types and select the ones to move to mod.rs.

    Args:
        type_locations: Maps a type name to the list of matches (objects with
            ``filename``, ``content`` and ``usage_count``) where it is defined.
        lowercase_matches: Matches for lowercase types slated for removal;
            only used for reporting.
        typecount: Threshold on the number of definitions. A type is selected
            when it is defined more than ``typecount`` times. Values below 1
            are treated as 1, because a type defined only once always stays in
            its original file.

    Returns:
        A dict mapping each selected type name to its list of matches. The
        dict is empty (never ``None``) when nothing qualifies.
    """
    # Print lowercase types that will be removed
    if lowercase_matches:
        print("\nLowercase types to be removed:")
        print("-" * 40)
        by_file = defaultdict(list)
        for match in lowercase_matches:
            type_match = re.search(r'// (\w+) \.\.\.\n', match.content)
            if type_match:
                by_file[match.filename].append(type_match.group(1))

        for filename, types in sorted(by_file.items()):
            print(f"{filename}:")
            for type_name in sorted(types):
                print(f"  - {type_name}")
        print()

    # Root structs stay in their original files: these are the types defined
    # exactly once, in a .rs file.
    root_structs = {
        type_name
        for type_name, matches in type_locations.items()
        if len(matches) == 1 and matches[0].filename.endswith('.rs')
    }

    print(f"\nIdentified root structs to keep in original files: {', '.join(sorted(root_structs))}")

    # Honour the caller's threshold, but never go below "more than one definition".
    threshold = max(typecount, 1)
    frequent_types = {
        name: matches
        for name, matches in type_locations.items()
        if name not in root_structs and len(matches) > threshold
    }

    if not frequent_types:
        print("No uppercase types found that appear in multiple files (excluding root structs).")
        return frequent_types

    print("\nUppercase types that appear in multiple files (excluding root structs):")
    print("-" * 70)

    # NOTE(review): scan_rust_files stores the cross-file total in every match's
    # usage_count, so this sum looks like it scales that total by the number of
    # definitions. Kept as-is because it only affects the printed figure; confirm intent.
    usage_data = [
        (type_name, matches, sum(m.usage_count for m in matches))
        for type_name, matches in frequent_types.items()
    ]

    # Sort by number of definitions first, then by total usage count
    for type_name, matches, total_usage in sorted(
        usage_data, key=lambda x: (len(x[1]), x[2]), reverse=True
    ):
        print(f"{type_name}: appears in {len(matches)} files, used {total_usage} times total")
        for match in matches:
            print(f"  - {match.filename}: {match.usage_count} uses")
    print()

    return frequent_types

def read_existing_mod(output_file: str) -> tuple[set, str]:
    """Read an existing mod.rs and report which types it already contains.

    Args:
        output_file: Path of the mod.rs file.

    Returns:
        ``(type_names, content)``, where ``content`` is the file's full text
        and ``type_names`` is the set of types found in it. A type counts as
        present when it is announced by a legacy ``// Name ...`` comment line
        or defined by a ``pub struct Name`` / ``pub enum Name`` line. The
        second rule is what recognises definitions written in the
        ``// Name: description`` comment format. Returns ``(set(), "")`` when
        the file does not exist.
    """
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        return set(), ""

    # Legacy marker comments.
    existing_structs = set(re.findall(r'// (\w+) \.\.\.\n', content))
    # Actual definitions, independent of how their leading comment is phrased.
    existing_structs.update(
        re.findall(r'^pub (?:struct|enum) (\w+)\b', content, re.MULTILINE)
    )
    return existing_structs, content

def main():
    """Command-line entry point: move types shared by several .rs files into mod.rs.

    Positional arguments (both optional):
        directory: Directory containing the .rs files (default ``.``).
        typecount: Integer threshold forwarded to ``print_summary`` (default 1).

    Returns:
        0 on success, 1 when an exception was raised while processing.
        Invalid command-line arguments are rejected by argparse itself.
    """
    parser = argparse.ArgumentParser(
        description='Find frequently used structs in .rs files and move to mod.rs'
    )
    parser.add_argument('directory',
                       help='Directory containing .rs files (default: current directory)',
                       default='.',
                       nargs='?')

    # type=int lets argparse reject a non-numeric threshold with a usage error
    # instead of failing later inside the processing block.
    parser.add_argument('typecount',
                       help='Type count threshold (default: 1)',
                       type=int,
                       default=1,
                       nargs='?')

    args = parser.parse_args()

    try:
        # Scan files and collect both uppercase and lowercase types
        type_locations, file_contents, lowercase_matches = scan_rust_files(args.directory)

        # Print summary and get frequent types; normalise a missing result to
        # an empty dict so the removal step below can always iterate it.
        frequent_types = print_summary(type_locations, lowercase_matches, args.typecount) or {}

        if frequent_types or lowercase_matches:
            output_path = Path(args.directory) / 'mod.rs'

            # Handle uppercase types
            if frequent_types:
                # Read existing types before mod.rs is rewritten
                seed_types, _ = read_existing_mod(output_path)
                if seed_types:
                    print(f"\nFound {len(seed_types)} existing types in mod.rs")

                # Process files
                generate_mod_file(frequent_types, output_path)
                new_types = set(frequent_types.keys()) - seed_types

                # Print results
                if new_types:
                    print(f"Added {len(new_types)} new types to mod.rs")
                    print("New types:", ", ".join(sorted(new_types)))
                else:
                    print("No new types to add")

            # Remove both duplicate uppercase types and lowercase types
            remove_duplicates_from_files(frequent_types, file_contents, Path(args.directory))
            if lowercase_matches:
                print(f"Removed {len(lowercase_matches)} lowercase types from original files")
            if frequent_types:
                print("Removed duplicate types from original files")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        print(f"Error: {e}")
        return 1

    return 0

if __name__ == "__main__":
    # The `exit` builtin is installed by the `site` module for interactive use
    # and is missing under `python -S`; raising SystemExit needs no import and
    # propagates main()'s return value as the process exit status.
    raise SystemExit(main())