"""Extract D-SI unit strings from plain-text XML and validate them.

Each unit is checked twice: once by constructing a ``dsiUnit`` and once
against the regular expression produced by ``generateRegex``.  Cases where
the two checks disagree are written to the log file as JSON records.
"""
import json
import logging
import re

from dsiUnits import dsiUnit
from regexGenerator import generateRegex

# Regex source produced by the project's generator, and its compiled form.
dsiregExStr = generateRegex()
dsiregExPattern = re.compile(dsiregExStr)

# All log records (DEBUG and above) go to a file next to the working directory.
logging.basicConfig(filename='unitValidationLog.log', level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')


def parse_plain_utf8_xml(xml_string):
    """Collect unit strings from ``xml_string`` by scanning it line by line.

    The text is split on ``'\\n'`` and lines are numbered from 1.  On each
    line only the first ``<si:unit>`` element and the first
    ``<si:unitXMLList>`` element are considered.

    Returns:
        A dict mapping position keys to unit strings:

        * ``"<line>"`` -> stripped content of the ``<si:unit>`` element
        * ``"<line>:<idx>"`` -> the idx-th whitespace-separated entry of the
          ``<si:unitXMLList>`` element (idx starts at 0)
    """
    single_unit_re = re.compile(r'<si:unit>(.*?)</si:unit>')
    unit_list_re = re.compile(r'<si:unitXMLList>(.*?)</si:unitXMLList>')

    found = {}
    for line_no, text in enumerate(xml_string.split('\n'), start=1):
        # A lone unit is keyed by its line number only.
        single = single_unit_re.search(text)
        if single is not None:
            found[str(line_no)] = single.group(1).strip()

        # A unit list contributes one entry per whitespace-separated item.
        listed = unit_list_re.search(text)
        if listed is None:
            continue
        entries = listed.group(1).strip().split()
        for position, entry in enumerate(entries):
            found[f"{line_no}:{position}"] = entry

    return found


def process_units(unit_dict):
    """Validate every unit string in ``unit_dict``.

    For each ``key -> value`` pair a ``dsiUnit`` is constructed from the
    value and the value is also full-matched against ``dsiregExPattern``.
    The ``dsiUnit`` result decides the classification; the regex result is
    only used to log disagreements between the two checks.

    Args:
        unit_dict: Mapping of position keys to unit strings.

    Returns:
        A ``(valid_units, invalid_units)`` tuple.  ``valid_units`` maps keys
        to the original unit string.  ``invalid_units`` maps keys to either
        ``{"unit": ..., "warnings": ...}`` (unit reported as not valid) or
        ``{"unit": ..., "error": ...}`` (an exception was raised while
        processing the entry).
    """
    accepted = {}
    rejected = {}

    def report_mismatch(message, unit_key, unit_value):
        # One JSON record per disagreement between dsiUnit and the regex.
        record = {
            "type": "Regex Error",
            "message": message,
            "key": unit_key,
            "value": unit_value,
        }
        logging.debug(json.dumps(record))

    for key, value in unit_dict.items():
        try:
            parsed = dsiUnit(value)
            regex_ok = dsiregExPattern.fullmatch(value) is not None

            if not parsed.valid:
                rejected[key] = {
                    "unit": value,
                    "warnings": parsed.warnings,
                }
                print(f"Warning: Invalid unit at {key} with value: {value}")
                if regex_ok:
                    report_mismatch(
                        "Unit parsed as invalid by dsiUnit constructor but valid by regex",
                        key,
                        value,
                    )
                continue

            # The original string is stored, not the parsed object.
            accepted[key] = value
            if not regex_ok:
                report_mismatch(
                    "Unit parsed as valid by dsiUnit constructor but invalid by regex",
                    key,
                    value,
                )
        except Exception as exc:
            # Best effort: any failure on one entry is recorded and the
            # remaining entries are still processed.
            print(f"Error processing unit at {key} with value: {value}. Error: {exc}")
            rejected[key] = {
                "unit": value,
                "error": str(exc),
            }

    return accepted, rejected


def parse_and_process(xml_string):
    """Extract the units from ``xml_string`` and validate them.

    Returns:
        A dict with the keys ``"valid_units"`` and ``"invalid_units"``,
        holding the two mappings produced by ``process_units``.
    """
    valid, invalid = process_units(parse_plain_utf8_xml(xml_string))
    return {
        "valid_units": valid,
        "invalid_units": invalid,
    }