From 2f025dea0597b8b60b24256d264f01c9e9f3cdb9 Mon Sep 17 00:00:00 2001 From: kevstone Date: Thu, 1 Feb 2024 07:53:00 +0000 Subject: [PATCH] Macro parsing (finally) working! --- .../ast_object/ast_code_creator.py | 25 +++- src/clang_convert/ast_object/ast_parser.py | 125 ++++++++++++++++-- src/clang_convert/ast_object/ast_utils.py | 8 +- .../code_object/macro_instantation.py | 31 ++++- .../macro_instantiation_converter.py | 8 +- 5 files changed, 169 insertions(+), 28 deletions(-) diff --git a/src/clang_convert/ast_object/ast_code_creator.py b/src/clang_convert/ast_object/ast_code_creator.py index 6daed0b73..2b24294c1 100755 --- a/src/clang_convert/ast_object/ast_code_creator.py +++ b/src/clang_convert/ast_object/ast_code_creator.py @@ -77,10 +77,25 @@ class AstCodeCreator: parse_options = clang.cindex.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD debug.line("parse", f"MACRO generation enabled [PARSE_DETAILED_PROCESSING_RECORD]") - self._translation_unit = index.parse(self._cfilepath + self._cfilename, - args=self._parse_args, - unsaved_files=None, - options=parse_options) + # TEST - Set True to load file into memory (to allow macro manipulation etc) + load_file_into_memory = False + + if load_file_into_memory: + with open(self._cfilepath + self._cfilename, 'r') as file: + file_contents = file.read() + + debug.line("FILE CONTENTS", file_contents) + + unsaved_file = ('in_memory_file.c', file_contents) + self._translation_unit = index.parse('in_memory_file.c', + args=self._parse_args, + unsaved_files=[unsaved_file], + options=parse_options) + else: + self._translation_unit = index.parse(self._cfilepath + self._cfilename, + args=self._parse_args, + unsaved_files=None, + options=parse_options) self._ast_code = ast_code.AstCode(self._cfilename) @@ -90,6 +105,6 @@ class AstCodeCreator: for node in self._ast_code.macro_details.def_nodes: debug.line("parse", f"MACRO DEFN spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}]") for node in self._ast_code.macro_details.inst_nodes: - debug.line("parse", f"MACRO INST spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}]") + debug.line("parse", f"MACRO INST spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}] extent=[{node.extent.start.line}:{node.extent.start.column} -> {node.extent.end.line}:{node.extent.end.column}]") return self._ast_code diff --git a/src/clang_convert/ast_object/ast_parser.py b/src/clang_convert/ast_object/ast_parser.py index 227b95a15..c8a2cdfea 100755 --- a/src/clang_convert/ast_object/ast_parser.py +++ b/src/clang_convert/ast_object/ast_parser.py @@ -61,10 +61,12 @@ class AstParser: # Note - Prefer to call this as it handles macro expansions def parse_ast_node(self, node): + debug.line("parse_ast_node", f"[{node.kind}] spelling=[{node.spelling}] type=[{node.type.spelling}] extent=[{node.extent.start.line}:{node.extent.start.column}]->[{node.extent.end.line}:{node.extent.end.column}]") + # Handle macros macro_instantiation_node = self._macro_details.instantiation_node_for(node) if macro_instantiation_node: - return self.parse_macro_instantiation(macro_instantiation_node) + return self.parse_macro_instantiation(macro_instantiation_node, node) if node.kind.is_statement(): return self.parse_STMT_node(node) @@ -131,13 +133,104 @@ class AstParser: return macro_def - def parse_macro_instantiation(self, node): - debug.line("parse_macro_instantiation", f"MACRO spelling=[{node.spelling}] kind=[{node.kind}] extent=[{node.extent.start.line}:{node.extent.start.column} -> 
{node.extent.end.line}:{node.extent.end.column}]") - token_text = "".join([token.spelling for token in node.get_tokens()]) + ";" - debug.line("parse_macro_instantiation", f"MACRO token text=[{token_text}]") - macro_inst = macro_instantation.MacroInstantation() - macro_inst.add_line(token_text) + def find_root_macro_node(self, node): + for child in node.get_children(): + child_tokens = [token.spelling for token in child.get_tokens()] + if len(child_tokens) > 0: + return child + + node_with_tokens = self.find_root_macro_node(child) + if node_with_tokens: + return node_with_tokens + + return None + + # Attempt to match as many tokens as possible in the macro node hierarchy. + # The node with the highest number of matches will be selected + # Returns the matched node (or None) and the number of tokens matched + def match_tokens(self, tokens, node, match_count): + new_tokens = [token.spelling for token in node.get_tokens()] + tokens_count = len(tokens) + new_node = None + new_tokens_count = len(new_tokens) + tokens_to_match = min(tokens_count, new_tokens_count) + + best_match_count = 0 + for i in range(tokens_to_match): + if new_tokens[i] == tokens[i]: + best_match_count += 1 + else: + break + + if best_match_count > 0: + new_node = node + + if best_match_count != tokens_count: + for child in node.get_children(): + child_node, child_match_count = self.match_tokens(tokens, child, best_match_count) + if child_match_count > best_match_count: + best_match_count = child_match_count + new_node = child_node + break + + if new_node: + debug.line("match_tokens", f"new_node [{new_node.kind}] spelling=[{new_node.spelling}] best_match_count=[{best_match_count}]") + + return new_node, best_match_count + + # Convert all the tokens of the macro, either by finding the appropriate child node and parsing it (match_tokens function), + # or creating a literal representation of the token. 
+ # Returns the code_object (or None) and remaining (unmatched) tokens + def convert_tokens(self, tokens, root_expanded_node): + debug.line("convert_tokens", f"[IN] tokens=[{tokens}] root_expanded_node kind={root_expanded_node.kind} spelling=[{root_expanded_node.spelling}]") + + matched_node, match_count = self.match_tokens(tokens, root_expanded_node, 0) + if matched_node: + converted_tokens = self.parse_ast_node(matched_node) + debug.line("convert_tokens", f" MATCH - converted_tokens=[{converted_tokens.as_string()}] tokens=[{tokens}] match_count=[{match_count}]") + return converted_tokens, tokens[match_count:] + else: + converted_token = literal.Literal(tokens.pop(0)) + debug.line("convert_tokens", f" NO MATCH - converted_token=[{converted_token.as_string()}] tokens=[{tokens}]") + return converted_token, tokens + + + # macro_node is the original macro code in the C file + # expanded_node is the code after the pre-processor has applied the macro expansion + def parse_macro_instantiation(self, macro_node, expanded_node): + debug.line("parse_macro_instantiation", f"MACRO macro_node spelling=[{macro_node.spelling}] kind=[{macro_node.kind}] extent=[{macro_node.extent.start.line}:{macro_node.extent.start.column} -> {macro_node.extent.end.line}:{macro_node.extent.end.column}]") + debug.line("parse_macro_instantiation", f"MACRO macro_node dump:") + ast_utils.dump_node(macro_node, 2, "truncate") + + debug.line("parse_macro_instantiation", f"MACRO expanded_node spelling=[{expanded_node.spelling}] kind=[{expanded_node.kind}] extent=[{expanded_node.extent.start.line}:{expanded_node.extent.start.column} -> {expanded_node.extent.end.line}:{expanded_node.extent.end.column}]") + debug.line("parse_macro_instantiation", f"MACRO expanded_node dump:") + ast_utils.dump_node(expanded_node, 2, "truncate") + + macro_node_tokens = [token.spelling for token in macro_node.get_tokens()] + + root_expanded_node = self.find_root_macro_node(expanded_node) + + if not root_expanded_node: + debug.line("parse_macro_instantiation", f"Could not find root_expanded_node, treating macro_node contents as a literal") + return literal.Literal(f"{' '.join(t for t in macro_node_tokens)}") + + macro_name = macro_node_tokens.pop(0) + macro_expression = code_objects.CodeObjects() + debug.line("parse_macro_instantiation", f"Found root_expanded_node, kind=[{root_expanded_node.kind}], parsing...") + + # Remove the opening paren so it doesn't cause a double-parse of the top-level node (leading to odd results!) 
+ if macro_node_tokens[0] == "(": + open_parens_literal = literal.Literal(macro_node_tokens.pop(0)) + macro_expression.add_code_object(open_parens_literal) + + while len(macro_node_tokens) > 0: + converted_node, macro_node_tokens = self.convert_tokens(macro_node_tokens, root_expanded_node) + debug.line("parse_macro_instantiation", f"converted_node=[{debug.as_debug_string(converted_node)}]") + macro_expression.add_code_object(converted_node) + + macro_inst = macro_instantation.MacroInstantation(macro_name, macro_expression) + debug.line("parse_macro_instantiation", f"FINAL MACRO INST=[{macro_inst.as_string()}]") return macro_inst # =================================== Macros Convert functions [END] =================================== @@ -146,6 +239,10 @@ class AstParser: # Just iteratively call parse_ast_node def parse_COMPOUND_STMT(self, node): + + debug.line("parse_COMPOUND_STMT", f"Dumping node for MACRO INFO:") + ast_utils.dump_node(node, 2) + stmt_lines = compound_statement.CompoundStatement() for child in node.get_children(): @@ -419,6 +516,9 @@ class AstParser: assert len(children) == 2, f"Expected exactly two children for binary operator" left_operand, right_operand = children + debug.line("parse_BINARY_OPERATOR", f"BINARY left_operand [{left_operand.kind}] spelling=[{left_operand.spelling}] type=[{left_operand.type.spelling}] extent=[{left_operand.extent.start.line}:{left_operand.extent.start.column}]->[{left_operand.extent.end.line}:{left_operand.extent.end.column}]") + debug.line("parse_BINARY_OPERATOR", f"BINARY right_operand [{right_operand.kind}] spelling=[{right_operand.spelling}] type=[{right_operand.type.spelling}] extent=[{right_operand.extent.start.line}:{right_operand.extent.start.column}]->[{right_operand.extent.end.line}:{right_operand.extent.end.column}]") + # Tokenize and find the operator tokens = [token.spelling for token in node.get_tokens()] left_tokens = [token.spelling for token in left_operand.get_tokens()] @@ -433,12 +533,16 @@ class AstParser: right_tokens_count = len(right_tokens) if tokens_count != left_tokens_count + right_tokens_count + 1: - # The top level tokens don't match the right_operand tokens. This can happen if the top-level - # contains (for example) a macro definition. + # The top level tokens don't match the right_operand tokens. This will happen if the top-level + # contains a macro definition. We should be able to handle this, so we'll just record the fact here! 
+ debug.line("parse_BINARY_OPERATOR", f"Right operand tokens don't match: assuming a macro") + right_operand_cvalue = self.parse_ast_node(right_operand) + + ''' # For now we'll just take the top-level tokens as a (string) literal top_level_right_tokens = tokens[left_tokens_count+1:] right_operand_cvalue = literal.Literal(f"{' '.join(t for t in top_level_right_tokens)}") - ''' + debug.line("parse_BINARY_OPERATOR", f"Right operand tokens don't match: using top-level tokens, treating as a literal") debug.line("parse_BINARY_OPERATOR", f" -> tokens = [{tokens}]") debug.line("parse_BINARY_OPERATOR", f" -> left_tokens = [{left_tokens}]") @@ -446,6 +550,7 @@ class AstParser: debug.line("parse_BINARY_OPERATOR", f" -> right_tokens = [{right_tokens}]") debug.line("parse_BINARY_OPERATOR", f" -> top_level_right_tokens = [{top_level_right_tokens}]") ''' + else: right_operand_cvalue = self.parse_ast_node(right_operand) diff --git a/src/clang_convert/ast_object/ast_utils.py b/src/clang_convert/ast_object/ast_utils.py index 72c932432..a0e184617 100755 --- a/src/clang_convert/ast_object/ast_utils.py +++ b/src/clang_convert/ast_object/ast_utils.py @@ -13,14 +13,20 @@ import code_object.struct_arg as struct_arg # tokens string can be: # "flat" to show a flat summary # "list" to show a detailed list +# "truncate" to show the first 10 tokens # "" to not show tokens -def dump_node(cnode, depth=0, tokens="flat"): +def dump_node(cnode, depth=0, tokens="truncate"): + truncate_depth = 10 debug.line("dump_node", f"{' ' * depth}[{cnode.kind}] spelling=[{cnode.spelling}] type=[{cnode.type.spelling}] extent=[{cnode.extent.start.line}:{cnode.extent.start.column}]->[{cnode.extent.end.line}:{cnode.extent.end.column}]") if tokens == "flat": debug.line("dump_node", f"{' ' * depth} -> tokens=[{[token.spelling for token in cnode.get_tokens()]}]") elif tokens == "list": for token in cnode.get_tokens(): debug.line("dump_node", f"{' ' * depth} -> token=[{token.spelling}] extent=[{token.extent.start.line}:{token.extent.start.column} -> {token.extent.end.line}:{token.extent.end.column}]") + elif tokens == "truncate": + token_list = [token.spelling for token in cnode.get_tokens()] + debug.line("dump_node", f"{' ' * depth} -> tokens[:{truncate_depth}]=[{token_list[:truncate_depth]}]") + for child in cnode.get_children(): dump_node(child, depth+1, tokens) diff --git a/src/clang_convert/code_object/macro_instantation.py b/src/clang_convert/code_object/macro_instantation.py index 06f921df9..ad35e90fc 100755 --- a/src/clang_convert/code_object/macro_instantation.py +++ b/src/clang_convert/code_object/macro_instantation.py @@ -1,12 +1,29 @@ -import code_object.macro_definition as macro_definition +import code_object.code_interface as code_interface +import code_object.code_objects as code_objects # Represents a macro instantiation, for example # BAR(42) # Assert(x != 4) # -# Can create with a single line, or multiple lines, of code (as strings) -# -# For now, derive from macro_definition as the (current) implementation is identical! 
-class MacroInstantation(macro_definition.MacroDefinition): - def __init__(self, code=None): - super().__init__(code) +# name (str) is the Macro name +# expression (CodeObjects) is the contents of the Macro call (inside the parens) +class MacroInstantation(code_interface.CodeInterface): + def __init__(self, name, expression): + self._name = name + self._expression = expression + assert isinstance(self._name, str), f"Name must be a str, not [{type(self._name).__name__}]" + assert isinstance(self._expression, code_objects.CodeObjects), f"Expression must be a CodeObjects class, not [{type(self._expression).__name__}]" + + @property + def name(self): + return self._name + + @property + def expression(self): + return self._expression + + def as_lines(self): + text = f"{self._name}{self._expression.as_string()};" + # Ensure it is all on one line! + text = text.replace("\n", "") + return [text] diff --git a/src/clang_convert/code_object_converter/macro_instantiation_converter.py b/src/clang_convert/code_object_converter/macro_instantiation_converter.py index c0e699d8f..8fe2a190c 100755 --- a/src/clang_convert/code_object_converter/macro_instantiation_converter.py +++ b/src/clang_convert/code_object_converter/macro_instantiation_converter.py @@ -10,9 +10,7 @@ class MacroInstantiationConverter(code_interface_converter.CodeInterfaceConverte assert isinstance(ccode_object, macro_instantation.MacroInstantation), f"Expected MacroInstantation, got type=[{type(ccode_object)}]" def create_cpp_code_object(self, conversion_pack): - # For now, just return a copy or comment out if not wanted... + cpp_macro = self._ccode_object + cpp_macro_expression = conversion_funcs.convert_ccode_object(cpp_macro.expression, conversion_pack) - if self._ccode_object.as_string().startswith("Assert"): - return conversion_funcs.as_commented_out_code(self._ccode_object, f"Ignoring Assert (for now!)") - - return macro_instantation.MacroInstantation(self._ccode_object.as_lines()) + return macro_instantation.MacroInstantation(cpp_macro.name, cpp_macro_expression)
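
Illustration (not part of the patch): a minimal, self-contained sketch of the greedy token-matching approach that match_tokens()/convert_tokens() in ast_parser.py implement above. FakeNode, FOO and foo_impl are made-up stand-ins for this example only; the real code walks clang.cindex cursors and emits code_object instances, but the prefix-matching and literal-fallback logic has the same shape.

# Illustrative only: a toy stand-in for a clang.cindex cursor, just enough to
# exercise the prefix-matching idea without libclang or a C translation unit.
class FakeNode:
    def __init__(self, spelling, tokens, children=None):
        self.spelling = spelling
        self._tokens = list(tokens)
        self._children = children or []

    def get_tokens(self):
        return list(self._tokens)

    def get_children(self):
        return list(self._children)


def match_tokens(tokens, node):
    # Count how many leading tokens this node shares with the macro tokens.
    count = 0
    for macro_tok, node_tok in zip(tokens, node.get_tokens()):
        if macro_tok != node_tok:
            break
        count += 1

    best_node, best_count = (node, count) if count > 0 else (None, 0)

    # If this node did not account for every token, a child may match a longer
    # prefix (e.g. the expanded argument expression of the macro).
    if best_count != len(tokens):
        for child in node.get_children():
            child_node, child_count = match_tokens(tokens, child)
            if child_count > best_count:
                best_node, best_count = child_node, child_count
                break

    return best_node, best_count


def convert_tokens(tokens, root):
    # Greedy loop: consume matched prefixes as "parsed" nodes and fall back to
    # emitting the first token verbatim (a literal) when nothing matches.
    tokens = list(tokens)
    parts = []
    while tokens:
        matched, count = match_tokens(tokens, root)
        if matched:
            parts.append(f"<parsed:{matched.spelling}>")
            tokens = tokens[count:]
        else:
            parts.append(tokens.pop(0))
    return parts


if __name__ == "__main__":
    # Roughly models FOO(x + 1) expanding to foo_impl(x + 1): the argument
    # expression survives as a child node with its own token stream.
    arg = FakeNode("x + 1", ["x", "+", "1"])
    call = FakeNode("foo_impl", ["foo_impl", "(", "x", "+", "1", ")"], [arg])

    print(convert_tokens(["FOO", "(", "x", "+", "1", ")"], call))

Running the example prints ['FOO', '(', '<parsed:x + 1>', ')']: the macro name and parentheses survive as literals, while the argument expression is replaced by the node matched in the expanded AST, which mirrors how parse_macro_instantiation builds its MacroInstantation from a name plus a CodeObjects expression.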