From 45ed80e49c64da3c8c0ae70357ba068576e283d4 Mon Sep 17 00:00:00 2001
From: kevstone
Date: Fri, 16 Feb 2024 21:23:55 +0000
Subject: [PATCH] Fixed an issue parsing binary operators that include a macro

---
 src/clang_convert/accessor_level2_input_files | 65 +++++++++++++++
 .../accessor_working_input_files              |  4 +-
 src/clang_convert/ast_object/ast_code.py      |  4 +
 .../ast_object/ast_code_creator.py            | 10 +--
 src/clang_convert/ast_object/ast_parser.py    | 82 +++++++++++--------
 src/clang_convert/ast_object/ast_utils.py     | 25 +++++-
 6 files changed, 146 insertions(+), 44 deletions(-)
 create mode 100644 src/clang_convert/accessor_level2_input_files

diff --git a/src/clang_convert/accessor_level2_input_files b/src/clang_convert/accessor_level2_input_files
new file mode 100644
index 000000000..c404b4aa6
--- /dev/null
+++ b/src/clang_convert/accessor_level2_input_files
@@ -0,0 +1,65 @@
+# Start the line with # to exclude the file
+grib_accessor_class_abstract_long_vector.cc
+grib_accessor_class_ascii.cc
+grib_accessor_class_bits.cc
+grib_accessor_class_bytes.cc
+grib_accessor_class_double.cc
+grib_accessor_class_long.cc
+grib_accessor_class_proj_string.cc
+grib_accessor_class_blob.cc
+# [SLOW FAILS] grib_accessor_class_bufr_data_array.cc
+grib_accessor_class_bufr_data_element.cc
+grib_accessor_class_bufr_elements_table.cc
+grib_accessor_class_bufr_extract_area_subsets.cc
+grib_accessor_class_bufr_extract_datetime_subsets.cc
+grib_accessor_class_bufr_extract_subsets.cc
+grib_accessor_class_bufr_simple_thinning.cc
+grib_accessor_class_change_alternative_row_scanning.cc
+grib_accessor_class_change_scanning_direction.cc
+grib_accessor_class_codetable_title.cc
+grib_accessor_class_codetable_units.cc
+grib_accessor_class_concept.cc
+grib_accessor_class_data_apply_bitmap.cc
+grib_accessor_class_data_apply_boustrophedonic.cc
+grib_accessor_class_data_apply_boustrophedonic_bitmap.cc
+grib_accessor_class_data_secondary_bitmap.cc
+grib_accessor_class_data_shsimple_packing.cc
+grib_accessor_class_dictionary.cc
+grib_accessor_class_g1_half_byte_codeflag.cc
+grib_accessor_class_g2_mars_labeling.cc
+grib_accessor_class_g2step_range.cc
+grib_accessor_class_gaussian_grid_name.cc
+grib_accessor_class_gds_not_present_bitmap.cc
+grib_accessor_class_group.cc
+grib_accessor_class_hash_array.cc
+grib_accessor_class_headers_only.cc
+grib_accessor_class_ifs_param.cc
+grib_accessor_class_iterator.cc
+grib_accessor_class_label.cc
+grib_accessor_class_md5.cc
+grib_accessor_class_message_copy.cc
+grib_accessor_class_nearest.cc
+grib_accessor_class_non_alpha.cc
+grib_accessor_class_number_of_values_data_raw_packing.cc
+grib_accessor_class_pack_bufr_values.cc
+grib_accessor_class_packing_type.cc
+grib_accessor_class_position.cc
+grib_accessor_class_raw.cc
+grib_accessor_class_section.cc
+grib_accessor_class_section_pointer.cc
+grib_accessor_class_smart_table_column.cc
+grib_accessor_class_step_human_readable.cc
+grib_accessor_class_to_double.cc
+grib_accessor_class_to_integer.cc
+grib_accessor_class_to_string.cc
+grib_accessor_class_transient_darray.cc
+grib_accessor_class_uint16.cc
+grib_accessor_class_uint32.cc
+grib_accessor_class_uint32_little_endian.cc
+grib_accessor_class_uint64.cc
+grib_accessor_class_uint64_little_endian.cc
+grib_accessor_class_uint8.cc
+grib_accessor_class_unpack_bufr_values.cc
+grib_accessor_class_values.cc
+grib_accessor_class_variable.cc
+grib_accessor_class_when.cc
\ No newline at end of file
diff --git a/src/clang_convert/accessor_working_input_files b/src/clang_convert/accessor_working_input_files
index f4903ab20..4de009dd6 100644
--- a/src/clang_convert/accessor_working_input_files
+++ b/src/clang_convert/accessor_working_input_files
@@ -15,7 +15,7 @@ grib_accessor_class_double.cc
 grib_accessor_class_long.cc
 grib_accessor_class_proj_string.cc
 grib_accessor_class_blob.cc
-# [SLOW] grib_accessor_class_bufr_data_array.cc
+# [SLOW FAILS] grib_accessor_class_bufr_data_array.cc
 grib_accessor_class_bufr_data_element.cc
 grib_accessor_class_bufr_elements_table.cc
 grib_accessor_class_bufr_extract_area_subsets.cc
@@ -91,7 +91,7 @@ grib_accessor_class_count_total.cc
 grib_accessor_class_data_ccsds_packing.cc
 grib_accessor_class_data_g1secondary_bitmap.cc
 grib_accessor_class_data_g1shsimple_packing.cc
-# [SLOW] grib_accessor_class_data_g22order_packing.cc
+# [SLOW CONVERTS] grib_accessor_class_data_g22order_packing.cc
 grib_accessor_class_data_g2secondary_bitmap.cc
 grib_accessor_class_data_g2shsimple_packing.cc
 grib_accessor_class_data_png_packing.cc
diff --git a/src/clang_convert/ast_object/ast_code.py b/src/clang_convert/ast_object/ast_code.py
index 7c85a600b..68b958082 100755
--- a/src/clang_convert/ast_object/ast_code.py
+++ b/src/clang_convert/ast_object/ast_code.py
@@ -1,6 +1,8 @@
 import utils.debug as debug
 import ast_object.ast_macro_details as ast_macro_details
+import os
+import ast_object.ast_utils as ast_utils
 
 # Represents a coherent unit of code that needs to be parsed together: usually a single .cc file
 #
 
@@ -35,7 +37,9 @@ class AstCode:
         return self._macro_details
 
     def add_macro_definition(self, def_node):
+        debug.line("add_macro_definition", f"Adding MACRO DEFN spelling=[{def_node.spelling}] loc=[{os.path.basename(def_node.location.file.name)}] extent={ast_utils.node_extent(def_node)}")
         self._macro_details.add_definition(def_node)
 
     def add_macro_instantiation(self, inst_node):
+        debug.line("add_macro_instantiation", f"Adding MACRO INST spelling=[{inst_node.spelling}] loc=[{os.path.basename(inst_node.location.file.name)}] extent={ast_utils.node_extent(inst_node)}")
         self._macro_details.add_instantiation(inst_node)
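For reference (not part of this patch): libclang only reports MACRO_DEFINITION and MACRO_INSTANTIATION cursors when the translation unit is parsed with a detailed preprocessing record. A minimal, self-contained sketch of collecting the kind of nodes that end up in add_macro_definition/add_macro_instantiation above; the file name and compile arguments are placeholders:

import clang.cindex

def collect_macro_nodes(cfile):
    # Parse with the preprocessing record so macro cursors appear in the AST
    index = clang.cindex.Index.create()
    tu = index.parse(
        cfile,
        args=["-xc++", "-std=c++17"],
        options=clang.cindex.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
    )

    macro_defs, macro_insts = [], []
    for node in tu.cursor.get_children():
        if node.kind == clang.cindex.CursorKind.MACRO_DEFINITION:
            macro_defs.append(node)
        elif node.kind == clang.cindex.CursorKind.MACRO_INSTANTIATION:
            macro_insts.append(node)
    return macro_defs, macro_insts
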
diff --git a/src/clang_convert/ast_object/ast_code_creator.py b/src/clang_convert/ast_object/ast_code_creator.py
index f6e12eb9c..477365e2a 100755
--- a/src/clang_convert/ast_object/ast_code_creator.py
+++ b/src/clang_convert/ast_object/ast_code_creator.py
@@ -32,13 +32,13 @@ class AstCodeCreator:
                 # ALWAYS appear at the top of the global declaration
                 self._ast_code.add_global_function_entry(node)
             else:
-                debug.line("parse_node", f"Ignoring [no file info] node spelling=[{node.spelling}] kind=[{node.kind}]")
+                #debug.line("parse_node", f"Ignoring [no file info] node spelling=[{node.spelling}] kind=[{node.kind}]")
                 return
         elif node.kind == clang.cindex.CursorKind.MACRO_INSTANTIATION:
             if node.location.file and node.location.file.name == self._cfilepath + self._cfilename:
                 self._ast_code.add_macro_instantiation(node)
             elif node.location.file and node.location.file.name != self._cfilepath + self._cfilename:
-                debug.line("parse_node", f"Ignoring [non-local] node spelling=[{node.spelling}] file=[{os.path.basename(node.location.file.name)}]")
+                #debug.line("parse_node", f"Ignoring [non-local] node spelling=[{node.spelling}] file=[{os.path.basename(node.location.file.name)}]")
                 return
         elif node.kind == clang.cindex.CursorKind.INCLUSION_DIRECTIVE:
             pass
@@ -108,10 +108,4 @@ class AstCodeCreator:
 
         self.parse_root()
 
-        # Debug - dump macros
-        for node in self._ast_code.macro_details.def_nodes:
-            debug.line("parse", f"MACRO DEFN spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}]")
-        for node in self._ast_code.macro_details.inst_nodes:
-            debug.line("parse", f"MACRO INST spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}] extent=[{node.extent.start.line}:{node.extent.start.column} -> {node.extent.end.line}:{node.extent.end.column}]")
-
         return self._ast_code
diff --git a/src/clang_convert/ast_object/ast_parser.py b/src/clang_convert/ast_object/ast_parser.py
index d8eb20254..56b7ba895 100755
--- a/src/clang_convert/ast_object/ast_parser.py
+++ b/src/clang_convert/ast_object/ast_parser.py
@@ -73,7 +73,7 @@ class AstParser:
 
     # Note - Prefer to call this as it handles macro expansions
     def parse_ast_node(self, node):
-        debug.line("parse_ast_node", f"[{node.kind}] spelling=[{node.spelling}] type=[{node.type.spelling}] extent=[{node.extent.start.line}:{node.extent.start.column}]->[{node.extent.end.line}:{node.extent.end.column}]")
+        debug.line("parse_ast_node", f"[{node.kind}] spelling=[{node.spelling}] type=[{node.type.spelling}] extent={ast_utils.node_extent(node)}")
 
         # Handle macros
         macro_instantiation_node = self._macro_details.instantiation_node_for(node)
@@ -116,7 +116,7 @@ class AstParser:
     # =================================== Macros Convert functions [BEGIN] ===================================
 
     def parse_macro_definition(self, node):
-        debug.line("parse_macro_definition", f"MACRO spelling=[{node.spelling}] kind=[{node.kind}] extent=[{node.extent.start.line}:{node.extent.start.column} -> {node.extent.end.line}:{node.extent.end.column}]")
+        debug.line("parse_macro_definition", f"MACRO spelling=[{node.spelling}] kind=[{node.kind}] extent={ast_utils.node_extent(node)}")
         tokens = [token.spelling for token in node.get_tokens()]
         debug.line("parse_macro_definition", f"MACRO tokens=[{tokens}]")
         tokens_count = len(tokens)
@@ -211,11 +211,11 @@ class AstParser:
     # macro_node is the original macro code in the C file
     # expanded_node is the code after the pre-processor has applied the macro expansion
     def parse_macro_instantiation(self, macro_node, expanded_node):
-        debug.line("parse_macro_instantiation", f"MACRO macro_node spelling=[{macro_node.spelling}] kind=[{macro_node.kind}] extent=[{macro_node.extent.start.line}:{macro_node.extent.start.column} -> {macro_node.extent.end.line}:{macro_node.extent.end.column}]")
+        debug.line("parse_macro_instantiation", f"MACRO macro_node spelling=[{macro_node.spelling}] kind=[{macro_node.kind}] extent={ast_utils.node_extent(macro_node)}")
         debug.line("parse_macro_instantiation", f"MACRO macro_node dump:")
         ast_utils.dump_node(macro_node, 2, "truncate")
 
-        debug.line("parse_macro_instantiation", f"MACRO expanded_node spelling=[{expanded_node.spelling}] kind=[{expanded_node.kind}] extent=[{expanded_node.extent.start.line}:{expanded_node.extent.start.column} -> {expanded_node.extent.end.line}:{expanded_node.extent.end.column}]")
+        debug.line("parse_macro_instantiation", f"MACRO expanded_node spelling=[{expanded_node.spelling}] kind=[{expanded_node.kind}] extent={ast_utils.node_extent(expanded_node)}")
         debug.line("parse_macro_instantiation", f"MACRO expanded_node dump:")
         ast_utils.dump_node(expanded_node, 2, "truncate")
 
@@ -253,8 +253,8 @@ class AstParser:
 
     # Just iteratively call parse_ast_node
     def parse_COMPOUND_STMT(self, node):
-        debug.line("parse_COMPOUND_STMT", f"Dumping node for MACRO INFO:")
-        ast_utils.dump_node(node, 2)
+        #debug.line("parse_COMPOUND_STMT", f"Dumping node for MACRO INFO:")
+        #ast_utils.dump_node(node, 2)
 
         stmt_lines = compound_statement.CompoundStatement()
 
@@ -724,42 +724,60 @@ class AstParser:
         return c_unary_op
 
     def parse_BINARY_OPERATOR(self, node):
-
-        debug.line("parse_BINARY_OPERATOR", f"DEBUG NODE DUMP:")
-        ast_utils.dump_node(node)
+        #debug.line("parse_BINARY_OPERATOR", f"DEBUG NODE DUMP:")
+        #ast_utils.dump_node(node)
 
         children = list(node.get_children())
         assert len(children) == 2, f"Expected exactly two children for binary operator"
+
         left_operand, right_operand = children
+        node_tokens = list(node.get_tokens())
+        left_operand_tokens = list(left_operand.get_tokens())
+        right_operand_tokens = list(right_operand.get_tokens())
 
-        debug.line("parse_BINARY_OPERATOR", f"BINARY left_operand [{left_operand.kind}] spelling=[{left_operand.spelling}] type=[{left_operand.type.spelling}] extent=[{left_operand.extent.start.line}:{left_operand.extent.start.column}]->[{left_operand.extent.end.line}:{left_operand.extent.end.column}]")
-        debug.line("parse_BINARY_OPERATOR", f"BINARY right_operand [{right_operand.kind}] spelling=[{right_operand.spelling}] type=[{right_operand.type.spelling}] extent=[{right_operand.extent.start.line}:{right_operand.extent.start.column}]->[{right_operand.extent.end.line}:{right_operand.extent.end.column}]")
-
-        # Tokenize and find the operator
-        tokens = [token.spelling for token in node.get_tokens()]
-        left_tokens = [token.spelling for token in left_operand.get_tokens()]
-        right_tokens = [token.spelling for token in right_operand.get_tokens()]
-
-        # Find the operator by excluding operand tokens
-        tokens_count = len(tokens)
-        left_tokens_count = len(left_tokens)
-        operator_token = tokens[left_tokens_count]
+        debug.line("parse_BINARY_OPERATOR", f"Node spelling=[{node.spelling}] tokens=[{[token.spelling for token in node_tokens]}] extent={ast_utils.node_extent(node)}")
+        debug.line("parse_BINARY_OPERATOR", f"left_operand [{left_operand.kind}] spelling=[{left_operand.spelling}] tokens=[{[token.spelling for token in left_operand_tokens]}] type=[{left_operand.type.spelling}] extent={ast_utils.node_extent(left_operand)}")
+        debug.line("parse_BINARY_OPERATOR", f"right_operand [{right_operand.kind}] spelling=[{right_operand.spelling}] tokens=[{[token.spelling for token in right_operand_tokens]}] type=[{right_operand.type.spelling}] extent={ast_utils.node_extent(right_operand)}")
 
         left_operand_cvalue = self.parse_ast_node(left_operand)
-
-        right_tokens_count = len(right_tokens)
-        if tokens_count != left_tokens_count + right_tokens_count + 1:
-            # The top level tokens don't match the right_operand tokens. This will happen if the top-level
-            # contains a macro definition. We should be able to handle this, so we'll just record the fact here!
-            debug.line("parse_BINARY_OPERATOR", f"Right operand tokens don't match: assuming a macro")
 
         right_operand_cvalue = self.parse_ast_node(right_operand)
-
-        debug.line("parse_BINARY_OPERATOR", f"Create c_binary_op: left_operand_cvalue=[{debug.as_debug_string(left_operand_cvalue)}] operator_token=[{debug.as_debug_string(operator_token)}] right_operand_cvalue=[{debug.as_debug_string(right_operand_cvalue)}]")
-
         if not right_operand_cvalue:
-            return literal.Literal(f"// [Ignoring C Code] {' '.join([token.spelling for token in node.get_tokens()])}")
+            return literal.Literal(f"// [Ignoring C Code] {' '.join([token.spelling for token in node_tokens])}")
 
-        c_binary_op = binary_operation.BinaryOperation(left_operand_cvalue, operator_token, right_operand_cvalue)
+        debug.line("parse_BINARY_OPERATOR", f"left_operand_cvalue=[{debug.as_debug_string(left_operand_cvalue)}]")
+        debug.line("parse_BINARY_OPERATOR", f"right_operand_cvalue=[{debug.as_debug_string(right_operand_cvalue)}]")
+
+        # Get operator
+        operator_token = None
+
+        # Step 1: See if we have child node tokens
+        node_tokens_count = len(node_tokens)
+        left_tokens_count = len(left_operand_tokens)
+        right_tokens_count = len(right_operand_tokens)
+
+        if node_tokens_count > 0 and left_tokens_count > 0:
+            if node_tokens_count >= left_tokens_count + right_tokens_count + 1:
+                operator_token = node_tokens[left_tokens_count]
+
+        debug.line("parse_BINARY_OPERATOR", f"[Step 1] [child tokens] node_tokens_count=[{node_tokens_count}] left_tokens_count=[{left_tokens_count}] right_tokens_count=[{right_tokens_count}]")
+        debug.line("parse_BINARY_OPERATOR", f"[Step 1] [child tokens] operator_token=[{operator_token.spelling if operator_token else None}]")
+
+        if not operator_token:
+            # Step 2: Deduce it from the node tokens
+            operator_extent = clang.cindex.SourceRange.from_locations(left_operand.extent.end, right_operand.extent.start)
+            debug.line("parse_BINARY_OPERATOR", f"operator_extent=[{ast_utils.source_range_string(operator_extent)}]")
+            operator_token = ast_utils.find_token_from_extent(node_tokens, operator_extent)
+
+            debug.line("parse_BINARY_OPERATOR", f"[Step 2] [node_tokens] operator_token=[{operator_token.spelling if operator_token else None}]")
+
+            if not operator_token:
+                # Step 3: Search ALL translation unit tokens (this will be slow for large C files - may need to optimise)
+                operator_token = ast_utils.find_token_from_extent(node.translation_unit.cursor.get_tokens(), operator_extent)
+
+                debug.line("parse_BINARY_OPERATOR", f"[Step 3] [ALL tokens] operator_token=[{operator_token.spelling if operator_token else None}]")
+
+        assert operator_token
+
+        c_binary_op = binary_operation.BinaryOperation(left_operand_cvalue, operator_token.spelling, right_operand_cvalue)
 
         return c_binary_op
 
     def parse_COMPOUND_ASSIGNMENT_OPERATOR(self, node):
diff --git a/src/clang_convert/ast_object/ast_utils.py b/src/clang_convert/ast_object/ast_utils.py
index c80ba691f..c2b87974d 100755
--- a/src/clang_convert/ast_object/ast_utils.py
+++ b/src/clang_convert/ast_object/ast_utils.py
@@ -13,6 +13,17 @@ import code_object.array_access as array_access
 
 # Utilities for working with C AST Nodes
 
+# Return a string representation e.g. [117:10->117:45]
+
+def node_extent(node):
+    return f"[{node.extent.start.line}:{node.extent.start.column}->{node.extent.end.line}:{node.extent.end.column}]"
+
+def token_extent(node):
+    return f"[{node.extent.start.line}:{node.extent.start.column}->{node.extent.end.line}:{node.extent.end.column}]"
+
+def source_range_string(src_range):
+    return f"[{src_range.start.line}:{src_range.start.column}->{src_range.end.line}:{src_range.end.column}]"
+
 # tokens string can be:
 # "flat" to show a flat summary
 # "list" to show a detailed list
@@ -20,12 +31,12 @@ import code_object.array_access as array_access
 # "" to not show tokens
 def dump_node(cnode, depth=0, tokens="truncate"):
     truncate_depth = 10
-    debug.line("dump_node", f"{' ' * depth}[{depth}:{cnode.kind}] spelling=[{cnode.spelling}] type=[{cnode.type.spelling}] extent=[{cnode.extent.start.line}:{cnode.extent.start.column}]->[{cnode.extent.end.line}:{cnode.extent.end.column}]")
+    debug.line("dump_node", f"{' ' * depth}[{depth}:{cnode.kind}] spelling=[{cnode.spelling}] type=[{cnode.type.spelling}] extent={node_extent(cnode)}")
     if tokens == "flat":
         debug.line("dump_node", f"{' ' * depth} -> tokens=[{[token.spelling for token in cnode.get_tokens()]}]")
     elif tokens == "list":
         for token in cnode.get_tokens():
-            debug.line("dump_node", f"{' ' * depth} -> token=[{token.spelling}] extent=[{token.extent.start.line}:{token.extent.start.column} -> {token.extent.end.line}:{token.extent.end.column}]")
+            debug.line("dump_node", f"{' ' * depth} -> token=[{token.spelling}] extent={token_extent(token)}")
     elif tokens == "truncate":
         token_list = [token.spelling for token in cnode.get_tokens()]
         debug.line("dump_node", f"{' ' * depth} -> tokens[:{truncate_depth}]=[{token_list[:truncate_depth]}]")
@@ -34,6 +45,16 @@
     for child in cnode.get_children():
         dump_node(child, depth+1, tokens)
 
+def find_token_from_extent(tokens, extent):
+    for t in tokens:
+        if t.extent.start.line == extent.start.line and \
+           t.extent.end.line == extent.end.line and \
+           t.extent.start.column >= extent.start.column and \
+           t.extent.end.column <= extent.end.column:
+            return t
+
+    return None
+
 # Create a C FuncSig object from a FUNCTION_DECL node
 def create_cfuncsig(cnode):
     if cnode.kind == clang.cindex.CursorKind.FUNCTION_TEMPLATE:
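For reference, a small usage sketch of the helpers added to ast_utils.py above (the parsed snippet and file name are invented, and the repo's ast_object package is assumed to be on the import path):

import clang.cindex
import ast_object.ast_utils as ast_utils

code = "int add(int a, int b) { return a + b; }"
index = clang.cindex.Index.create()
tu = index.parse("example.cc", args=["-xc++"], unsaved_files=[("example.cc", code)])

for node in tu.cursor.walk_preorder():
    if node.kind == clang.cindex.CursorKind.BINARY_OPERATOR:
        left, right = list(node.get_children())
        gap = clang.cindex.SourceRange.from_locations(left.extent.end, right.extent.start)
        print(ast_utils.node_extent(node))         # e.g. [1:32->1:37]
        print(ast_utils.source_range_string(gap))  # the range between 'a' and 'b'
        tok = ast_utils.find_token_from_extent(list(node.get_tokens()), gap)
        print(tok.spelling if tok else None)       # expected: '+'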