From 45ed80e49c64da3c8c0ae70357ba068576e283d4 Mon Sep 17 00:00:00 2001
From: kevstone
Date: Fri, 16 Feb 2024 21:23:55 +0000
Subject: [PATCH] Fixed an issue parsing binary operators that include a macro

---
 src/clang_convert/accessor_level2_input_files | 65 +++++++++++++++
 .../accessor_working_input_files              |  4 +-
 src/clang_convert/ast_object/ast_code.py      |  4 +
 .../ast_object/ast_code_creator.py            | 10 +--
 src/clang_convert/ast_object/ast_parser.py    | 82 +++++++++++--------
 src/clang_convert/ast_object/ast_utils.py     | 25 +++++-
 6 files changed, 146 insertions(+), 44 deletions(-)
 create mode 100644 src/clang_convert/accessor_level2_input_files

diff --git a/src/clang_convert/accessor_level2_input_files b/src/clang_convert/accessor_level2_input_files
new file mode 100644
index 000000000..c404b4aa6
--- /dev/null
+++ b/src/clang_convert/accessor_level2_input_files
@@ -0,0 +1,65 @@
+# Start the line with # to exclude the file
+grib_accessor_class_abstract_long_vector.cc
+grib_accessor_class_ascii.cc
+grib_accessor_class_bits.cc
+grib_accessor_class_bytes.cc
+grib_accessor_class_double.cc
+grib_accessor_class_long.cc
+grib_accessor_class_proj_string.cc
+grib_accessor_class_blob.cc
+# [SLOW FAILS] grib_accessor_class_bufr_data_array.cc
+grib_accessor_class_bufr_data_element.cc
+grib_accessor_class_bufr_elements_table.cc
+grib_accessor_class_bufr_extract_area_subsets.cc
+grib_accessor_class_bufr_extract_datetime_subsets.cc
+grib_accessor_class_bufr_extract_subsets.cc
+grib_accessor_class_bufr_simple_thinning.cc
+grib_accessor_class_change_alternative_row_scanning.cc
+grib_accessor_class_change_scanning_direction.cc
+grib_accessor_class_codetable_title.cc
+grib_accessor_class_codetable_units.cc
+grib_accessor_class_concept.cc
+grib_accessor_class_data_apply_bitmap.cc
+grib_accessor_class_data_apply_boustrophedonic.cc
+grib_accessor_class_data_apply_boustrophedonic_bitmap.cc
+grib_accessor_class_data_secondary_bitmap.cc
+grib_accessor_class_data_shsimple_packing.cc
+grib_accessor_class_dictionary.cc
+grib_accessor_class_g1_half_byte_codeflag.cc
+grib_accessor_class_g2_mars_labeling.cc
+grib_accessor_class_g2step_range.cc
+grib_accessor_class_gaussian_grid_name.cc
+grib_accessor_class_gds_not_present_bitmap.cc
+grib_accessor_class_group.cc
+grib_accessor_class_hash_array.cc
+grib_accessor_class_headers_only.cc
+grib_accessor_class_ifs_param.cc
+grib_accessor_class_iterator.cc
+grib_accessor_class_label.cc
+grib_accessor_class_md5.cc
+grib_accessor_class_message_copy.cc
+grib_accessor_class_nearest.cc
+grib_accessor_class_non_alpha.cc
+grib_accessor_class_number_of_values_data_raw_packing.cc
+grib_accessor_class_pack_bufr_values.cc
+grib_accessor_class_packing_type.cc
+grib_accessor_class_position.cc
+grib_accessor_class_raw.cc
+grib_accessor_class_section.cc
+grib_accessor_class_section_pointer.cc
+grib_accessor_class_smart_table_column.cc
+grib_accessor_class_step_human_readable.cc
+grib_accessor_class_to_double.cc
+grib_accessor_class_to_integer.cc
+grib_accessor_class_to_string.cc
+grib_accessor_class_transient_darray.cc
+grib_accessor_class_uint16.cc
+grib_accessor_class_uint32.cc
+grib_accessor_class_uint32_little_endian.cc
+grib_accessor_class_uint64.cc
+grib_accessor_class_uint64_little_endian.cc
+grib_accessor_class_uint8.cc
+grib_accessor_class_unpack_bufr_values.cc
+grib_accessor_class_values.cc
+grib_accessor_class_variable.cc
+grib_accessor_class_when.cc
\ No newline at end of file
diff --git a/src/clang_convert/accessor_working_input_files b/src/clang_convert/accessor_working_input_files
index f4903ab20..4de009dd6 100644
--- a/src/clang_convert/accessor_working_input_files
+++ b/src/clang_convert/accessor_working_input_files
@@ -15,7 +15,7 @@ grib_accessor_class_double.cc
 grib_accessor_class_long.cc
 grib_accessor_class_proj_string.cc
 grib_accessor_class_blob.cc
-# [SLOW] grib_accessor_class_bufr_data_array.cc
+# [SLOW FAILS] grib_accessor_class_bufr_data_array.cc
 grib_accessor_class_bufr_data_element.cc
 grib_accessor_class_bufr_elements_table.cc
 grib_accessor_class_bufr_extract_area_subsets.cc
@@ -91,7 +91,7 @@ grib_accessor_class_count_total.cc
 grib_accessor_class_data_ccsds_packing.cc
 grib_accessor_class_data_g1secondary_bitmap.cc
 grib_accessor_class_data_g1shsimple_packing.cc
-# [SLOW] grib_accessor_class_data_g22order_packing.cc
+# [SLOW CONVERTS] grib_accessor_class_data_g22order_packing.cc
 grib_accessor_class_data_g2secondary_bitmap.cc
 grib_accessor_class_data_g2shsimple_packing.cc
 grib_accessor_class_data_png_packing.cc
diff --git a/src/clang_convert/ast_object/ast_code.py b/src/clang_convert/ast_object/ast_code.py
index 7c85a600b..68b958082 100755
--- a/src/clang_convert/ast_object/ast_code.py
+++ b/src/clang_convert/ast_object/ast_code.py
@@ -1,6 +1,8 @@
 import utils.debug as debug
 import ast_object.ast_macro_details as ast_macro_details
+import os
+import ast_object.ast_utils as ast_utils
 
 # Represents a coherent unit of code that needs to be parsed together: usually a single .cc file
 #
 
@@ -35,7 +37,9 @@ class AstCode:
         return self._macro_details
 
     def add_macro_definition(self, def_node):
+        debug.line("add_macro_definition", f"Adding MACRO DEFN spelling=[{def_node.spelling}] loc=[{os.path.basename(def_node.location.file.name)}] extent={ast_utils.node_extent(def_node)}")
         self._macro_details.add_definition(def_node)
 
     def add_macro_instantiation(self, inst_node):
+        debug.line("add_macro_instantiation", f"Adding MACRO INST spelling=[{inst_node.spelling}] loc=[{os.path.basename(inst_node.location.file.name)}] extent={ast_utils.node_extent(inst_node)}")
         self._macro_details.add_instantiation(inst_node)
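For reference (not part of this patch): libclang only reports MACRO_DEFINITION and MACRO_INSTANTIATION cursors when the translation unit is parsed with a detailed preprocessing record. A minimal, self-contained sketch of collecting the kind of nodes that end up in add_macro_definition/add_macro_instantiation above; the file name and compile arguments are placeholders:

import clang.cindex

def collect_macro_nodes(cfile):
    # Parse with the preprocessing record so macro cursors appear in the AST
    index = clang.cindex.Index.create()
    tu = index.parse(
        cfile,
        args=["-xc++", "-std=c++17"],
        options=clang.cindex.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
    )

    macro_defs, macro_insts = [], []
    for node in tu.cursor.get_children():
        if node.kind == clang.cindex.CursorKind.MACRO_DEFINITION:
            macro_defs.append(node)
        elif node.kind == clang.cindex.CursorKind.MACRO_INSTANTIATION:
            macro_insts.append(node)
    return macro_defs, macro_insts
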
diff --git a/src/clang_convert/ast_object/ast_code_creator.py b/src/clang_convert/ast_object/ast_code_creator.py
index f6e12eb9c..477365e2a 100755
--- a/src/clang_convert/ast_object/ast_code_creator.py
+++ b/src/clang_convert/ast_object/ast_code_creator.py
@@ -32,13 +32,13 @@ class AstCodeCreator:
                 # ALWAYS appear at the top of the global declaration
                 self._ast_code.add_global_function_entry(node)
             else:
-                debug.line("parse_node", f"Ignoring [no file info] node spelling=[{node.spelling}] kind=[{node.kind}]")
+                #debug.line("parse_node", f"Ignoring [no file info] node spelling=[{node.spelling}] kind=[{node.kind}]")
                 return
         elif node.kind == clang.cindex.CursorKind.MACRO_INSTANTIATION:
             if node.location.file and node.location.file.name == self._cfilepath + self._cfilename:
                 self._ast_code.add_macro_instantiation(node)
             elif node.location.file and node.location.file.name != self._cfilepath + self._cfilename:
-                debug.line("parse_node", f"Ignoring [non-local] node spelling=[{node.spelling}] file=[{os.path.basename(node.location.file.name)}]")
+                #debug.line("parse_node", f"Ignoring [non-local] node spelling=[{node.spelling}] file=[{os.path.basename(node.location.file.name)}]")
                 return
         elif node.kind == clang.cindex.CursorKind.INCLUSION_DIRECTIVE:
             pass
@@ -108,10 +108,4 @@ class AstCodeCreator:
 
         self.parse_root()
 
-        # Debug - dump macros
-        for node in self._ast_code.macro_details.def_nodes:
-            debug.line("parse", f"MACRO DEFN spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}]")
-        for node in self._ast_code.macro_details.inst_nodes:
-            debug.line("parse", f"MACRO INST spelling=[{node.spelling}] loc=[{os.path.basename(node.location.file.name)}] extent=[{node.extent.start.line}:{node.extent.start.column} -> {node.extent.end.line}:{node.extent.end.column}]")
-
         return self._ast_code
diff --git a/src/clang_convert/ast_object/ast_parser.py b/src/clang_convert/ast_object/ast_parser.py
index d8eb20254..56b7ba895 100755
--- a/src/clang_convert/ast_object/ast_parser.py
+++ b/src/clang_convert/ast_object/ast_parser.py
@@ -73,7 +73,7 @@ class AstParser:
 
     # Note - Prefer to call this as it handles macro expansions
     def parse_ast_node(self, node):
-        debug.line("parse_ast_node", f"[{node.kind}] spelling=[{node.spelling}] type=[{node.type.spelling}] extent=[{node.extent.start.line}:{node.extent.start.column}]->[{node.extent.end.line}:{node.extent.end.column}]")
+        debug.line("parse_ast_node", f"[{node.kind}] spelling=[{node.spelling}] type=[{node.type.spelling}] extent={ast_utils.node_extent(node)}")
 
         # Handle macros
         macro_instantiation_node = self._macro_details.instantiation_node_for(node)
@@ -116,7 +116,7 @@ class AstParser:
     # =================================== Macros Convert functions [BEGIN] ===================================
 
     def parse_macro_definition(self, node):
-        debug.line("parse_macro_definition", f"MACRO spelling=[{node.spelling}] kind=[{node.kind}] extent=[{node.extent.start.line}:{node.extent.start.column} -> {node.extent.end.line}:{node.extent.end.column}]")
+        debug.line("parse_macro_definition", f"MACRO spelling=[{node.spelling}] kind=[{node.kind}] extent={ast_utils.node_extent(node)}")
         tokens = [token.spelling for token in node.get_tokens()]
         debug.line("parse_macro_definition", f"MACRO tokens=[{tokens}]")
         tokens_count = len(tokens)
@@ -211,11 +211,11 @@ class AstParser:
     # macro_node is the original macro code in the C file
     # expanded_node is the code after the pre-processor has applied the macro expansion
     def parse_macro_instantiation(self, macro_node, expanded_node):
-        debug.line("parse_macro_instantiation", f"MACRO macro_node spelling=[{macro_node.spelling}] kind=[{macro_node.kind}] extent=[{macro_node.extent.start.line}:{macro_node.extent.start.column} -> {macro_node.extent.end.line}:{macro_node.extent.end.column}]")
+        debug.line("parse_macro_instantiation", f"MACRO macro_node spelling=[{macro_node.spelling}] kind=[{macro_node.kind}] extent={ast_utils.node_extent(macro_node)}")
         debug.line("parse_macro_instantiation", f"MACRO macro_node dump:")
         ast_utils.dump_node(macro_node, 2, "truncate")
 
-        debug.line("parse_macro_instantiation", f"MACRO expanded_node spelling=[{expanded_node.spelling}] kind=[{expanded_node.kind}] extent=[{expanded_node.extent.start.line}:{expanded_node.extent.start.column} -> {expanded_node.extent.end.line}:{expanded_node.extent.end.column}]")
+        debug.line("parse_macro_instantiation", f"MACRO expanded_node spelling=[{expanded_node.spelling}] kind=[{expanded_node.kind}] extent={ast_utils.node_extent(expanded_node)}")
         debug.line("parse_macro_instantiation", f"MACRO expanded_node dump:")
         ast_utils.dump_node(expanded_node, 2, "truncate")
 
@@ -253,8 +253,8 @@ class AstParser:
 
     # Just iteratively call parse_ast_node
     def parse_COMPOUND_STMT(self, node):
-        debug.line("parse_COMPOUND_STMT", f"Dumping node for MACRO INFO:")
-        ast_utils.dump_node(node, 2)
+        #debug.line("parse_COMPOUND_STMT", f"Dumping node for MACRO INFO:")
+        #ast_utils.dump_node(node, 2)
 
         stmt_lines = compound_statement.CompoundStatement()
 
@@ -724,42 +724,60 @@ class AstParser:
         return c_unary_op
 
     def parse_BINARY_OPERATOR(self, node):
-
-        debug.line("parse_BINARY_OPERATOR", f"DEBUG NODE DUMP:")
-        ast_utils.dump_node(node)
+        #debug.line("parse_BINARY_OPERATOR", f"DEBUG NODE DUMP:")
+        #ast_utils.dump_node(node)
 
         children = list(node.get_children())
         assert len(children) == 2, f"Expected exactly two children for binary operator"
+
         left_operand, right_operand = children
+        node_tokens = list(node.get_tokens())
+        left_operand_tokens = list(left_operand.get_tokens())
+        right_operand_tokens = list(right_operand.get_tokens())
 
-        debug.line("parse_BINARY_OPERATOR", f"BINARY left_operand [{left_operand.kind}] spelling=[{left_operand.spelling}] type=[{left_operand.type.spelling}] extent=[{left_operand.extent.start.line}:{left_operand.extent.start.column}]->[{left_operand.extent.end.line}:{left_operand.extent.end.column}]")
-        debug.line("parse_BINARY_OPERATOR", f"BINARY right_operand [{right_operand.kind}] spelling=[{right_operand.spelling}] type=[{right_operand.type.spelling}] extent=[{right_operand.extent.start.line}:{right_operand.extent.start.column}]->[{right_operand.extent.end.line}:{right_operand.extent.end.column}]")
-
-        # Tokenize and find the operator
-        tokens = [token.spelling for token in node.get_tokens()]
-        left_tokens = [token.spelling for token in left_operand.get_tokens()]
-        right_tokens = [token.spelling for token in right_operand.get_tokens()]
-
-        # Find the operator by excluding operand tokens
-        tokens_count = len(tokens)
-        left_tokens_count = len(left_tokens)
-        operator_token = tokens[left_tokens_count]
+        debug.line("parse_BINARY_OPERATOR", f"Node spelling=[{node.spelling}] tokens=[{[token.spelling for token in node_tokens]}] extent={ast_utils.node_extent(node)}")
+        debug.line("parse_BINARY_OPERATOR", f"left_operand [{left_operand.kind}] spelling=[{left_operand.spelling}] tokens=[{[token.spelling for token in left_operand_tokens]}] type=[{left_operand.type.spelling}] extent={ast_utils.node_extent(left_operand)}")
+        debug.line("parse_BINARY_OPERATOR", f"right_operand [{right_operand.kind}] spelling=[{right_operand.spelling}] tokens=[{[token.spelling for token in right_operand_tokens]}] type=[{right_operand.type.spelling}] extent={ast_utils.node_extent(right_operand)}")
 
         left_operand_cvalue = self.parse_ast_node(left_operand)
-
-        right_tokens_count = len(right_tokens)
-        if tokens_count != left_tokens_count + right_tokens_count + 1:
-            # The top level tokens don't match the right_operand tokens. This will happen if the top-level
-            # contains a macro definition. We should be able to handle this, so we'll just record the fact here!
-            debug.line("parse_BINARY_OPERATOR", f"Right operand tokens don't match: assuming a macro")
 
         right_operand_cvalue = self.parse_ast_node(right_operand)
-
-        debug.line("parse_BINARY_OPERATOR", f"Create c_binary_op: left_operand_cvalue=[{debug.as_debug_string(left_operand_cvalue)}] operator_token=[{debug.as_debug_string(operator_token)}] right_operand_cvalue=[{debug.as_debug_string(right_operand_cvalue)}]")
-
         if not right_operand_cvalue:
-            return literal.Literal(f"// [Ignoring C Code] {' '.join([token.spelling for token in node.get_tokens()])}")
+            return literal.Literal(f"// [Ignoring C Code] {' '.join([token.spelling for token in node_tokens])}")
 
-        c_binary_op = binary_operation.BinaryOperation(left_operand_cvalue, operator_token, right_operand_cvalue)
+        debug.line("parse_BINARY_OPERATOR", f"left_operand_cvalue=[{debug.as_debug_string(left_operand_cvalue)}]")
+        debug.line("parse_BINARY_OPERATOR", f"right_operand_cvalue=[{debug.as_debug_string(right_operand_cvalue)}]")
+
+        # Get operator
+        operator_token = None
+
+        # Step 1: See if we have child node tokens
+        node_tokens_count = len(node_tokens)
+        left_tokens_count = len(left_operand_tokens)
+        right_tokens_count = len(right_operand_tokens)
+
+        if node_tokens_count > 0 and left_tokens_count > 0:
+            if node_tokens_count >= left_tokens_count + right_tokens_count + 1:
+                operator_token = node_tokens[left_tokens_count]
+
+        debug.line("parse_BINARY_OPERATOR", f"[Step 1] [child tokens] node_tokens_count=[{node_tokens_count}] left_tokens_count=[{left_tokens_count}] right_tokens_count=[{right_tokens_count}]")
+        debug.line("parse_BINARY_OPERATOR", f"[Step 1] [child tokens] operator_token=[{operator_token.spelling if operator_token else None}]")
+
+        if not operator_token:
+            # Step 2: Deduce it from the node tokens
+            operator_extent = clang.cindex.SourceRange.from_locations(left_operand.extent.end, right_operand.extent.start)
+            debug.line("parse_BINARY_OPERATOR", f"operator_extent=[{ast_utils.source_range_string(operator_extent)}]")
+            operator_token = ast_utils.find_token_from_extent(node_tokens, operator_extent)
+
+            debug.line("parse_BINARY_OPERATOR", f"[Step 2] [node_tokens] operator_token=[{operator_token.spelling if operator_token else None}]")
+
+            if not operator_token:
+                # Step 3: Search ALL translation unit tokens (this will be slow for large C files - may need to optimise)
+                operator_token = ast_utils.find_token_from_extent(node.translation_unit.cursor.get_tokens(), operator_extent)
+
+                debug.line("parse_BINARY_OPERATOR", f"[Step 3] [ALL tokens] operator_token=[{operator_token.spelling if operator_token else None}]")
+
+        assert operator_token
+
+        c_binary_op = binary_operation.BinaryOperation(left_operand_cvalue, operator_token.spelling, right_operand_cvalue)
 
         return c_binary_op
 
     def parse_COMPOUND_ASSIGNMENT_OPERATOR(self, node):
diff --git a/src/clang_convert/ast_object/ast_utils.py b/src/clang_convert/ast_object/ast_utils.py
index c80ba691f..c2b87974d 100755
--- a/src/clang_convert/ast_object/ast_utils.py
+++ b/src/clang_convert/ast_object/ast_utils.py
@@ -13,6 +13,17 @@ import code_object.array_access as array_access
 
 # Utilities for working with C AST Nodes
 
+# Return a string representation e.g. [117:10->117:45]
+
+def node_extent(node):
+    return f"[{node.extent.start.line}:{node.extent.start.column}->{node.extent.end.line}:{node.extent.end.column}]"
+
+def token_extent(node):
+    return f"[{node.extent.start.line}:{node.extent.start.column}->{node.extent.end.line}:{node.extent.end.column}]"
+
+def source_range_string(src_range):
+    return f"[{src_range.start.line}:{src_range.start.column}->{src_range.end.line}:{src_range.end.column}]"
+
 # tokens string can be:
 # "flat" to show a flat summary
 # "list" to show a detailed list
@@ -20,12 +31,12 @@ import code_object.array_access as array_access
 # "" to not show tokens
 def dump_node(cnode, depth=0, tokens="truncate"):
     truncate_depth = 10
-    debug.line("dump_node", f"{' ' * depth}[{depth}:{cnode.kind}] spelling=[{cnode.spelling}] type=[{cnode.type.spelling}] extent=[{cnode.extent.start.line}:{cnode.extent.start.column}]->[{cnode.extent.end.line}:{cnode.extent.end.column}]")
+    debug.line("dump_node", f"{' ' * depth}[{depth}:{cnode.kind}] spelling=[{cnode.spelling}] type=[{cnode.type.spelling}] extent={node_extent(cnode)}")
     if tokens == "flat":
         debug.line("dump_node", f"{' ' * depth} -> tokens=[{[token.spelling for token in cnode.get_tokens()]}]")
     elif tokens == "list":
         for token in cnode.get_tokens():
-            debug.line("dump_node", f"{' ' * depth} -> token=[{token.spelling}] extent=[{token.extent.start.line}:{token.extent.start.column} -> {token.extent.end.line}:{token.extent.end.column}]")
+            debug.line("dump_node", f"{' ' * depth} -> token=[{token.spelling}] extent={token_extent(token)}")
     elif tokens == "truncate":
         token_list = [token.spelling for token in cnode.get_tokens()]
         debug.line("dump_node", f"{' ' * depth} -> tokens[:{truncate_depth}]=[{token_list[:truncate_depth]}]")
@@ -34,6 +45,16 @@
     for child in cnode.get_children():
         dump_node(child, depth+1, tokens)
 
+def find_token_from_extent(tokens, extent):
+    for t in tokens:
+        if t.extent.start.line == extent.start.line and \
+           t.extent.end.line == extent.end.line and \
+           t.extent.start.column >= extent.start.column and \
+           t.extent.end.column <= extent.end.column:
+            return t
+
+    return None
+
 # Create a C FuncSig object from a FUNCTION_DECL node
 def create_cfuncsig(cnode):
     if cnode.kind == clang.cindex.CursorKind.FUNCTION_TEMPLATE:
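For reference, a small usage sketch of the helpers added to ast_utils.py above (the parsed snippet and file name are invented, and the repo's ast_object package is assumed to be on the import path):

import clang.cindex
import ast_object.ast_utils as ast_utils

code = "int add(int a, int b) { return a + b; }"
index = clang.cindex.Index.create()
tu = index.parse("example.cc", args=["-xc++"], unsaved_files=[("example.cc", code)])

for node in tu.cursor.walk_preorder():
    if node.kind == clang.cindex.CursorKind.BINARY_OPERATOR:
        left, right = list(node.get_children())
        gap = clang.cindex.SourceRange.from_locations(left.extent.end, right.extent.start)
        print(ast_utils.node_extent(node))         # e.g. [1:32->1:37]
        print(ast_utils.source_range_string(gap))  # the range between 'a' and 'b'
        tok = ast_utils.find_token_from_extent(list(node.get_tokens()), gap)
        print(tok.spelling if tok else None)       # expected: '+'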