secure-software-engineering · rashidabhar · Oct 18, 2023 · Oct 20, 2023 · Oct 20, 2023 · Oct 20, 2023
diff --git a/.gitignore b/.gitignore
@@ -180,4 +180,4 @@ src/target_tools/ollama/src/fine_tuning/wandb/*
 src/target_tools/ollama/src/fine_tuning/outputs/*
 
 # Ignore autogen files
-autogen/data
+autogen/data
diff --git a/src/result_analyzer/analysis_utils.py b/src/result_analyzer/analysis_utils.py
@@ -111,7 +111,7 @@ def format_type(_types, is_ml=False):
         for _type in _types:
             i_type_list = []
             if is_ml:
-                if _type.startswith("Union["):
+                if is_ml and _type.startswith("Union["):
                     # TODO: Improve code, should not lower() for all. e.g., MyClass
                     types_split = [
                         x.replace(" ", "").lower()
@@ -124,15 +124,31 @@ def format_type(_types, is_ml=False):
                     # i_type_list.append(_t.split("[")[0].lower())
             else:
                 for _t in _type:
-                    if _t.startswith("Union["):
+                    if _t and _t.startswith("Union["):
                         types_split = [
                             x.replace(" ", "").lower()
                             for x in _t.split("Union[")[1].split("]")[0].split(",")
                         ]
                         i_type_list.extend(types_split)
+                    elif _t and _t.startswith("Optional["):
+                        types_split = [
+                            x.replace(" ", "").lower()
+                            for x in _t.split("Optional[")[1].split("]")[0].split(",")
+                        ]
+                        types_split.append("Nonetype")
+                        i_type_list.extend(types_split)
+                    elif _t and _t.startswith("Type["):
+                        types_split = [
+                            x.replace(" ", "").lower()
+                            for x in _t.split("Type[")[1].split("]")[0].split(",")
+                        ]
+                        i_type_list.extend(types_split)
+                    elif _t and _t in ["None", "Unknown"]:
+                        i_type_list.append("Nonetype")
                     else:
                         # TODO: Maybe no translation should be done here
-                        i_type_list.append(_t.lower())
+                        if _t:
+                            i_type_list.append(_t.lower())
                         # i_type_list.append(_t.split("[")[0].lower())
             type_formatted.append(list(set(i_type_list)))
 
@@ -176,10 +192,14 @@ def check_match(
     if expected.get("file") != out.get("file"):
         return False
 
-    # check if line_number match
+    # # check if line_number match
     if expected.get("line_number") != out.get("line_number"):
         return False
 
+    # if "col_offset" in expected and "col_offset" in out:
+    if expected["col_offset"] != out["col_offset"]:
+        return False
+
     if "col_offset" in expected and "col_offset" in out:
         if expected["col_offset"] != out["col_offset"]:
             return False
@@ -658,3 +678,97 @@ def benchmark_count(benchmark_path):
         _a, _functions, _params, _variables = get_fact_stats(json_files)
         total_result.append([cat, _a, _functions, _params, _variables])
     return total_result
+
+
+def normalize_type(type_str, nested_level=0):
+    """
+    Normalize a type string: drop the 'builtins.'/'typing.' prefixes, map common
+    aliases to one name and reduce a dotted name to its last segment.
+    Parameters of a generic nested inside another generic collapse to 'Any'.
+    Example: 'builtins.str' -> 'str',
+             'typing.Tuple[builtins.str, builtins.float]' -> 'Tuple[str, float]',
+             'musictaxonomy.spotify.models.SpotifyUser' -> 'SpotifyUser',
+             'List[List[Tuple[str]]]' -> 'List[List[Any]]',
+             'Tuple[int, ...]' -> 'Tuple[int, ...]'.
+
+    `nested_level` is the generic nesting depth of `type_str` (0 at top level).
+    Returns the normalized string, or None when `type_str` is None.
+    """
+
+    if type_str is None:
+        return None
+
+    # Remove extra quotes if present
+    if type_str.startswith('"') and type_str.endswith('"'):
+        type_str = type_str.strip('"')
+
+    # Mapping of module prefixes to remove
+    type_mappings = {
+        "builtins.": "",
+        "typing.": "",
+    }
+    # Additional type mappings, applied only when the whole string matches
+    additional_type_mappings = {
+        "integer": "int",
+        "string": "str",
+        "dictonary": "dict",
+        "method": "Callable",
+        "func": "Callable",
+        "function": "Callable",
+        "none": "None",
+        "Nonetype": "None",
+        "nonetype": "None",
+        "NoneType": "None",
+        "Text": "str",
+    }
+
+    # Replace module prefixes
+    for prefix, replacement in type_mappings.items():
+        type_str = type_str.replace(prefix, replacement)
+
+    # Apply additional type mappings
+    type_str = additional_type_mappings.get(type_str, type_str)
+
+    # Handle generic types (e.g., Tuple[], List[], Dict[])
+    if "[" in type_str and "]" in type_str:
+        base_type, generic_content = type_str.split("[", 1)
+        generic_content = generic_content.rsplit("]", 1)[0]
+        # Inside another generic the parameters collapse to a single Any
+        if nested_level > 0:
+            return f"{base_type}[Any]"
+
+        # Split the parameters on top-level commas only
+        generic_params = []
+        bracket_level = 0
+        param = ""
+        for char in generic_content:
+            if char == "," and bracket_level == 0:
+                generic_params.append(param.strip())
+                param = ""
+                continue
+            if char == "[":
+                bracket_level += 1
+            elif char == "]":
+                bracket_level -= 1
+            param += char
+        if param:
+            generic_params.append(param.strip())
+        normalized_params = [
+            normalize_type(param, nested_level + 1) for param in generic_params
+        ]
+        return f"{base_type}[{', '.join(normalized_params)}]"
+
+    # Reduce a dotted name to its last segment; the "..." literal is kept
+    # because splitting it on "." would return an empty string.
+    if "." in type_str and type_str != "...":
+        return type_str.split(".")[-1]
+
+    # Return the simplified type
+    return type_str
+
+
+def normalize_types(types):
+    """Apply `normalize_type` to every entry of `types` and return a list."""
+    # Each entry is normalized with the default (top-level) nesting depth.
+    normalized = map(normalize_type, types)
+    return list(normalized)