diff --git a/.gitignore b/.gitignore index 66c36f1b4..7ac4e0e97 100644 --- a/.gitignore +++ b/.gitignore @@ -180,4 +180,4 @@ src/target_tools/ollama/src/fine_tuning/wandb/* src/target_tools/ollama/src/fine_tuning/outputs/* # Ignore autogen files -autogen/data \ No newline at end of file +autogen/data diff --git a/src/result_analyzer/analysis_utils.py b/src/result_analyzer/analysis_utils.py index 26993b803..4571512d2 100644 --- a/src/result_analyzer/analysis_utils.py +++ b/src/result_analyzer/analysis_utils.py @@ -111,7 +111,7 @@ def format_type(_types, is_ml=False): for _type in _types: i_type_list = [] if is_ml: - if _type.startswith("Union["): + if is_ml and _type.startswith("Union["): # TODO: Improve code, should not lower() for all. e.g., MyClass types_split = [ x.replace(" ", "").lower() @@ -124,15 +124,31 @@ def format_type(_types, is_ml=False): # i_type_list.append(_t.split("[")[0].lower()) else: for _t in _type: - if _t.startswith("Union["): + if _t and _t.startswith("Union["): types_split = [ x.replace(" ", "").lower() for x in _t.split("Union[")[1].split("]")[0].split(",") ] i_type_list.extend(types_split) + elif _t and _t.startswith("Optional["): + types_split = [ + x.replace(" ", "").lower() + for x in _t.split("Optional[")[1].split("]")[0].split(",") + ] + types_split.append("Nonetype") + i_type_list.extend(types_split) + elif _t and _t.startswith("Type["): + types_split = [ + x.replace(" ", "").lower() + for x in _t.split("Type[")[1].split("]")[0].split(",") + ] + i_type_list.extend(types_split) + elif _t and _t in ["None", "Unknown"]: + i_type_list.append("Nonetype") else: # TODO: Maybe no translation should be done here - i_type_list.append(_t.lower()) + if _t: + i_type_list.append(_t.lower()) # i_type_list.append(_t.split("[")[0].lower()) type_formatted.append(list(set(i_type_list))) @@ -176,10 +192,14 @@ def check_match( if expected.get("file") != out.get("file"): return False - # check if line_number match + # # check if line_number match if expected.get("line_number") != out.get("line_number"): return False + # if "col_offset" in expected and "col_offset" in out: + if expected["col_offset"] != out["col_offset"]: + return False + if "col_offset" in expected and "col_offset" in out: if expected["col_offset"] != out["col_offset"]: return False @@ -658,3 +678,97 @@ def benchmark_count(benchmark_path): _a, _functions, _params, _variables = get_fact_stats(json_files) total_result.append([cat, _a, _functions, _params, _variables]) return total_result + + +def normalize_type(type_str, nested_level=0): + """ + Normalize the type string by removing module prefixes and simplifying typing constructs. + Example: 'builtins.str' -> 'str', + 'typing.Tuple[builtins.str, builtins.float]' -> 'Tuple[str, float]', + 'musictaxonomy.spotify.models.spotifyuser' -> 'SpotifyUser', + 'List[List[Tuple[str]]]' -> 'List[List[Any]]' if nested level > 2. 
+ """ + + if type_str is None: + return None + + # Remove extra quotes if present + if type_str.startswith('"') and type_str.endswith('"'): + type_str = type_str.strip('"') + + # Mapping of module prefixes to remove + type_mappings = { + "builtins.": "", + "typing.": "", + } + # Additional type mappings + additional_type_mappings = { + "integer": "int", + "string": "str", + "dictonary": "dict", + "method": "Callable", + "func": "Callable", + "function": "Callable", + "none": "None", + "Nonetype": "None", + "nonetype": "None", + "NoneType": "None", + "Text": "str", + } + + if type_str is None: + return None + + # Replace module prefixes + for prefix, replacement in type_mappings.items(): + type_str = type_str.replace(prefix, replacement) + + # Apply additional type mappings + type_str = additional_type_mappings.get(type_str, type_str) + + # Handle generic types (e.g., Tuple[], List[], Dict[]) + if "[" in type_str and "]" in type_str: + base_type, generic_content = type_str.split("[", 1) + generic_content = generic_content.rsplit("]", 1)[0] + # Process the generic parameters recursively + generic_params = [] + bracket_level = 0 + param = "" + for char in generic_content: + if char == "[": + bracket_level += 1 + param += char + elif char == "]": + bracket_level -= 1 + param += char + elif char == "," and bracket_level == 0: + generic_params.append(param.strip()) + param = "" + else: + param += char + if param: + generic_params.append(param.strip()) + + # If nested level is greater than 0, replace with Any + if nested_level > 0: + normalized_params = ["Any"] + else: + normalized_params = [ + normalize_type(param, nested_level + 1) for param in generic_params + ] + + return f"{base_type}[{', '.join(normalized_params)}]" + + # Handle fully qualified names by extracting the last segment + if "." in type_str: + return type_str.split(".")[-1] + + # Return the simplified type + return type_str + + +def normalize_types(types): + """ + Normalize the type strings in the data. 
+ """ + return [normalize_type(type_str) for type_str in types] diff --git a/src/result_analyzer/large_scale_analysis.py b/src/result_analyzer/large_scale_analysis.py index 236357803..aede1cdf0 100644 --- a/src/result_analyzer/large_scale_analysis.py +++ b/src/result_analyzer/large_scale_analysis.py @@ -7,13 +7,16 @@ from multiprocessing import cpu_count from threading import Lock from collections import defaultdict +from prettytable import PrettyTable +import os +from pathlib import Path +import csv SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) TEST_DIR = os.path.join( SCRIPT_DIR, "results_analysis_tests/test/micro-benchmark/python_features" ) - def check_match( expected, out, @@ -40,12 +43,12 @@ def check_match( if expected.get("file") != out.get("file"): return False, False - if expected.get("line_number") != out.get("line_number"): - return False, False + # if expected.get("line_number") != out.get("line_number"): + # return False, False - # Optional column offset check - if "col_offset" in expected and expected.get("col_offset") != out.get("col_offset"): - return False, False + # # Optional column offset check + # if "col_offset" in expected and expected.get("col_offset") != out.get("col_offset"): + # return False, False # Match specific fields if present for key in ["function", "parameter", "variable"]: @@ -61,6 +64,14 @@ def check_match( type_formatted = format_type([_types]) expected_type_formatted = format_type([expected.get("type", [])]) + # Remove single quotes from formatted types + type_formatted = [ + [t.replace("'", "") for t in sublist] for sublist in type_formatted + ] + expected_type_formatted = [ + [t.replace("'", "") for t in sublist] for sublist in expected_type_formatted + ] + # Exact match check is_exact_match = any( sorted(expected_type_formatted) == [t_list] for t_list in type_formatted[:top_n] @@ -124,6 +135,7 @@ def load_and_sort_json(file_path): def measure_exact_matches(out, expected, tool_name=None, print_missed=False): """ Measure exact and partial matches between two JSON files using indexing for efficiency. + Additionally, count matches for function return types, parameter types, and variable types. 
""" data_out = load_and_sort_json(out) data_expected = load_and_sort_json(expected) @@ -135,37 +147,152 @@ def measure_exact_matches(out, expected, tool_name=None, print_missed=False): "num_all": len(data_expected), "num_caught_exact": 0, "num_caught_partial": 0, + "function_return": {"total": 0, "exact": 0, "partial": 0}, + "parameter_type": {"total": 0, "exact": 0, "partial": 0}, + "variable_type": {"total": 0, "exact": 0, "partial": 0}, + } + + progress_bar = tqdm(total=len(data_expected), desc="Processing facts", position=0) + + for fact_expected in data_expected: + try: + is_exact_match, is_partial_match = process_fact_comparison_with_index( + fact_expected, index + ) + if is_exact_match: + results["num_caught_exact"] += 1 + elif is_partial_match: + results["num_caught_partial"] += 1 + elif print_missed: + log_missed_fact(tool_name, fact_expected) + + # Count specific types + if ( + "function" in fact_expected + and "parameter" not in fact_expected + and "variable" not in fact_expected + ): + results["function_return"]["total"] += 1 + if is_exact_match: + results["function_return"]["exact"] += 1 + elif is_partial_match: + results["function_return"]["partial"] += 1 + if "parameter" in fact_expected: + results["parameter_type"]["total"] += 1 + if is_exact_match: + results["parameter_type"]["exact"] += 1 + elif is_partial_match: + results["parameter_type"]["partial"] += 1 + if "variable" in fact_expected: + results["variable_type"]["total"] += 1 + if is_exact_match: + results["variable_type"]["exact"] += 1 + elif is_partial_match: + results["variable_type"]["partial"] += 1 + + except Exception as e: + logging.error(f"Error processing fact: {fact_expected} - {e}") + finally: + progress_bar.update(1) + + progress_bar.close() + return results + + +def measure_exact_for_builtins_matches( + out, expected, tool_name=None, print_missed=False +): + """ + Measure exact and partial matches between two JSON files using indexing for efficiency. + Additionally, count matches for function return types, parameter types, and variable types. 
+ """ + data_out = load_and_sort_json(out) + data_expected = load_and_sort_json(expected) + + # Create index for data_out + index = create_index(data_out) + + results = { + "num_all": 0, + "num_caught_exact": 0, + "num_caught_partial": 0, + "function_return": {"total": 0, "exact": 0, "partial": 0}, + "parameter_type": {"total": 0, "exact": 0, "partial": 0}, + "variable_type": {"total": 0, "exact": 0, "partial": 0}, + } + + builtin_types = { + "int", + "float", + "complex", + "str", + "list", + "tuple", + "range", + "set", + "frozenset", + "dict", + "bytes", + "bytearray", + "memoryview", + "bool", + "none", } - lock = Lock() progress_bar = tqdm(total=len(data_expected), desc="Processing facts", position=0) - # Process comparisons in parallel - with ProcessPoolExecutor(max_workers=max(cpu_count() - 1, 1)) as executor: - futures = { - executor.submit( - process_fact_comparison_with_index, fact_expected, index - ): fact_expected - for fact_expected in data_expected - } - - for future in tqdm( - as_completed(futures), total=len(futures), desc="Matching facts" - ): - fact_expected = futures[future] # Retrieve the corresponding fact - try: - is_exact_match, is_partial_match = future.result() - with lock: + for fact_expected in data_expected: + try: + is_exact_match, is_partial_match = process_fact_comparison_with_index( + fact_expected, index + ) + + expected_types = fact_expected.get("type", []) + if not isinstance(expected_types, list): + expected_types = [expected_types] # Ensure it's a list + + for expected_type in expected_types: + base_type = expected_type.split("[")[0].lower() + if base_type not in builtin_types: + continue + + results["num_all"] += 1 + + if is_exact_match: + results["num_caught_exact"] += 1 + elif is_partial_match: + results["num_caught_partial"] += 1 + elif print_missed: + log_missed_fact(tool_name, fact_expected) + + # Count specific types + if ( + "function" in fact_expected + and "parameter" not in fact_expected + and "variable" not in fact_expected + ): + results["function_return"]["total"] += 1 + if is_exact_match: + results["function_return"]["exact"] += 1 + elif is_partial_match: + results["function_return"]["partial"] += 1 + if "parameter" in fact_expected: + results["parameter_type"]["total"] += 1 + if is_exact_match: + results["parameter_type"]["exact"] += 1 + elif is_partial_match: + results["parameter_type"]["partial"] += 1 + if "variable" in fact_expected: + results["variable_type"]["total"] += 1 if is_exact_match: - results["num_caught_exact"] += 1 + results["variable_type"]["exact"] += 1 elif is_partial_match: - results["num_caught_partial"] += 1 - elif print_missed: - log_missed_fact(tool_name, fact_expected) - except Exception as e: - logging.error(f"Error processing fact: {fact_expected} - {e}") - finally: - progress_bar.update(1) + results["variable_type"]["partial"] += 1 + + except Exception as e: + logging.error(f"Error processing fact: {fact_expected} - {e}") + finally: + progress_bar.update(1) progress_bar.close() return results @@ -179,7 +306,10 @@ def process_fact_comparison(fact_expected, data_out): is_exact_match = False is_partial_match = False - for fact_out in data_out: + repo_name = fact_expected["file"] + repo_out_data = [entry for entry in data_out if entry.get("file") == repo_name] + + for fact_out in repo_out_data: exact_match, partial_match = check_match(fact_expected, fact_out) is_exact_match = is_exact_match or exact_match is_partial_match = is_partial_match or partial_match @@ -226,6 +356,200 @@ def 
process_fact_comparison_with_index(fact_expected, index): return is_exact_match, is_partial_match +def analyze_top_5_most_questions_asked(out, expected): + data_out = load_and_sort_json(out) + data_expected = load_and_sort_json(expected) + + # Count entries for each file + file_counts = defaultdict(int) + for entry in data_expected: + file_counts[entry.get("file")] += 1 + + # Get top 5 files with the most entries + top_5_files = sorted(file_counts.items(), key=lambda x: x[1], reverse=True)[:5] + top_5_file_paths = [file[0] for file in top_5_files] + + # Filter data for the top 5 files + filtered_data_out = [ + entry for entry in data_out if entry.get("file") in top_5_file_paths + ] + filtered_data_expected = [ + entry for entry in data_expected if entry.get("file") in top_5_file_paths + ] + + # Create index for filtered data_out + index = create_index(filtered_data_out) + + results = { + "num_all": len(filtered_data_expected), + "num_caught_exact": 0, + "num_caught_partial": 0, + "file_counts": { + file_path: {"num_all": 0, "num_caught_exact": 0, "num_caught_partial": 0} + for file_path in top_5_file_paths + }, + } + + for fact_expected in filtered_data_expected: + is_exact_match, is_partial_match = process_fact_comparison_with_index( + fact_expected, index + ) + file_path = fact_expected.get("file") + results["file_counts"][file_path]["num_all"] += 1 + if is_exact_match: + results["num_caught_exact"] += 1 + results["file_counts"][file_path]["num_caught_exact"] += 1 + elif is_partial_match: + results["num_caught_partial"] += 1 + results["file_counts"][file_path]["num_caught_partial"] += 1 + + return results + + +def analyze_unique_types(data_out, data_expected): + """ + Analyze unique types in the output and expected data, ignoring builtin base types. 
+ """ + data_out = load_and_sort_json(data_out) + data_expected = load_and_sort_json(data_expected) + + # Create index for data_out + index = create_index(data_out) + + results = {} + builtin_types = { + "int", + "float", + "complex", + "str", + "list", + "tuple", + "range", + "set", + "frozenset", + "dict", + "bytes", + "bytearray", + "memoryview", + "bool", + "none", + } + + typing_types = { + "any", + "optional", + "union", + "literal", + "final", + "classvar", + "noreturn", + "list", + "tuple", + "set", + "frozenset", + "dict", + "deque", + "defaultdict", + "counter", + "namedtuple", + "typeddict", + "callable", + "iterator", + "iterable", + "generator", + "asynciterator", + "asynciterable", + "coroutine", + "contextmanager", + "asynccontextmanager", + "protocol", + "type", + "typevar", + } + + for fact_expected in data_expected: + try: + is_exact_match, is_partial_match = process_fact_comparison_with_index( + fact_expected, index + ) + + expected_types = fact_expected.get("type", []) + if not isinstance(expected_types, list): + expected_types = [expected_types] # Ensure it's a list + + for expected_type in expected_types: + base_type = expected_type.split("[")[0].lower() + if base_type in builtin_types: + continue + + if base_type in typing_types: + continue + + if expected_type not in results: + results[expected_type] = { + "Type": expected_type, + "Total_facts": 0, + "Exact_facts": 0, + } + + results[expected_type]["Total_facts"] += 1 + if is_exact_match: + results[expected_type]["Exact_facts"] += 1 + + except Exception as e: + logging.error(f"Error processing fact: {fact_expected} - {e}") + + # Sort results by Total_facts and return top 5 + sorted_results = dict( + sorted(results.items(), key=lambda item: item[1]["Total_facts"], reverse=True) + ) + return sorted_results + + +# Get distribution of all types +# makes csv +# get the 0.1% percentage occurrence of each type +# do it on training set gt +# top 10 frequent types + + +def prepare_unified_json(results_dir): + """ + Prepare a unified JSON file for all tools for easy comparison. + """ + unified_out = [] + unified_expected = [] + + for file_main in Path(results_dir).rglob("main.py"): + file_name = os.path.relpath(file_main, results_dir) + file_gt = file_main.parent / "main_gt.json" + file_out = file_main.parent / "main_result.json" + if file_out.exists(): + with open(file_out) as f: + tool_data = json.load(f) + for entry in tool_data: + entry["file"] = file_name + unified_out.append(entry) + + if file_gt.exists(): + with open(file_gt) as f: + gt_data = json.load(f) + for entry in gt_data: + entry["file"] = file_name + unified_expected.append(entry) + + # store as file and return path instead + unified_out_path = os.path.join("/tmp", "unified_out.json") + unified_expected_path = os.path.join("/tmp", "unified_expected.json") + with open(unified_out_path, "w") as f: + json.dump(unified_out, f, indent=4) + + with open(unified_expected_path, "w") as f: + json.dump(unified_expected, f, indent=4) + + return unified_out_path, unified_expected_path + + def log_missed_fact(tool_name, fact_expected): """ Log missed facts to a CSV file for further analysis. @@ -238,19 +562,299 @@ def log_missed_fact(tool_name, fact_expected): f.write(f";Missing Fact;{json.dumps(fact_expected)}\n") +def analyze_top_10_frequent_types(csv_file_path, data_out, data_expected): + """ + Analyze the top 10 frequent types from a CSV file and check for exact matches with total facts and exact matches. 
+ """ + top_10_types = [] + + # Read the CSV file and extract the top 10 frequent types + with open(csv_file_path, mode="r") as file: + csv_reader = csv.DictReader(file) + sorted_types = sorted( + (row for row in csv_reader if row["Count"].strip().isdigit()), + key=lambda row: int(row["Count"].strip()), + reverse=True, + ) + top_10_types = [row["Type"] for row in sorted_types[:10]] + + # Load and sort JSON data + data_out = load_and_sort_json(out) + data_expected = load_and_sort_json(expected) + + # Create index for data_out + index = create_index(data_out) + + # Check for exact matches with total facts and exact matches + results = {"total_facts": 0, "exact_matches": 0, "top_10_types": []} + + for type_name in top_10_types: + total_facts = 0 + exact_facts = 0 + + for fact in data_expected: + expected_type = fact.get("type", []) + if isinstance(expected_type, list): + expected_type = expected_type[0] if expected_type else None + + if expected_type == type_name: + total_facts += 1 + is_exact_match, _ = process_fact_comparison_with_index(fact, index) + if is_exact_match: + exact_facts += 1 + + results["top_10_types"].append( + {"Type": type_name, "Total_facts": total_facts, "Exact_facts": exact_facts} + ) + results["total_facts"] += total_facts + results["exact_matches"] += exact_facts + + return results + + +def analyze_rare_types(csv_file_path, data_out, data_expected): + """ + Analyze the rare types from a CSV file and check for exact matches with total facts and exact matches. + """ + rare_types = [] + + # Read the CSV file and extract the rare types + with open(csv_file_path, mode="r") as file: + csv_reader = csv.DictReader(file) + rare_types = [row["Type"] for row in csv_reader] + + # Load and sort JSON data + data_out = load_and_sort_json(data_out) + data_expected = load_and_sort_json(data_expected) + + # Create index for data_out + index = create_index(data_out) + + # Check for exact matches with total facts and exact matches + results = {"total_facts": 0, "exact_matches": 0, "rare_types": []} + + for type_name in rare_types: + total_facts = 0 + exact_facts = 0 + + for fact in data_expected: + expected_type = fact.get("type", []) + if isinstance(expected_type, list): + expected_type = expected_type[0] if expected_type else None + + if expected_type == type_name: + total_facts += 1 + is_exact_match, _ = process_fact_comparison_with_index(fact, index) + if is_exact_match: + exact_facts += 1 + elif expected_type == "object": + print(fact) + + results["rare_types"].append( + {"Type": type_name, "Total_facts": total_facts, "Exact_facts": exact_facts} + ) + results["total_facts"] += total_facts + results["exact_matches"] += exact_facts + + return results + + # Output the result if __name__ == "__main__": - # Test the function - for folder in os.listdir(TEST_DIR): - print(folder) - out = f"{TEST_DIR}/{folder}/test1/main_result.json" - expected = f"{TEST_DIR}/{folder}/test1/main_gt.json" - results = measure_exact_matches(out, expected) - print(results) - - out = "/home/ashwin/Downloads/rw-benchmark/rw-benchmark/test/test_result.json" - expected = "/home/ashwin/Downloads/rw-benchmark/rw-benchmark/test/test_gt.json" - tool_name = "my_tool" + out = "/home/ssegpu/rashida/TypeEvalPy/results_new/finetuned-qwen2.5-Coder-7B-Instruct-without-any/rw-benchmark/test/test_result.json" + expected = "/home/ssegpu/rashida/TypeEvalPy/results_new/finetuned-qwen2.5-Coder-7B-Instruct-without-any/rw-benchmark/test/test_gt.json" + tool_name = "" + + # tools = { + # # "HiTyper": 
"/home/ssegpu/rashida/TypeEvalPy/results/results_25-02-25 19_06/hityperdl/micro-benchmark/repos", + # # "Type4Py": "/home/ssegpu/rashida/TypeEvalPy/results/results_26-02-25 09_26/type4py/micro-benchmark/repos", + # "h2": "/home/ssegpu/rashida/TypeEvalPy/results/results_26-02-25 12_30/hityperdl/micro-benchmark/repos" + # } + + # for tool_name, results_dir in tools.items(): + # if not results_dir: + # continue + # out, expected = prepare_unified_json(results_dir) results = measure_exact_matches(out, expected, tool_name=tool_name) - print(results) + results_builtins = measure_exact_for_builtins_matches( + out, expected, tool_name=tool_name + ) + + results_qa = analyze_top_5_most_questions_asked(out, expected) + results_unique_types = analyze_unique_types(out, expected) + + # Example usage + csv_file_path = ( + "/home/ssegpu/rashida/TypeEvalPy/src/result_analyzer/.scrapy/top_10.csv" + ) + results_top_10_frequent_types = analyze_top_10_frequent_types( + csv_file_path, out, expected + ) + + rare_types_csv_file_path = ( + "/home/ssegpu/rashida/TypeEvalPy/src/result_analyzer/.scrapy/under_0_1.csv" + ) + results_rare_types = analyze_rare_types(rare_types_csv_file_path, out, expected) + + # Top 10 Frequent Types Results + table_top_10_frequent_types = PrettyTable() + table_top_10_frequent_types.field_names = ["Type", "Total Facts", "Exact Facts"] + + for type_info in results_top_10_frequent_types["top_10_types"]: + table_top_10_frequent_types.add_row( + [type_info["Type"], type_info["Total_facts"], type_info["Exact_facts"]] + ) + + table_top_10_frequent_types.add_row( + [ + "Total", + results_top_10_frequent_types["total_facts"], + results_top_10_frequent_types["exact_matches"], + ] + ) + + print("Top 10 rare Types Results:") + table_rare_types = PrettyTable() + table_rare_types.field_names = ["Type", "Total Facts", "Exact Facts"] + + for type_info in results_rare_types["rare_types"]: + table_rare_types.add_row( + [type_info["Type"], type_info["Total_facts"], type_info["Exact_facts"]] + ) + + table_rare_types.add_row( + [ + "Total", + results_rare_types["total_facts"], + results_rare_types["exact_matches"], + ] + ) + + print("Top 10 unique Types Results:") + table_unique_types = PrettyTable() + table_unique_types.field_names = ["Type", "Total Facts", "Exact Facts"] + + for type_info in results_unique_types.values(): + table_unique_types.add_row( + [type_info["Type"], type_info["Total_facts"], type_info["Exact_facts"]] + ) + + table_unique_types.add_row( + [ + "Total", + sum( + [ + type_info["Total_facts"] + for type_info in results_unique_types.values() + ] + ), + sum( + [ + type_info["Exact_facts"] + for type_info in results_unique_types.values() + ] + ), + ] + ) + + # Create a table for the results + table = PrettyTable() + table.field_names = ["Metric", "Value"] + table.add_row(["Total Facts", results["num_all"]]) + table.add_row(["Exact Matches", results["num_caught_exact"]]) + table.add_row(["Partial Matches", results["num_caught_partial"]]) + table.add_row(["Function Return Total", results["function_return"]["total"]]) + table.add_row(["Function Return Exact", results["function_return"]["exact"]]) + table.add_row(["Function Return Partial", results["function_return"]["partial"]]) + table.add_row(["Parameter Type Total", results["parameter_type"]["total"]]) + table.add_row(["Parameter Type Exact", results["parameter_type"]["exact"]]) + table.add_row(["Parameter Type Partial", results["parameter_type"]["partial"]]) + table.add_row(["Variable Type Total", results["variable_type"]["total"]]) + 
table.add_row(["Variable Type Exact", results["variable_type"]["exact"]]) + table.add_row(["Variable Type Partial", results["variable_type"]["partial"]]) + + # Create a table for the results_builtins + table_builtins = PrettyTable() + table_builtins.field_names = ["Metric", "Value"] + table_builtins.add_row(["Total Facts", results_builtins["num_all"]]) + table_builtins.add_row(["Exact Matches", results_builtins["num_caught_exact"]]) + table_builtins.add_row(["Partial Matches", results_builtins["num_caught_partial"]]) + table_builtins.add_row( + ["Function Return Total", results_builtins["function_return"]["total"]] + ) + table_builtins.add_row( + ["Function Return Exact", results_builtins["function_return"]["exact"]] + ) + table_builtins.add_row( + ["Function Return Partial", results_builtins["function_return"]["partial"]] + ) + table_builtins.add_row( + ["Parameter Type Total", results_builtins["parameter_type"]["total"]] + ) + table_builtins.add_row( + ["Parameter Type Exact", results_builtins["parameter_type"]["exact"]] + ) + table_builtins.add_row( + ["Parameter Type Partial", results_builtins["parameter_type"]["partial"]] + ) + table_builtins.add_row( + ["Variable Type Total", results_builtins["variable_type"]["total"]] + ) + table_builtins.add_row( + ["Variable Type Exact", results_builtins["variable_type"]["exact"]] + ) + table_builtins.add_row( + ["Variable Type Partial", results_builtins["variable_type"]["partial"]] + ) + + print(f"Tool Name: {tool_name}") + print("Overall Results:") + print(table) + + print("\nTop 5 Most Questions Asked Results:") + table_qa = PrettyTable() + table_qa.field_names = ["File", "Total Facts", "Exact Matches", "Partial Matches"] + + print("Tool Name: ", tool_name) + print("Overall builtins Results:") + print(table_builtins) + + total_facts = 0 + total_exact_matches = 0 + total_partial_matches = 0 + + for file_path, counts in results_qa["file_counts"].items(): + table_qa.add_row( + [ + file_path, + counts["num_all"], + counts["num_caught_exact"], + counts["num_caught_partial"], + ] + ) + total_facts += counts["num_all"] + total_exact_matches += counts["num_caught_exact"] + total_partial_matches += counts["num_caught_partial"] + + table_qa.add_row(["Total", total_facts, total_exact_matches, total_partial_matches]) + + print(table_qa) + + # Save the tables to a file + analysis_file_path = "/home/ssegpu/rashida/TypeEvalPy/results_new/finetuned-qwen2.5-Coder-7B-Instruct-without-any/analysis_results.txt" + os.makedirs(os.path.dirname(analysis_file_path), exist_ok=True) + with open(analysis_file_path, "w") as f: + f.write(f"Tool Name: {tool_name}\n") + f.write("Overall Results:\n") + f.write(str(table)) + f.write("\n\nTop 5 Most Questions Asked Results:\n") + f.write(str(table_qa)) + f.write("\n\nTop 10 Frequent Types Results:\n") + f.write(str(table_top_10_frequent_types)) + f.write("\n\nRare Types Results:\n") + f.write(str(table_rare_types)) + f.write("\n\nUnique Types Results:\n") + f.write(str(table_unique_types)) + f.write("\n\nOverall builtins Results:\n") + f.write(str(table_builtins)) diff --git a/src/result_analyzer/large_scale_analysis_repo.py b/src/result_analyzer/large_scale_analysis_repo.py new file mode 100644 index 000000000..18ae1a1be --- /dev/null +++ b/src/result_analyzer/large_scale_analysis_repo.py @@ -0,0 +1,331 @@ +import os +import json +import logging +from concurrent.futures import ProcessPoolExecutor, as_completed +from analysis_utils import format_type, normalize_type, normalize_types +from tqdm import tqdm +from multiprocessing 
import cpu_count +from threading import Lock +from collections import defaultdict +from pathlib import Path + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +TEST_DIR = os.path.join( + SCRIPT_DIR, "results_analysis_tests/test/micro-benchmark/python_features" +) + + +def check_match( + expected, + out, + top_n=1, + is_ml=False, + print_mismatch=True, + metadata=None, +): + """ + Check for both exact and partial matches between expected and out entries. + Returns a tuple: (is_exact_match, is_partial_match). + """ + metadata = metadata or {} + + # Check keys in `out` are present in `expected` + if not all( + x in expected + for x in out.keys() + if x not in {"type", "all_type_preds", "col_offset"} + ): + return False, False + + # Early exits for file and line number mismatches + if expected.get("file") != out.get("file"): + return False, False + + # if expected.get("line_number") != out.get("line_number"): + # return False, False + + # Optional column offset check + # if "col_offset" in expected and expected.get("col_offset") != out.get("col_offset"): + # return False, False + + # Match specific fields if present + for key in ["function", "parameter", "variable"]: + if key in expected and expected.get(key) != out.get(key): + return False, False + + # Type matching logic + if is_ml: + out_types = [x[0] for x in out.get("all_type_preds", [])] + else: + out_types = out.get("type", []) + + # Normalize types for comparison + out_types = normalize_types(out_types) + expected_types = normalize_types(expected.get("type", [])) + + type_formatted = format_type([out_types]) + expected_type_formatted = format_type([expected_types]) + + # Exact match check + is_exact_match = any( + sorted(expected_type_formatted) == [t_list] for t_list in type_formatted[:top_n] + ) + + # Partial match check + expected_set = {t for sublist in expected_type_formatted for t in sublist} + is_partial_match = any( + expected_set.intersection(t_list) for t_list in type_formatted[:top_n] + ) + + if not (is_exact_match or is_partial_match) and print_mismatch: + log_mismatch(metadata, expected, out, partial_match=True) + + return is_exact_match, is_partial_match + + +def log_mismatch(metadata, expected, out, partial_match): + """ + Log mismatched cases for debugging or analysis. + """ + # if partial_match: + # print("Partial match:") + + tool_name = metadata.get("tool_name", "unknown_tool") + mismatch_file = f"{tool_name}_mismatches_reasons.csv" + with open(mismatch_file, "a") as f: + f.write( + ";".join( + [ + expected.get("file"), + str(expected.get("type")), + str(out.get("type")), + ] + ) + ) + f.write("\n") + + +def sort_facts(data): + """ + Sort facts based on line_number and ensure 'type' fields (if list) are sorted. + """ + return sorted(data, key=lambda x: int(x.get("line_number", 0))) + + +def load_and_sort_json(file_path): + """ + Load JSON from a file and sort the facts for consistent processing. 
+ """ + with open(file_path) as f: + data = json.load(f) + return sort_facts(data) + + +def measure_exact_matches_no_parallel( + out, expected, tool_name=None, print_missed=False +): + data_out = load_and_sort_json(out) + data_expected = load_and_sort_json(expected) + index = create_index(data_out) + + results = { + "num_all": len(data_expected), + "num_caught_exact": 0, + "num_caught_partial": 0, + } + + for fact_expected in tqdm(data_expected, desc="Processing facts (no parallel)"): + is_exact_match, is_partial_match = process_fact_comparison_with_index( + fact_expected, + index, + ) + if is_exact_match: + results["num_caught_exact"] += 1 + elif is_partial_match: + results["num_caught_partial"] += 1 + elif print_missed: + log_missed_fact(tool_name, fact_expected) + + return results + + +def measure_exact_matches(out, expected, tool_name=None, print_missed=False): + """ + Measure exact and partial matches between two JSON files using indexing for efficiency. + """ + data_out = load_and_sort_json(out) + data_expected = load_and_sort_json(expected) + + # Create index for data_out + index = create_index(data_out) + + results = { + "num_all": len(data_expected), + "num_caught_exact": 0, + "num_caught_partial": 0, + } + + lock = Lock() + progress_bar = tqdm(total=len(data_expected), desc="Processing facts", position=0) + + # Process comparisons in parallel + with ProcessPoolExecutor(max_workers=max(cpu_count() - 1, 1)) as executor: + futures = { + executor.submit( + process_fact_comparison_with_index, fact_expected, index + ): fact_expected + for fact_expected in data_expected + } + + for future in tqdm( + as_completed(futures), total=len(futures), desc="Matching facts" + ): + fact_expected = futures[future] # Retrieve the corresponding fact + try: + is_exact_match, is_partial_match = future.result() + with lock: + if is_exact_match: + results["num_caught_exact"] += 1 + elif is_partial_match: + results["num_caught_partial"] += 1 + elif print_missed: + log_missed_fact(tool_name, fact_expected) + except Exception as e: + logging.error(f"Error processing fact: {fact_expected} - {e}") + finally: + progress_bar.update(1) + + progress_bar.close() + return results + + +def process_fact_comparison(fact_expected, data_out): + """ + Compare a single fact against all output facts to determine exact and partial matches. + Returns the combined match results. + """ + is_exact_match = False + is_partial_match = False + + for fact_out in data_out: + exact_match, partial_match = check_match(fact_expected, fact_out) + is_exact_match = is_exact_match or exact_match + is_partial_match = is_partial_match or partial_match + + # Break early if both matches are found + if is_exact_match and is_partial_match: + break + + return is_exact_match, is_partial_match + + +def create_index(data_out): + """ + Create an index for data_out based on (file, line_number) and optionally other fields. + """ + index = defaultdict(list) + for fact_out in data_out: + key = fact_out.get("file") + index[key].append(fact_out) + return index + + +def process_fact_comparison_with_index(fact_expected, index): + """ + Compare a single fact against indexed output facts for matches. 
+ """ + is_exact_match = False + is_partial_match = False + + # Get the relevant facts from the index + key = fact_expected.get("file") + relevant_facts = index.get(key, []) + + # Compare only relevant facts + for fact_out in relevant_facts: + exact_match, partial_match = check_match(fact_expected, fact_out) + is_exact_match = is_exact_match or exact_match + is_partial_match = is_partial_match or partial_match + + # Break early if both matches are found + if is_exact_match and is_partial_match: + break + + return is_exact_match, is_partial_match + + +def log_missed_fact(tool_name, fact_expected): + """ + Log missed facts to a CSV file for further analysis. + """ + if not tool_name: + return + + missed_log_path = f"{tool_name}_not_found_reasons.csv" + with open(missed_log_path, "a") as f: + f.write(f";Missing Fact;{json.dumps(fact_expected)}\n") + + +def prepare_unified_json(results_dir): + """ + Prepare a unified JSON file for all tools for easy comparison. + """ + unified_out = [] + unified_expected = [] + + for file_main in Path(results_dir).rglob("main.py"): + file_name = os.path.relpath(file_main, results_dir) + file_gt = file_main.parent / "main_gt.json" + file_out = file_main.parent / "main_result.json" + if file_out.exists(): + with open(file_out) as f: + tool_data = json.load(f) + for entry in tool_data: + entry["file"] = file_name + unified_out.append(entry) + + if file_gt.exists(): + with open(file_gt) as f: + gt_data = json.load(f) + for entry in gt_data: + entry["file"] = file_name + unified_expected.append(entry) + + # store as file and return path instead + unified_out_path = os.path.join("/tmp", "unified_out.json") + unified_expected_path = os.path.join("/tmp", "unified_expected.json") + with open(unified_out_path, "w") as f: + json.dump(unified_out, f, indent=4) + + with open(unified_expected_path, "w") as f: + json.dump(unified_expected, f, indent=4) + + return unified_out_path, unified_expected_path + + +# Output the result +if __name__ == "__main__": + # Test the function + # for folder in os.listdir(TEST_DIR): + # print(folder) + # out = f"{TEST_DIR}/{folder}/test1/main_result.json" + # expected = f"{TEST_DIR}/{folder}/test1/main_gt.json" + # results = measure_exact_matches_no_parallel(out, expected) + # print(results) + + tools = { + # "HiTyper": "/home/ssegpu/rashida/TypeEvalPy/results/results_25-02-25 19_06/hityperdl/micro-benchmark/repos", + # "Type4Py": "/home/ssegpu/rashida/TypeEvalPy/results/results_26-02-25 09_26/type4py/micro-benchmark/repos", + "h2": "/home/ssegpu/rashida/TypeEvalPy/results/results_26-02-25 12_28/hityperdl/micro-benchmark/repos" + } + + for tool_name, results_dir in tools.items(): + if not results_dir: + continue + unified_out, unified_expected = prepare_unified_json(results_dir) + results = measure_exact_matches_no_parallel(unified_out, unified_expected) + print(f"\n\nResults for {tool_name}:") + print(results) + + # results = measure_exact_matches(out, expected, tool_name=tool_name) + # print(results) diff --git a/src/runner_class.py b/src/runner_class.py index 66df0d201..ffdd5c166 100644 --- a/src/runner_class.py +++ b/src/runner_class.py @@ -247,6 +247,7 @@ def spawn_docker_instance(self): logger.info("Creating container") container = self.docker_client.containers.run( self.tool_name, + runtime="nvidia", detach=True, stdin_open=True, tty=True, diff --git a/src/target_tools/hityper/Dockerfile b/src/target_tools/hityper/Dockerfile index 2e118e2f2..88236db23 100644 --- a/src/target_tools/hityper/Dockerfile +++ 
b/src/target_tools/hityper/Dockerfile @@ -1,27 +1,27 @@ # Pull the Python base image -FROM python:3.10-slim-bullseye +FROM ghcr.io/saltudelft/type4py:latest -# Set environment variables -ENV PYTHONDONTWRITEBYTECODE 1 -ENV PYTHONUNBUFFERED 1 +COPY requirements.txt /app/requirements.txt +COPY src /tmp/src -# Set work directory -WORKDIR /app +# Install Python dependencies +RUN pip install --upgrade pip +#RUN pip install . +RUN pip install -r /app/requirements.txt -# Install system dependencies -RUN apt-get update \ - && apt-get -y install git gcc +WORKDIR /app # Clone the repository RUN git clone https://github.com/JohnnyPeng18/HiTyper.git +COPY patches/config.py /app/HiTyper/hityper/config.py COPY patches/__main__.py /app/HiTyper/hityper/__main__.py # Navigate into the cloned repository WORKDIR /app/HiTyper # Install Python dependencies -RUN pip install --upgrade pip RUN pip install . +RUN pip install PyCG==0.0.4 RUN pip install PyCG==0.0.4 diff --git a/src/target_tools/hityper/src/runner.py b/src/target_tools/hityper/src/runner.py index 550ee88d3..e389360fe 100644 --- a/src/target_tools/hityper/src/runner.py +++ b/src/target_tools/hityper/src/runner.py @@ -31,6 +31,8 @@ def list_python_files(folder_path): def process_file(file_path): + base_repo_folder = "/mnt/hf_cache/rashida_manytype4py/many-types-4-py-dataset" + file_path = os.path.join(base_repo_folder, file_path) dir_path, file_name = os.path.split(file_path) hitype_cmd = f"hityper infer -s ./{file_name} -p ." subprocess.run( @@ -44,7 +46,16 @@ def process_file(file_path): def main_runner(args): - python_files = list_python_files(args.bechmark_path) + + with open(args.bechmark_path, 'r', encoding='utf-8') as file: + data = json.load(file) + # Extract .py files + python_files = [] + + for repo, repo_data in data.items(): + if "src_files" in repo_data: + python_files.extend(repo_data["src_files"].keys()) + error_count = 0 error_list = [] for file in python_files: @@ -78,12 +89,20 @@ def main_runner(args): parser.add_argument( "--bechmark_path", help="Specify the benchmark path", - default="/tmp/micro-benchmark", + default="repos", ) args = parser.parse_args() main_runner(args) else: print("Python is not running inside a Docker container") - file_path = "" - process_file(file_path) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--bechmark_path", + help="Specify the benchmark path", + default="/home/ssegpu/rashida/TypeEvalPy/src/target_tools/hityper/rw-benchmark/test/test.json", + ) + + args = parser.parse_args() + main_runner(args) diff --git a/src/target_tools/hityperdl/Dockerfile.cuda b/src/target_tools/hityperdl/Dockerfile.cuda new file mode 100644 index 000000000..93063c2d5 --- /dev/null +++ b/src/target_tools/hityperdl/Dockerfile.cuda @@ -0,0 +1,31 @@ +# Pull the Python base image +FROM ghcr.io/saltudelft/type4py.prod.gpu:latest + +# install git +RUN rm /etc/apt/sources.list.d/cuda.list +RUN rm /etc/apt/sources.list.d/nvidia-ml.list +RUN apt update && apt install -y git + +COPY requirements.txt /app/requirements.txt +COPY src /tmp/src + +# Install Python dependencies +RUN pip install --upgrade pip +#RUN pip install . +RUN pip install -r /app/requirements.txt + +WORKDIR /app + +# Clone the repository +RUN git clone https://github.com/JohnnyPeng18/HiTyper.git +COPY patches/config.py /app/HiTyper/hityper/config.py +COPY patches/__main__.py /app/HiTyper/hityper/__main__.py + +# Navigate into the cloned repository +WORKDIR /app/HiTyper + +# Install Python dependencies +RUN pip install . 
+RUN pip install PyCG==0.0.4 + +WORKDIR /type4py/type4py/server diff --git a/src/target_tools/hityperdl/patches/config.py b/src/target_tools/hityperdl/patches/config.py index da7563a58..8aecb57fd 100644 --- a/src/target_tools/hityperdl/patches/config.py +++ b/src/target_tools/hityperdl/patches/config.py @@ -1,6 +1,6 @@ config = { # Indicate the web API that HiTyper should call to invoke the DL model - "type4py": "http://localhost:5010/api/predict?tc=0", + "type4py": "http://type4py:5010/api/predict?tc=0", # Indicate the default DL model used in HiTyper "default_model": "type4py", # Indicate the maximum iterations that HiTyper iterates the whole TDG to conduct static inference diff --git a/src/target_tools/hityperdl/requirements.txt b/src/target_tools/hityperdl/requirements.txt index 2c24336eb..1cc70db1a 100644 --- a/src/target_tools/hityperdl/requirements.txt +++ b/src/target_tools/hityperdl/requirements.txt @@ -1 +1,2 @@ requests==2.31.0 +tqdm \ No newline at end of file diff --git a/src/target_tools/hityperdl/src/runner.py b/src/target_tools/hityperdl/src/runner.py index 63707018d..38c904e5c 100644 --- a/src/target_tools/hityperdl/src/runner.py +++ b/src/target_tools/hityperdl/src/runner.py @@ -8,6 +8,7 @@ import translator import utils +from tqdm import tqdm # Create a logger logger = logging.getLogger("runner") @@ -47,7 +48,7 @@ def main_runner(args): python_files = list_python_files(args.bechmark_path) error_count = 0 error_list = [] - for file in python_files: + for file in tqdm(python_files, desc="Processing files"): try: # logger.debug(file) process_file(file) diff --git a/src/target_tools/hityperdl/src/translator.py b/src/target_tools/hityperdl/src/translator.py index 2c695e047..27f0f8fc0 100644 --- a/src/target_tools/hityperdl/src/translator.py +++ b/src/target_tools/hityperdl/src/translator.py @@ -10,6 +10,93 @@ def list_json_files(folder_path): return python_files +def normalize_type(type_str, nested_level=0): + """ + Normalize the type string by removing module prefixes and simplifying typing constructs. + Example: 'builtins.str' -> 'str', + 'typing.Tuple[builtins.str, builtins.float]' -> 'Tuple[str, float]', + 'musictaxonomy.spotify.models.spotifyuser' -> 'SpotifyUser', + 'List[List[Tuple[str]]]' -> 'List[List[Any]]' if nested level > 2. 
+ """ + + if type_str is None: + return None + + # Remove extra quotes if present + if type_str.startswith('"') and type_str.endswith('"'): + type_str = type_str.strip('"') + + # Mapping of module prefixes to remove + type_mappings = { + "builtins.": "", + "typing.": "", + } + # Additional type mappings + additional_type_mappings = { + "integer": "int", + "string": "str", + "dictonary": "dict", + "method": "Callable", + "func": "Callable", + "function": "Callable", + "none": "None", + "Nonetype": "None", + "nonetype": "None", + "NoneType": "None", + "Text": "str", + } + + if type_str is None: + return None + + # Replace module prefixes + for prefix, replacement in type_mappings.items(): + type_str = type_str.replace(prefix, replacement) + + # Apply additional type mappings + type_str = additional_type_mappings.get(type_str, type_str) + + # Handle generic types (e.g., Tuple[], List[], Dict[]) + if "[" in type_str and "]" in type_str: + base_type, generic_content = type_str.split("[", 1) + generic_content = generic_content.rsplit("]", 1)[0] + # Process the generic parameters recursively + generic_params = [] + bracket_level = 0 + param = "" + for char in generic_content: + if char == "[": + bracket_level += 1 + param += char + elif char == "]": + bracket_level -= 1 + param += char + elif char == "," and bracket_level == 0: + generic_params.append(param.strip()) + param = "" + else: + param += char + if param: + generic_params.append(param.strip()) + + # If nested level is greater than 0, replace with Any + if nested_level > 0: + normalized_params = ["Any"] + else: + normalized_params = [ + normalize_type(param, nested_level + 1) for param in generic_params + ] + + return f"{base_type}[{', '.join(normalized_params)}]" + + # Handle fully qualified names by extracting the last segment + if "." 
in type_str: + return type_str.split(".")[-1] + + # Return the simplified type + return type_str + + def translate_content(file_path): dir_path, file_name = os.path.split(file_path) hityper_file = dir_path + "/._" + file_name.replace(".py", "_INFERREDTYPES.json") @@ -46,9 +133,9 @@ def convert_type(type_str): "file": gt_item["file"], "line_number": gt_item["line_number"], "variable": gt_item["variable"], - "type": [convert_type(item["type"][0])], + "type": [normalize_type(item["type"][0])], "all_type_preds": [ - [convert_type(type_item)] + [normalize_type(type_item)] for type_item in item["type"] ], } @@ -60,9 +147,9 @@ def convert_type(type_str): "line_number": gt_item["line_number"], "function": gt_item["function"], "variable": gt_item["variable"], - "type": [convert_type(item["type"][0])], + "type": [normalize_type(item["type"][0])], "all_type_preds": [ - [convert_type(type_item)] + [normalize_type(type_item)] for type_item in item["type"] ], } @@ -78,9 +165,10 @@ def convert_type(type_str): "line_number": gt_item["line_number"], "function": gt_item["function"], "parameter": gt_item["parameter"], - "type": [convert_type(item["type"][0])], + "type": [normalize_type(item["type"][0])], "all_type_preds": [ - [convert_type(type_item)] for type_item in item["type"] + [normalize_type(type_item)] + for type_item in item["type"] ], } formatted_output.append(formatted_item) @@ -94,9 +182,10 @@ def convert_type(type_str): "file": gt_item["file"], "line_number": gt_item["line_number"], "function": gt_item["function"], - "type": [convert_type(item["type"][0])], + "type": [normalize_type(item["type"][0])], "all_type_preds": [ - [convert_type(type_item)] for type_item in item["type"] + [normalize_type(type_item)] + for type_item in item["type"] ], } formatted_output.append(formatted_item) diff --git a/src/target_tools/llms/src/prompts.py b/src/target_tools/llms/src/prompts.py index 51d8c143d..1cb79b136 100644 --- a/src/target_tools/llms/src/prompts.py +++ b/src/target_tools/llms/src/prompts.py @@ -640,3 +640,21 @@ def id_func ( arg ): + questions_based_2, }, ] + +prompt_template_masked_code_based_1 = [ + { + "role": "system", + "content": "You will examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code.", + }, + { + "role": "user", + "content": "## Task Description\n\n", + }, +] + +prompt_template_masked_code_based_1_no_sys = [ + { + "role": "user", + "content": "## Task Description\n\n", + }, +] \ No newline at end of file diff --git a/src/target_tools/llms/src/result_translator.py b/src/target_tools/llms/src/result_translator.py new file mode 100644 index 000000000..e634e8e57 --- /dev/null +++ b/src/target_tools/llms/src/result_translator.py @@ -0,0 +1,182 @@ +import ast +import json +import os +from typing import List, Dict, Any + +def parse_annotation(annotation_node: ast.AST) -> str: + """ + Convert AST annotation node into the exact type string as present in the code. + """ + if annotation_node is None: + return "None" + return ast.unparse(annotation_node) # Use ast.unparse to get the exact annotation as a string. 
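
For reference, `parse_annotation` relies entirely on `ast.unparse`, which is only available from Python 3.9 onward. A minimal, self-contained sketch of what it returns for a small annotated function — the snippet below is illustrative only and not part of this patch:

```python
import ast

# Parse a tiny annotated function and unparse its annotation nodes back to source text.
module = ast.parse("def f(x: list[int], *args: str) -> dict[str, int]: ...")
func = module.body[0]  # ast.FunctionDef

print(ast.unparse(func.args.args[0].annotation))  # list[int]
print(ast.unparse(func.args.vararg.annotation))   # str
print(ast.unparse(func.returns))                  # dict[str, int]
```
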
+ +def get_type_annotations_from_content(source: str, filename: str) -> List[Dict[str, Any]]: + """Parse type annotations from source code content with syntax error handling.""" + try: + tree = ast.parse(source) + except SyntaxError as e: + print(f"File-level SyntaxError in {filename}: {e}") + return [] # Skip the whole file if file-level syntax is invalid + + annotations = [] + source_lines = source.splitlines() + + class TypeAnnotationVisitor(ast.NodeVisitor): + def __init__(self, filename): + self.filename = os.path.basename(filename).replace("_gt.json", ".py") + self.current_class = None # Track the current class name + self.current_function = None # Track the current function name + self.processed_variables = set() # Track processed variables to avoid duplicates + + def visit_ClassDef(self, node: ast.ClassDef): + """Visit a class definition and track its name.""" + self.current_class = node.name + self.generic_visit(node) + self.current_class = None # Reset after leaving the class + + def visit_FunctionDef(self, node: ast.FunctionDef): + self.current_function = node.name + function_name = f"{self.current_class}.{self.current_function}" if self.current_class else self.current_function + + try: + line = source_lines[node.lineno - 1] + name_col_offset = line.index(node.name) + 1 + + # Process function parameters + for arg in node.args.args + node.args.kwonlyargs: + if arg.annotation: + param_id = (node.lineno, arg.arg) + if param_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": arg.lineno, + "col_offset": arg.col_offset + 1, + "parameter": arg.arg, + "function": function_name, + "type": [parse_annotation(arg.annotation)] + }) + self.processed_variables.add(param_id) + + # Process *args + if node.args.vararg and node.args.vararg.annotation: + vararg_id = (node.args.vararg.lineno, node.args.vararg.arg) + if vararg_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": node.args.vararg.lineno, + "col_offset": node.args.vararg.col_offset + 1, + "parameter": f"*{node.args.vararg.arg}", + "function": function_name, + "type": [parse_annotation(node.args.vararg.annotation)] + }) + self.processed_variables.add(vararg_id) + + # Process **kwargs + if node.args.kwarg and node.args.kwarg.annotation: + kwarg_id = (node.args.kwarg.lineno, node.args.kwarg.arg) + if kwarg_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": node.args.kwarg.lineno, + "col_offset": node.args.kwarg.col_offset + 1, + "parameter": f"**{node.args.kwarg.arg}", + "function": function_name, + "type": [parse_annotation(node.args.kwarg.annotation)] + }) + self.processed_variables.add(kwarg_id) + + # Process function return type + if node.returns: + func_id = (node.lineno, function_name) + if func_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": node.lineno, + "col_offset": name_col_offset, + "function": function_name, + "type": [parse_annotation(node.returns)] + }) + self.processed_variables.add(func_id) + + self.generic_visit(node) + + except Exception as e: + print(f"Error in function '{function_name}' in {filename}: {e}") + + self.current_function = None + + def visit_AnnAssign(self, node: ast.AnnAssign): + try: + if isinstance(node.target, ast.Name): + variable_name = node.target.id + function_name = self.current_function if self.current_function else None + var_id = (node.lineno, variable_name) + + if var_id not in 
self.processed_variables: + if self.current_class and not function_name: + variable_name = f"{self.current_class}.{variable_name}" + + annotation_entry = { + "file": self.filename, + "line_number": node.lineno, + "col_offset": node.col_offset + 1, + "variable": variable_name, + "type": [parse_annotation(node.annotation)] + } + + if function_name: + annotation_entry["function"] = function_name + + annotations.append(annotation_entry) + self.processed_variables.add(var_id) + + except Exception as e: + print(f"Error processing annotated assignment in {self.filename}: {e}") + + visitor = TypeAnnotationVisitor(filename) + visitor.visit(tree) + + return annotations + +def format_annotations_for_ground_truth(annotations: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Format annotations to match the ground truth JSON structure.""" + formatted_annotations = [] + for annotation in annotations: + formatted_annotation = { + "file": annotation["file"], + "line_number": annotation["line_number"], + "col_offset": annotation["col_offset"], + } + + if "function" in annotation: + formatted_annotation["function"] = annotation["function"] + if "variable" in annotation: + formatted_annotation["variable"] = annotation["variable"] + if "parameter" in annotation: + formatted_annotation["parameter"] = annotation["parameter"] + + formatted_annotation["type"] = annotation["type"] + + formatted_annotations.append(formatted_annotation) + + return formatted_annotations + +def translate_output_to_annotations(source: str, filename: str) -> str: + """Translate source code output to JSON-formatted type annotations, matching ground truth format.""" + annotations = get_type_annotations_from_content(source, filename) + formatted_annotations = format_annotations_for_ground_truth(annotations) + return json.dumps(formatted_annotations, indent=4) + +# Main function for testing +def main(): + test_file = "/home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/.scrapy/main.py" # Update this path as necessary + + with open(test_file, "r") as f: + source_code = f.read() + + output_json = translate_output_to_annotations(source_code, test_file) + print(output_json) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/target_tools/llms/src/runner.py b/src/target_tools/llms/src/runner.py index 651356c90..bb6813aee 100644 --- a/src/target_tools/llms/src/runner.py +++ b/src/target_tools/llms/src/runner.py @@ -24,7 +24,7 @@ import gc import torch from tqdm import tqdm - +import result_translator # Import the translation module AUTOFIX_WITH_OPENAI = False REQUEST_TIMEOUT = 60 @@ -109,6 +109,37 @@ def create_result_json_file(file_info, output_raw, prompt_template): # logger.info(f"Processed file: {file_info['file_path']}") +def create_result_json_from_code_file(file_info, output_raw, prompt_template): + # Clean up the output by removing unnecessary formatting + output_cleaned = re.sub(r"```json|```|<\|assistant\|>\\n", "", output_raw) + + # Save the raw output to the result dump filepath + with open(file_info["result_dump_filepath"], "w") as f: + f.write(output_raw) + + # Determine the filename, falling back if "filename" key is missing + filename = file_info["json_filepath"] + if filename is None: + # Generate a fallback filename with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + fallback_dir = "outputs" # You may specify any preferred directory + os.makedirs(fallback_dir, exist_ok=True) + filename = os.path.join(fallback_dir, f"output_{timestamp}.json") + 
logger.warning(f"'filename' key missing in file_info; saving output to {filename}") + + # Directly translate the cleaned source code output to JSON annotations + translated_json = result_translator.translate_output_to_annotations( + output_cleaned, filename + ) + + # Validate and save the translated JSON to the final result file + result_filepath = file_info.get("result_filepath", filename) + if utils.generate_json_file(result_filepath, translated_json): + logger.info(f"Processed file: {file_info.get('file_path', filename)} successfully.") + else: + logger.error(f"{file_info.get('file_path', filename)} failed: Not a valid JSON") + raise utils.JsonException("json") + def list_python_files(folder_path): python_files = sorted(Path(folder_path).rglob("main.py")) @@ -219,7 +250,10 @@ def model_evaluation_openai( file_info = id_mapping[id] output_raw = r_output - create_result_json_file(file_info, output_raw, prompt_template) + if prompt_id in ["prompt_template_questions_based_2",]: + create_result_json_file(file_info, output_raw, prompt_template) + elif prompt_id in ["prompt_template_masked_code_based_1",]: + create_result_json_from_code_file(file_info, output_raw, prompt_template) def main_runner(args, runner_config, models_to_run, openai_models_models_to_run): @@ -453,6 +487,7 @@ def main_runner(args, runner_config, models_to_run, openai_models_models_to_run) # example usage: """ + python runner.py \ --bechmark_path /home/ssegpu/TypeEvalPy/TypeEvalPy/micro-benchmark \ --prompt_id prompt_template_questions_based_2 \ diff --git a/src/target_tools/llms/src/utils.py b/src/target_tools/llms/src/utils.py index 9a9c4cfd5..6fbf5aa27 100644 --- a/src/target_tools/llms/src/utils.py +++ b/src/target_tools/llms/src/utils.py @@ -301,13 +301,30 @@ def get_prompt(prompt_id, file_path, answers_placeholders=True, use_system_promp else: prompt = copy.deepcopy(eval(f"prompts.{prompt_id}_no_sys")) prompt[0]["content"] = prompt[0]["content"].format(**prompt_data) + elif prompt_id in ["prompt_template_masked_code_based_1"]: + prompt_data = { + "code": code, + "instructions": ( + "You are given a Python code snippet where all type annotations are currently represented by the placeholder '[MASK]'. " + "Your task is to replace '[MASK]' with the most appropriate Python type annotations, such as 'str', 'int', 'callable', etc., " + "for all function return types, variable annotations, and function parameters. " + "\n\nStrict Requirements:\n" + "1. Maintain the exact same structure, formatting, and indentation as in the input code.\n" + "2. Do not alter the line numbers or remove existing blank lines.\n" + "3. Do not add any additional blank lines or comments.\n" + "4. Do not add any explanations or extra information in the output.\n" + "5. Only return the annotated version of the code.\n" + "6. Ensure proper and consistent type annotations wherever applicable." + ), + } - else: - logger.error("ERROR! 
Prompt not found!") - sys.exit(-1) - - get_token_count(f"{prompt[0]['content']}{prompt[1]['content']}", prompt_id) - + if use_system_prompt: + prompt = copy.deepcopy(eval(f"prompts.{prompt_id}")) + prompt[1]["content"] = "{instructions}\n\n{code}".format(**prompt_data) + else: + prompt = copy.deepcopy(eval(f"prompts.{prompt_id}_no_sys")) + prompt[0]["content"] = "{instructions}\n\n{code}".format(**prompt_data) + return prompt diff --git a/src/target_tools/real-world-llms/Dockerfile b/src/target_tools/real-world-llms/Dockerfile new file mode 100644 index 000000000..2b47a1e08 --- /dev/null +++ b/src/target_tools/real-world-llms/Dockerfile @@ -0,0 +1,31 @@ +# Pull the Python base image +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONUNBUFFERED 1 +ENV CUDA_HOME=/usr/local/cuda +ENV PATH=/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} +# Set work directory +WORKDIR /app + +# Install dependencies +RUN apt-get update \ + && apt-get -y install gcc python3-packaging python3-pip python3-dev python3-venv python3-setuptools git + +COPY requirements.txt /app/requirements.txt + +RUN pip install --upgrade pip +RUN pip install setuptools +RUN pip install torch==2.1.2 torchvision==0.16.2 +RUN pip install -r requirements.txt +RUN pip install flash-attn + +RUN ln -s /usr/bin/python3 /usr/bin/python + + +COPY src /tmp/src + +# Keep the container alive +CMD ["bash"] diff --git a/src/target_tools/real-world-llms/README.md b/src/target_tools/real-world-llms/README.md new file mode 100644 index 000000000..260962315 --- /dev/null +++ b/src/target_tools/real-world-llms/README.md @@ -0,0 +1,56 @@ +# 🧠 TypeEvalPy: Type Inference Evaluation for Python + +This adapter provides tools for: +- Preprocessing the [ManyTypes4Py](https://github.com/saltudelft/many-types-4-py) dataset +- Running type inference using large language models (LLMs) +- Analyzing and evaluating results on real-world Python benchmarks + +--- + +## 📦 1. Data Preprocessing + +Start by downloading and preparing the [ManyTypes4Py](https://github.com/saltudelft/many-types-4-py) dataset. + +### Steps: + +1. Clone the dataset repository: + ```bash + git clone https://github.com/saltudelft/many-types-4-py + cd many-types-4-py + +2. Run the preprocessing script to generate ground truth files for training, validation, and testing: + +```bash +python3 prepare_dataset.py +``` + +This will create the required train, test, and valid files with annotated types. + + +## 🚀 2. Running Model Inference + +Use the `runner.py` script to run inference using various LLMs on your benchmark dataset. + +```bash +python3.10 runner.py \ +--bechmark_path /mnt/hf_cache/rashida_manytype4py/many-types-4-py-dataset/rw-benchmark \ +--prompt_id prompt_template_questions_based_2 \ +--models codestral-v0.1-22b qwen2.5-Coder-7B-Instruct \ +--hf_token \ +--openai_key \ +--enable_streaming True \ +--models_config /home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/models_config.yaml \ +--results_dir /home/ssegpu/rashida/TypeEvalPy/results +``` + +🔑 Note: Replace and with your actual API credentials. + +## 📊 3. Result Evaluation + +After inference is complete, go to result_analyzer module and evaluate the predictions using: + +```bash +python3 large_scale_analysis.py +``` + +This will generate ```analysis.txt``` in the model results. 
\ No newline at end of file diff --git a/src/target_tools/real-world-llms/models_info.md b/src/target_tools/real-world-llms/models_info.md new file mode 100644 index 000000000..ffa1dd3ef --- /dev/null +++ b/src/target_tools/real-world-llms/models_info.md @@ -0,0 +1,45 @@ +# Models + +- codellama:7b-python +- codellama:13b-python +- codellama:34b-python +- codellama:7b-instruct +- codellama:13b-instruct +- codellama:34b-instruct +- llama2:7b +- llama2:13b +- llama2:70b +- vicuna:7b +- vicuna:13b +- vicuna:33b +- phind-codellama:34b-v2 +- phind-codellama:34b-python +- wizardcoder:7b-python +- wizardcoder:13b-python +- wizardcoder:34b-python +- orca2:7b +- orca2:13b +- gpt-3.5-turbo +- gpt-4 + +# HG + +- codellama/CodeLlama-7b-Python-hf +- codellama/CodeLlama-13b-Python-hf +- codellama/CodeLlama-34b-Python-hf +- codellama/CodeLlama-7b-Instruct-hf +- codellama/CodeLlama-13b-Instruct-hf +- codellama/CodeLlama-34b-Instruct-hf +- meta-llama/Llama-2-7b-hf +- meta-llama/Llama-2-13b-hf +- meta-llama/Llama-2-70b-hf +- lmsys/vicuna-7b-v1.5 +- lmsys/vicuna-13b-v1.5 +- lmsys/vicuna-33b-v1.3 +- Phind/Phind-CodeLlama-34B-v2 +- Phind/Phind-CodeLlama-34B-Python-v1 +- WizardLM/WizardCoder-Python-7B-V1.0 +- WizardLM/WizardCoder-Python-13B-V1.0 +- WizardLM/WizardCoder-Python-34B-V1.0 +- microsoft/Orca-2-7b +- microsoft/Orca-2-13b diff --git a/src/target_tools/real-world-llms/requirements.txt b/src/target_tools/real-world-llms/requirements.txt new file mode 100644 index 000000000..e68091c56 --- /dev/null +++ b/src/target_tools/real-world-llms/requirements.txt @@ -0,0 +1,17 @@ +requests +langchain +langchain_openai +openai +vllm +einops +pytest +bitsandbytes +accelerate +ninja +packaging +peft +torch==2.1.2 +torchvision==0.16.2 +torchaudio +tiktoken +libcst diff --git a/src/target_tools/real-world-llms/src/code_annotator.py b/src/target_tools/real-world-llms/src/code_annotator.py new file mode 100644 index 000000000..244559884 --- /dev/null +++ b/src/target_tools/real-world-llms/src/code_annotator.py @@ -0,0 +1,128 @@ +import libcst as cst +import os + +class TypeAnnotatorTransformer(cst.CSTTransformer): + def leave_FunctionDef( + self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef + ) -> cst.FunctionDef: + # Replace parameters and return type annotations with [MASK] + mask_annotation = cst.Annotation(cst.parse_expression("[MASK]")) + + new_params = [ + param.with_changes(annotation=mask_annotation) + for param in updated_node.params.params + ] + + # Annotate *args with [MASK] if it exists + new_star_arg = updated_node.params.star_arg + if isinstance(new_star_arg, cst.Param): + new_star_arg = new_star_arg.with_changes(annotation=mask_annotation) + + # Annotate **kwargs with [MASK] if it exists + new_star_kwarg = updated_node.params.star_kwarg + if isinstance(new_star_kwarg, cst.Param): + new_star_kwarg = new_star_kwarg.with_changes(annotation=mask_annotation) + + # Annotate keyword-only arguments after * with [MASK] if they exist + new_kwonly_params = [ + kwonly_param.with_changes(annotation=mask_annotation) + if isinstance(kwonly_param, cst.Param) else kwonly_param + for kwonly_param in updated_node.params.kwonly_params + ] + + # Replace return type with [MASK] + new_returns = mask_annotation + + # Return the updated function definition + return updated_node.with_changes( + params=updated_node.params.with_changes( + params=new_params, + star_arg=new_star_arg, + star_kwarg=new_star_kwarg, + kwonly_params=new_kwonly_params + ), + returns=new_returns, + ) + + def leave_AnnAssign( + self, 
original_node: cst.AnnAssign, updated_node: cst.AnnAssign + ) -> cst.BaseStatement: + # Replace any existing annotation with [MASK] + return updated_node.with_changes( + annotation=cst.Annotation(cst.parse_expression("[MASK]")) + ) + + def _extract_names(self, target): + """ + Recursively extract names from tuples, lists, and generator expressions for individual annotation. + """ + names = [] + if isinstance(target, (cst.Tuple, cst.List)): + # Handle tuple and list unpacking + for element in target.elements: + names.extend(self._extract_names(element.value)) + elif isinstance(target, cst.Name): + # Direct variable name + names.append(target) + elif isinstance(target, cst.Attribute): + # Handle attribute names like self.width + names.append(target) + elif isinstance(target, cst.GeneratorExp): + # Handle generator expressions by treating each target as a separate variable + for comprehension in target.for_in: + names.extend(self._extract_names(comprehension.target)) + return names + + def leave_Assign( + self, original_node: cst.Assign, updated_node: cst.Assign + ) -> cst.BaseStatement: + annotations = [] + mask_annotation = cst.Annotation(cst.parse_expression("[MASK]")) + + # Process each target in the assignment, including complex unpacking + for target in updated_node.targets: + extracted_names = self._extract_names(target.target) + for name in extracted_names: + # Add annotation for each extracted variable + annotations.append( + cst.AnnAssign( + target=name, + annotation=mask_annotation, + value=None # Set value to None for individual annotations + ) + ) + + # Add the original assignment after the individual annotations + annotations.append(updated_node) + + # Flatten the list of annotated statements + return cst.FlattenSentinel(annotations) + +def process_file(file_path): + with open(file_path, "r") as source_code: + code = source_code.read() + + # Parse the file into a concrete syntax tree (CST) + tree = cst.parse_module(code) + + # Apply the type annotation transformer + transformer = TypeAnnotatorTransformer() + modified_tree = tree.visit(transformer) + + # Write the modified code back to the original file + with open(file_path, "w") as original_file: + original_file.write(modified_tree.code) + + print(f"Processed and saved: {file_path}") + +# Process all .py files in a specified directory +def main(): + root_directory = '/home/ssegpu/rashida/TypeEvalPy/micro-benchmark/python_features' + for subdir, _, files in os.walk(root_directory): + for file_name in files: + if file_name.endswith('.py'): + file_path = os.path.join(subdir, file_name) + process_file(file_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/dataset-preprocessing/prepare_dataset.py b/src/target_tools/real-world-llms/src/dataset-preprocessing/prepare_dataset.py new file mode 100644 index 000000000..df598f17f --- /dev/null +++ b/src/target_tools/real-world-llms/src/dataset-preprocessing/prepare_dataset.py @@ -0,0 +1,192 @@ +import json +import os +from translate import translate_content +import libcst as cst +from typehint_clean import typing_clean + +# Get the absolute path of the current script's directory +script_directory = os.path.dirname(os.path.abspath(__file__)) + +# Define paths to the dataset, base, and output directories +json_directory = os.path.join( + script_directory, + "../downloaded-dataset/ManyTypes4PyDataset-v0.7/processed_projects_clean", +) +base_directory = os.path.join(script_directory, "../") + +# Convert relative paths to 
absolute paths +json_directory = os.path.abspath(json_directory) +base_directory = os.path.abspath(base_directory) + +# Dictionary to organize the dataset by split: train, test, valid +output_data = {"train": {}, "test": {}, "valid": {}} + +# Track the number of JSON files that were successfully processed (not deleted) +non_deleted_json_count = 0 + + +def read_file_content(file_path): + """ + Reads and returns the content of a source file. + Returns an empty string on error. + """ + try: + with open(file_path, "r", encoding="utf-8") as file: + return file.read() + except FileNotFoundError: + return "" + except Exception as e: + print(f"An error occurred while reading {file_path}: {e}") + return "" + + +def update_json(json_path, base_dir, avoid_files=[]): + """ + Updates a single JSON file by: + - Attaching 'source_code' to each source file + - Applying LibCST-safe type hint stripping + - Classifying data into train/test/valid splits + - Removing JSON file if all files are missing + """ + global non_deleted_json_count + global libcst_fail_count + + # Load the JSON data + try: + with open(json_path, "r", encoding="utf-8") as file: + data = json.load(file) + except (FileNotFoundError, json.JSONDecodeError) as e: + print(f"Error loading {json_path}: {e}") + return + + all_files_missing = True + libcst_fail_count = 0 + + # Iterate through each project in the JSON + for project_name, project_data in data.items(): + if "src_files" in project_data: + for file_name, attributes in project_data["src_files"].items(): + # Skip blacklisted files + if file_name in avoid_files.get("file_paths", []): + continue + + # Read and clean the file content + full_path = os.path.join(base_dir, file_name) + file_content = read_file_content(full_path) + + if file_content != "": + all_files_missing = False + + # Remove incompatible type hints + file_content, used_fallback = typing_clean.strip(file_content) + + if used_fallback: + libcst_fail_count += 1 + + # Attach source code to attributes + attributes["source_code"] = file_content + + if file_content == "": + continue + + # Save by split type (train/test/valid) + split_type = attributes.get("set") + if split_type in ["train", "test", "valid"]: + if project_name not in output_data[split_type]: + output_data[split_type][project_name] = {"src_files": {}} + output_data[split_type][project_name]["src_files"][ + file_name + ] = attributes + + if all_files_missing: + # Delete the file if all sources are missing + try: + os.remove(json_path) + except Exception as e: + print(f"Error deleting {json_path}: {e}") + else: + # Save the updated JSON + try: + with open(json_path, "w", encoding="utf-8") as file: + json.dump(data, file, indent=4) + non_deleted_json_count += 1 + print(f"File failed to parse with LibCST: {libcst_fail_count}") + except Exception as e: + print(f"Error writing {json_path}: {e}") + + +def process_json_files_in_directory(directory, base_dir): + """ + Processes all JSON files in a directory: + - Loads avoid-list for oversized files + - Updates each JSON file with source code + - Categorizes data into splits + - Saves split datasets to new JSON files + """ + # Load the list of files to avoid processing (too large or token-limit exceeded) + try: + with open( + "/mnt/hf_cache/rashida_manytype4py/many-types-4-py-dataset/scripts/.scrapy/exceeded_token_limit_files.json", + "r", + encoding="utf-8", + ) as avoid_file: + avoid_files = json.load(avoid_file) + except Exception as e: + print(f"Error loading avoid list: {e}") + avoid_files = [] + + # Process each JSON 
file in the directory + for file_name in os.listdir(directory): + if file_name.endswith(".json"): + json_path = os.path.join(directory, file_name) + update_json(json_path, base_dir, avoid_files) + + # Organize split output files into rw-benchmark/train|test|valid + split_dataset_dir = os.path.join(base_dir, "rw-benchmark") + os.makedirs(split_dataset_dir, exist_ok=True) + + for split_type in ["train", "test", "valid"]: + split_dir = os.path.join(split_dataset_dir, split_type) + os.makedirs(split_dir, exist_ok=True) + + output_file_path = os.path.join(split_dir, f"{split_type}.json") + try: + with open(output_file_path, "w", encoding="utf-8") as output_file: + json.dump(output_data[split_type], output_file, indent=4) + print( + f"{split_type}.json file created successfully in {split_type} folder." + ) + except Exception as e: + print(f"Error writing {split_type}.json: {e}") + + +def translate_and_save_json_files(split_data_directory): + """ + Translates the JSON data (e.g., source code) in each split file and + saves a corresponding _gt.json (ground truth) file. + """ + splits = ["train", "test", "valid"] + for split in splits: + input_file_path = os.path.join(split_data_directory, split, f"{split}.json") + output_file_path = os.path.join(split_data_directory, split, f"{split}_gt.json") + + try: + with open(input_file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + translated_data = translate_content(data) + + with open(output_file_path, "w", encoding="utf-8") as file: + json.dump(translated_data, file, indent=4) + print(f"Translated {split}.json file saved to {output_file_path}") + except Exception as e: + print(f"Error translating {input_file_path}: {e}") + + +# === Script Execution === + +# Step 1: Process and clean all project JSON files +process_json_files_in_directory(json_directory, base_directory) + +# Step 2: Translate source code content in each split and save ground-truth versions +translate_and_save_json_files(os.path.join(base_directory, "rw-benchmark")) diff --git a/src/target_tools/real-world-llms/src/dataset-preprocessing/translate.py b/src/target_tools/real-world-llms/src/dataset-preprocessing/translate.py new file mode 100644 index 000000000..8cfe04632 --- /dev/null +++ b/src/target_tools/real-world-llms/src/dataset-preprocessing/translate.py @@ -0,0 +1,234 @@ +import argparse +import json +import os +import re +from pathlib import Path +import utils + +def normalize_type(type_str, nested_level=0): + """ + Normalize type strings by: + - Removing module prefixes like 'builtins.' or 'typing.' + - Mapping ambiguous or inconsistent type names to standard Python types + - Simplifying nested generics (e.g., List[List[Tuple[str]]]) into general types like Any beyond a certain depth + + Args: + type_str (str): The type string to normalize. + nested_level (int): Current nesting depth, used to replace deep generics with 'Any'. + + Returns: + str: A cleaned and normalized type string. 
+ """ + if type_str is None: + return None + + # Remove quotes + if type_str.startswith('"') and type_str.endswith('"'): + type_str = type_str.strip('"') + + type_mappings = { + "builtins.": "", + "typing.": "", + } + + # Known incorrect or inconsistent types to correct + additional_type_mappings = { + "integer": "int", + "string": "str", + "dictonary": "dict", + "method": "Callable", + "func": "Callable", + "function": "Callable", + "none": "None", + "Nonetype": "None", + "nonetype": "None", + "NoneType": "None", + } + + # Replace module prefixes + for prefix, replacement in type_mappings.items(): + type_str = type_str.replace(prefix, replacement) + + # Apply known mappings + type_str = additional_type_mappings.get(type_str, type_str) + + # Handle generic types (e.g. List[str], Tuple[int, float]) + if "[" in type_str and "]" in type_str: + base_type, generic_content = type_str.split("[", 1) + generic_content = generic_content.rsplit("]", 1)[0] + + # Parse inner parameters + generic_params = [] + bracket_level = 0 + param = "" + for char in generic_content: + if char == "[": + bracket_level += 1 + param += char + elif char == "]": + bracket_level -= 1 + param += char + elif char == "," and bracket_level == 0: + generic_params.append(param.strip()) + param = "" + else: + param += char + if param: + generic_params.append(param.strip()) + + # Replace deep nested types with 'Any' + if nested_level > 0: + normalized_params = ["Any"] + else: + normalized_params = [normalize_type(param, nested_level + 1) for param in generic_params] + + return f"{base_type}[{', '.join(normalized_params)}]" + + # Strip fully-qualified names + if "." in type_str: + return type_str.split(".")[-1] + + return type_str + + +def translate_pipeline(text, functions): + """ + Applies a pipeline of text transformation functions in sequence. + + Args: + text (str): The text to process. + functions (list): List of functions to apply. + + Returns: + str: Transformed text. + """ + for func in functions: + text = func(text) + return text + + +def extract_class_name(text): + """ + Extracts class name from a string representation like "". + + Args: + text (str): Input string. + + Returns: + str: Extracted class name or the original text. + """ + match = re.search(r"", text) + return match.group(1) if match else text + + +def extract_common_patterns(text): + """ + Attempts to extract type information from a string using various regex patterns. + + Args: + text (str): Natural language or formatted description. + + Returns: + str: First matching type string or original text if none matched. + """ + try: + pattern = ( + r"\|\|Return type of `.*?`:" + r" (\w+)|Return type: (\w+)|The return type of '.*?' is (\w+)|The type of" + r" '.*?' is a (\w+)|Type of `.*?`: (\w+)|Type of `.*?` is `(\w+)`|`.*?`" + r" return type: `(\w+)`|`.*?` is a function call that returns an (\w+)" + r" value|`(\w+)`: `\w+`|column \d+: `(\w+)`|column \d+ is '(\w+)'|type of" + r" '.*?': `(\w+)`" + ) + + matches = re.findall(pattern, text) + if matches: + for match in matches: + found_types = [m for m in match if m] + if found_types: + return found_types[0] + return text + except Exception: + return text + + +# Define the transformation pipeline +functions = [extract_class_name, extract_common_patterns] + + +def translate_content(data): + """ + Converts inconsistent type annotations in JSON data to normalized Python types. + + Args: + data (str or list): A JSON string or already-loaded list of dictionaries. + + Returns: + list: Data with normalized 'type' fields. 
+ """ + try: + if isinstance(data, str): + data = json.loads(data) + except Exception as e: + print(f"Not a valid JSON: {e}") + raise utils.JsonException + + for entry in data: + if "type" in entry: + entry["type"] = [normalize_type(entry["type"][0])] + else: + entry["type"] = [] + + return data + + +def list_json_files(folder_path): + """ + Recursively finds all `.json` files in a given folder. + + Args: + folder_path (str or Path): Directory to search. + + Returns: + list: List of Path objects for all `.json` files. + """ + return sorted(Path(folder_path).rglob("*.json")) + + +def main_translator(benchmark_path): + """ + Entry point for processing all JSON files in a dataset directory. + Normalizes types in each file and overwrites the original file. + + Args: + benchmark_path (str): Path to the root of the dataset to process. + """ + json_files = list_json_files(benchmark_path) + error_count = 0 + + for file in json_files: + try: + with open(file) as f: + data = json.load(f) + + translated = translate_content(data) + + json_data = json.dumps(translated, indent=4) + with open(file, "w") as file: + file.write(json_data) + + except Exception as e: + print(f"Command returned non-zero exit status: {e} for file: {file}") + error_count += 1 + + print(f"Translator finished with errors: {error_count}") + + +if __name__ == "__main__": + # Command-line interface to pass the benchmark path + parser = argparse.ArgumentParser() + parser.add_argument( + "--benchmark_path", help="Specify the benchmark path", required=True + ) + args = parser.parse_args() + main_translator(args.benchmark_path) \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/dataset-preprocessing/typing_clean.py b/src/target_tools/real-world-llms/src/dataset-preprocessing/typing_clean.py new file mode 100644 index 000000000..191c0098f --- /dev/null +++ b/src/target_tools/real-world-llms/src/dataset-preprocessing/typing_clean.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +strip_types_resilient.py + +Removes all type annotations from Python source files — even from files with *syntax errors*. + +The tool uses a prioritized fallback chain: +1. **LibCST**: Fast and preserves formatting; fails on syntax errors. +2. **strip-hints**: Token-driven; works better with broken syntax than AST-based tools. +3. **Tokenize fallback**: Custom regex/token-based solution as a last-resort best-effort approach. + +Dependencies: +- libcst +- strip-hints (>=0.1.13) +""" + +from __future__ import annotations +import argparse, io, re, sys, token, tokenize +from pathlib import Path +import ast + +# ────────────────────────────── LibCST strategy ─────────────────────────────── + +import libcst as cst # Raises ParserSyntaxError for invalid syntax + +def is_valid_syntax(code: str) -> bool: + """ + Check if code is syntactically valid using Python's built-in `ast` parser. 
+ """ + try: + ast.parse(code) + return True + except SyntaxError: + return False + +class _StripCST(cst.CSTTransformer): + """ + LibCST transformer class to remove: + - Parameter annotations + - Return type annotations + - Annotated assignments (AnnAssign) + """ + def leave_Param(self, original_node, updated_node): + return updated_node.with_changes(annotation=None) + + def leave_FunctionDef(self, original_node, updated_node): + return updated_node.with_changes(returns=None) + + def leave_AsyncFunctionDef(self, original_node, updated_node): + return updated_node.with_changes(returns=None) + + def leave_AnnAssign(self, original_node, updated_node): + if updated_node.value is None: + return cst.RemovalSentinel.REMOVE + return cst.Assign((cst.AssignTarget(target=updated_node.target),), updated_node.value) + +def _via_libcst(code: str) -> str: + """ + Attempt to strip annotations using LibCST (preserves formatting and comments). + """ + try: + mod = cst.parse_module(code) + return mod.visit(_StripCST()).code + except Exception as e: + print(f"[libcst transformer failed] {type(e).__name__}: {e}", file=sys.stderr) + raise + +# ────────────────────────────── strip‑hints strategy ─────────────────────────────── + +def _via_strip_hints(code: str) -> str: + """ + Use `strip-hints` to remove annotations in a token-based way. + """ + from strip_hints import strip_string_to_string + return strip_string_to_string(code, to_empty=False, no_ast=True) + +# ────────────────────────────── tokenize fallback ─────────────────────────────── + +# Precompiled regex patterns for performance and clarity +PARAM_ANN = re.compile(r'(\b\w+\b)\s*:\s*[^,)=]+') # e.g., a: int → a +RETURN_ANN = re.compile(r'\s*->\s*[^:]+') # e.g., -> int +VAR_ANN_VAL = re.compile(r'^(\s*)([\w.]+)\s*:\s*[^=\n]+\s*=\s*') # e.g., x: int = 42 +VAR_ANN_BARE = re.compile(r'^\s*[\w.]+\s*:\s*[^=\n]+(\s*)(#.*)?$') # e.g., x: int + +def _via_tokenize(code: str) -> str: + """ + Last-resort fallback that strips type annotations using regex patterns, + while preserving formatting, indentation, and comments. + """ + out_lines: list[str] = [] + for ln in code.splitlines(): + + # --- Remove function signature annotations --- + if ln.lstrip().startswith("def ") and "(" in ln: + ln = RETURN_ANN.sub("", ln, count=1) + head, rest = ln.split("(", 1) + if ")" not in rest: + return rest, "" # Not a well-formed function, skip + params, tail = rest.split(")", 1) + params = PARAM_ANN.sub(r"\1", params) + ln = f"{head}({params}){tail}" + + # --- Remove variable / attribute annotations --- + is_header = ln.lstrip().startswith(( + "class ", "def ", "if ", "for ", "while ", "with ", + "try ", "except ", "elif ", "else ", "finally " + )) + + if not is_header and ":" in ln: + if VAR_ANN_VAL.search(ln): # Keep value, drop annotation + ln = VAR_ANN_VAL.sub(r"\1\2 = ", ln) + elif VAR_ANN_BARE.match(ln): # Drop bare annotation line + ln = "" + + out_lines.append(ln) + + return "\n".join(out_lines) + +# ────────────────────────────── Dispatcher ─────────────────────────────── + +def strip(code: str) -> tuple[str, bool]: + """ + Try all available methods to remove type annotations. 
+ + Returns: + A tuple of (stripped_code, used_fallback) + """ + if not is_valid_syntax(code): + print("[warn] Skipping LibCST due to invalid syntax", file=sys.stderr) + else: + try: + return _via_libcst(code), False + except Exception as e: + print(f"[warn] LibCST failed ({type(e).__name__}: {e}) – trying strip‑hints", file=sys.stderr) + + try: + return _via_strip_hints(code), True + except Exception as e: + print(f"[warn] strip‑hints failed ({type(e).__name__}) – falling back to tokenize", file=sys.stderr) + + return _via_tokenize(code), True + +# ────────────────────────────── CLI entry point ─────────────────────────────── + +def main() -> None: + """ + Main function for CLI usage. + Reads a hardcoded input path, strips types, and writes to 'stripped.py'. + """ + input_path = Path("/mnt/hf_cache/rashida_manytype4py/many-types-4-py-dataset/scripts/typehint_clean/typing_showcase.py") + output_path = Path("stripped.py") + + src = input_path.read_text(encoding="utf-8", errors="replace") + code, used_fallback = strip(src) + output_path.write_text(code, encoding="utf-8") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/fine_tuning/generate_ft_jsonl.py b/src/target_tools/real-world-llms/src/fine_tuning/generate_ft_jsonl.py new file mode 100644 index 000000000..d7295553f --- /dev/null +++ b/src/target_tools/real-world-llms/src/fine_tuning/generate_ft_jsonl.py @@ -0,0 +1,140 @@ +import sys +import os +import json +from pathlib import Path + +# Add the parent directory to the system path to import utils +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +import utils + + +def generate_ft_json(prompt_template, json_files, output_path, use_system_prompt=False): + """ + Generates a fine-tuning JSONL file from a list of JSON files, sorted by prompt length. + + Args: + prompt_template (str): Template name for generating prompts. + json_files (List[Path]): List of JSON input file paths. + output_path (Path): Output path for the resulting JSONL. + use_system_prompt (bool): Whether to include system prompt formatting. + """ + id_mapping = get_prompt_mapping(prompt_template, use_system_prompt, json_files) + + # Sort prompts by token count in descending order + prompt_id_mapping_pairs = [(x["prompt"], x) for x in id_mapping.values()] + sorted_prompt_id_mapping_pairs = sorted( + prompt_id_mapping_pairs, key=lambda x: utils.get_token_count(x[0]), reverse=True + ) + + # Reconstruct the sorted mapping with new indices + sorted_id_mapping = { + i: pair[1] for i, pair in enumerate(sorted_prompt_id_mapping_pairs) + } + + # Save as JSONL + utils.dump_ft_jsonl(sorted_id_mapping, output_path) + + +def get_prompt_mapping(prompt_template, use_system_prompt, json_files): + """ + Reads paired code and ground-truth files, generates prompts for fine-tuning. + + Args: + prompt_template (str): Name of the prompt template to use. + use_system_prompt (bool): Whether to include system-level prompts. + json_files (List[Path]): List of all .json files in the dataset. + + Returns: + dict: Mapping from integer IDs to prompt metadata. 
+ """ + id_mapping = {} + idx = 0 + + # Create pairs of source JSON and _gt.json files + json_pairs = {} + for code_json in json_files: + if code_json.name.endswith("_gt.json"): + continue + gt_json = code_json.parent / f"{code_json.stem}_gt.json" + if gt_json.exists(): + json_pairs[code_json] = gt_json + + for code_json, gt_json in json_pairs.items(): + with open(code_json, "r") as code_file: + code_data = json.load(code_file) + with open(gt_json, "r") as gt_file: + gt_data = json.load(gt_file) + + # Build file-based lookup for GT entries + gt_mapping = {} + for entry in gt_data: + file_path = entry["file"] + gt_mapping.setdefault(file_path, []).append(entry) + + # Create prompts per source file + for project_name, project_info in code_data.items(): + src_files = project_info.get("src_files", {}) + + for file_path, file_info in src_files.items(): + source_code = file_info.get("source_code", "") + if not source_code: + continue + + type_info_list = gt_mapping.get(file_path, []) + if not type_info_list: + continue + + prompt = utils.get_prompt( + prompt_template, + source_code, + type_info_list, + use_system_prompt=use_system_prompt, + file_path=file_path, + token_limit=4000, + ) + + if prompt is None: + continue + + id_mapping[idx] = { + "project_name": project_name, + "file_path": file_path, + "json_filepath": str(gt_json), + "prompt": prompt, + } + idx += 1 + + return id_mapping + + +def list_json_files(folder_path): + """Recursively lists all .json files in the specified folder.""" + return sorted(Path(folder_path).rglob("*.json")) + + +def main(): + prompt_template = "prompt_template_questions_based_2" + base_input_dir = Path( + "/mnt/hf_cache/rashida_manytype4py/many-types-4-py-dataset/rw-benchmark" + ) + base_output_dir = Path("./dataset") + base_output_dir.mkdir(parents=True, exist_ok=True) + + # Run for each of train, test, valid + for split in ["train", "test", "valid"]: + split_input_dir = base_input_dir / split + if not split_input_dir.exists(): + print(f"[warn] Skipping missing split: {split}") + continue + + json_files = list_json_files(split_input_dir) + output_path = base_output_dir / f"{split}.jsonl" + print(f"Processing split '{split}' with {len(json_files)} files...") + generate_ft_json( + prompt_template, json_files, output_path, use_system_prompt=False + ) + print(f"Saved: {output_path}") + + +if __name__ == "__main__": + main() diff --git a/src/target_tools/real-world-llms/src/fine_tuning/unsloth_train.py b/src/target_tools/real-world-llms/src/fine_tuning/unsloth_train.py new file mode 100644 index 000000000..784355055 --- /dev/null +++ b/src/target_tools/real-world-llms/src/fine_tuning/unsloth_train.py @@ -0,0 +1,127 @@ +# Environment setup (commented out for context) +# These commands were likely used in a Jupyter notebook to install required packages. +# You can uncomment and use them if running interactively. 
+ +# %%capture +# %pip install unsloth +# %pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git + +# %pip uninstall torch torchaudio torchvision xformers vllm +# %pip install torch==2.5.0 torchaudio==2.5.0 torchvision==0.20.0 +# %pip check + +import torch + +# ------------------------------------------------------------- +# 🚀 Load model using Unsloth's FastLanguageModel wrapper +# ------------------------------------------------------------- +from unsloth import FastLanguageModel + +# Define model loading parameters +max_seq_length = 4000 # Maximum sequence length supported +dtype = None # Auto-detect (Float16 for V100/T4, BFloat16 for A100 etc.) +load_in_4bit = True # Use 4-bit quantized model for memory efficiency + +# List of compatible 4-bit models for faster loading/training (optional reference) +fourbit_models = [ + "unsloth/Meta-Llama-3.1-8B-bnb-4bit", + "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", + "unsloth/Meta-Llama-3.1-70B-bnb-4bit", + "unsloth/Meta-Llama-3.1-405B-bnb-4bit", + "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", + "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", + "unsloth/mistral-7b-v0.3-bnb-4bit", + "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", + "unsloth/Phi-3.5-mini-instruct", + "unsloth/Phi-3-medium-4k-instruct", + "unsloth/gemma-2-9b-bnb-4bit", + "unsloth/gemma-2-27b-bnb-4bit", +] + +# Load the pretrained model and tokenizer +model, tokenizer = FastLanguageModel.from_pretrained( + model_name="mistralai/Codestral-22B-v0.1", # Original 22B Codestral model + max_seq_length=max_seq_length, + dtype=torch.bfloat16, + load_in_4bit=load_in_4bit, + # token="", +) + +# ------------------------------------------------------------- +# 🔧 Convert the model to a PEFT (LoRA) model +# ------------------------------------------------------------- +model = FastLanguageModel.get_peft_model( + model, + r=16, # LoRA rank (8-128 recommended depending on memory/batch size) + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ], + lora_alpha=16, # LoRA scaling factor + lora_dropout=0, # Dropout (0 is best optimized) + bias="none", # Bias handling (recommended "none" for efficiency) + use_gradient_checkpointing="unsloth", # Saves VRAM using Unsloth's efficient implementation + random_state=3407, # Reproducibility + use_rslora=False, # Optionally enable rank-stabilized LoRA + loftq_config=None, # Optional quantization-aware training +) + +# ------------------------------------------------------------- +# 📦 Load dataset for training +# ------------------------------------------------------------- +from datasets import load_dataset + +# Assumes a preprocessed HuggingFace-compatible dataset in the given folder +dataset = load_dataset( + "/home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/finetunellms/dataset", + split="train", +) + +# ------------------------------------------------------------- +# 🧠 Train the LoRA fine-tuned model using TRL's SFTTrainer +# ------------------------------------------------------------- +from trl import SFTTrainer +from transformers import TrainingArguments +from unsloth import is_bfloat16_supported # Check for BF16 support (Ampere+ GPUs) + +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=dataset, + dataset_text_field="messages", # Field name containing the text (JSON format) + max_seq_length=max_seq_length, + dataset_num_proc=2, # Parallel processing threads + packing=False, # Enable if packing multiple examples 
per input + args=TrainingArguments( + per_device_train_batch_size=10, # Adjust depending on GPU RAM + gradient_accumulation_steps=4, # Effective batch size = 40 + warmup_steps=5, # LR warmup + num_train_epochs=2, # Train for 2 epochs + learning_rate=2e-4, # Typical learning rate + fp16=not is_bfloat16_supported(), # Use FP16 if BF16 unsupported + bf16=is_bfloat16_supported(), # Use BF16 if supported + logging_steps=1, # Log every step + optim="adamw_8bit", # 8-bit optimizer for memory efficiency + weight_decay=0.01, # Regularization + lr_scheduler_type="linear", # Linear decay LR scheduler + seed=3407, # Reproducibility + output_dir="outputs", # Output folder for logs/checkpoints + report_to="none", # Disable W&B or other logging tools + ), +) + +# ------------------------------------------------------------- +# 🚀 Start training and capture stats +# ------------------------------------------------------------- +trainer_stats = trainer.train() + +# ------------------------------------------------------------- +# 💾 Save the fine-tuned model and tokenizer locally +# ------------------------------------------------------------- +model.save_pretrained("finetuned_Codestral-22B-v0.1") +tokenizer.save_pretrained("finetuned_Codestral-22B-v0.1") diff --git a/src/target_tools/real-world-llms/src/generate_jsonl_finetuning.py b/src/target_tools/real-world-llms/src/generate_jsonl_finetuning.py new file mode 100644 index 000000000..deb730ef4 --- /dev/null +++ b/src/target_tools/real-world-llms/src/generate_jsonl_finetuning.py @@ -0,0 +1,115 @@ +import json +import os +from pathlib import Path + +import fine_tuning +import prompts +import utils +from runner import get_prompt + + +def process_file(file_path): + with open(file_path, "r") as f: + return f.read() + + +def generate_jsonl_gpt(folder_path, output_file, prompt_id): + messages_list = [] + system_prompt = eval(f"prompts.{prompt_id}_system") + + system_message = { + "role": "system", + "content": system_prompt, + } + + # Find all subdirectories in the root folder + subdirectories = [ + os.path.join(folder_path, d) + for d in os.listdir(folder_path) + if os.path.isdir(os.path.join(folder_path, d)) + ] + + # Traverse all files in each sample + for finetuning_sample in sorted(subdirectories): + message = [] + message.append(system_message) + + print(f"Processing {finetuning_sample}...") + for root, _, files in os.walk(finetuning_sample): + code_file = os.path.join(root, files[0]) + gt_file = os.path.join(root, files[1]) + + # User message + user_message = { + "role": "user", + "content": get_prompt(prompt_id, code_file, gt_file), + } + message.append(user_message) + + # Assistant message + assistant_message = { + "role": "assistant", + "content": utils.generate_answers_for_fine_tuning(gt_file), + } + message.append(assistant_message) + + messages_list.append(json.dumps({"messages": message}, separators=(",", ":"))) + + # Write messages to the output file + with open(output_file, "w") as output: + for _m in messages_list: + output.write(_m) + output.write("\n") + + +def generate_jsonl_llama(folder_path, output_file, prompt_id): + messages_list = [] + + # Find all subdirectories in the root folder + subdirectories = [ + os.path.join(folder_path, d) + for d in os.listdir(folder_path) + if os.path.isdir(os.path.join(folder_path, d)) + ] + + # Traverse all files in each sample + for finetuning_sample in sorted(subdirectories): + print(f"Processing {finetuning_sample}...") + for root, _, files in os.walk(finetuning_sample): + code_file = os.path.join(root, 
files[0]) + gt_file = os.path.join(root, files[1]) + + full_text = get_prompt( + prompt_id, code_file, gt_file, answers_placeholders=True + ) + utils.generate_answers_for_fine_tuning(gt_file) + + full_message = { + "text": full_text, + } + + messages_list.append(json.dumps(full_message, separators=(",", ":"))) + + # Write messages to the output file + with open(output_file, "w") as output: + for _m in messages_list: + output.write(_m) + output.write("\n") + + +# fine_tuning.generate_jsonl(folder_path, output_file, system_prompt, main_prompt) +if __name__ == "__main__": + SCRIPT_DIR = Path(os.path.abspath(os.path.dirname(__file__))) + + # Create fine tuning dataset + folder_path = SCRIPT_DIR / "fine_tuning" / "training_set" + output_file_gpt = SCRIPT_DIR / "fine_tuning" / "finetuning_autoset_gpt_v1.5.jsonl" + output_file_llama = ( + SCRIPT_DIR / "fine_tuning" / "finetuning_autoset_llama_v1.5.jsonl" + ) + + # Prepare prompts + prompt_id_gpt = "questions_based_2" + prompt_id_llama = "questions_based_2_ft" + + generate_jsonl_gpt(folder_path, output_file_gpt, prompt_id_gpt) + generate_jsonl_llama(folder_path, output_file_llama, prompt_id_llama) diff --git a/src/target_tools/real-world-llms/src/gpt_batch_converter.py b/src/target_tools/real-world-llms/src/gpt_batch_converter.py new file mode 100644 index 000000000..3e9bb2208 --- /dev/null +++ b/src/target_tools/real-world-llms/src/gpt_batch_converter.py @@ -0,0 +1,133 @@ +from pathlib import Path +import os +import utils +import json +from runner import create_result_json_file + + +def list_python_files(folder_path): + python_files = sorted(Path(folder_path).rglob("main.py")) + return python_files + + +models = [ + # { + # "name": "gpt-4o_hg_cs", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o_hg_cs-batch_1PSl4bmOdffJdyRraiOPbleP.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o_js", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o_js-batch_YE7JzcBsKLSeZ7CeoeGIkCdZ.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o_pycg", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o_pycg-batch_uGBHKpb5oeIHVaBNXUQCdipr.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o-mini_hg_cs", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o-mini_hg_cs-batch_3dK2KOYlkhovzS7Qmqbs0Cnc.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o-mini_js", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o-mini_js-batch_rbNCMtcZIUyzD9aZ4QQ2RsIu.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o-mini_pycg", + # "path": 
"/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o-mini_pycg-batch_Nk9np2B9rhzD3AaDgMZEU7oB.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o_autogen", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o_autogen-batch_sjDtwbjl3IB6GRZg5XP4svqS.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + # { + # "name": "gpt-4o-mini_autogen", + # "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o-mini_autogen-batch_glNBmyi30uDTH0EuG3c6jTov.jsonl", + # "bechmark_path": Path( + # "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/autogen_typeevalpy_benchmark" + # ), + # }, + { + "name": "gpt-4o_micro", + "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o_micro-batch_GL3PSRMWW25MNowx8ERWCDhD.jsonl", + "bechmark_path": Path( + "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/micro-benchmark" + ), + }, + { + "name": "gpt-4o-mini_micro", + "path": "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_prompts_results/gpt-4o-mini_micro-batch_zz7O1DOR6r7eVpvuvn8g6GgZ.jsonl", + "bechmark_path": Path( + "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/micro-benchmark" + ), + }, +] + + +def get_prompt_mapping(prompt_template, python_files, use_system_prompt=False): + id_mapping = { + idx: { + "file_path": file_path, + "json_filepath": str(file_path).replace(".py", "_gt.json"), + "result_filepath": str(file_path).replace(".py", f"_result.json"), + "result_dump_filepath": str(file_path).replace(".py", f"_result_dump.txt"), + "prompt": utils.get_prompt( + prompt_template, file_path, use_system_prompt=use_system_prompt + ), + } + for idx, file_path in enumerate(python_files) + } + + return id_mapping + + +prompt_template = "prompt_template_questions_based_2" +results_dir = Path( + "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_test/.scrapy/batch_results_micro" +) + +for model in models: + results_dst = Path(results_dir) / model["name"] / "micro-benchmark" + os.makedirs(results_dst, exist_ok=True) + + utils.copy_folder(model["bechmark_path"], results_dst) + + python_files = list_python_files(results_dst) + + id_mapping = get_prompt_mapping( + prompt_template, python_files, use_system_prompt=True + ) + + # read jsonl file and iterate over each line as json object + with open(model["path"], "r") as f: + for line in f: + fact_json = json.loads(line) + output_raw = fact_json["response"]["body"]["choices"][0]["message"][ + "content" + ] + r_id = int(fact_json["custom_id"].split("-")[-1]) + file_info = id_mapping[r_id] + print(id_mapping[r_id]["file_path"]) + print(fact_json["custom_id"]) + + create_result_json_file(file_info, output_raw, prompt_template) diff --git a/src/target_tools/real-world-llms/src/models_config.yaml b/src/target_tools/real-world-llms/src/models_config.yaml new file mode 100644 index 000000000..9b1983021 --- /dev/null +++ b/src/target_tools/real-world-llms/src/models_config.yaml @@ -0,0 +1,327 @@ +runner_config: + max_new_tokens: 1024 + temperature: 0.001 + +models: +- name: "qwen2-it-7b" + model_path: "Qwen/Qwen2-7B-Instruct" + quantization: 
"bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + comment: "Does not support VLLMs with quantization bitsandbytes" + +- name: "qwen2-it-72b" + model_path: "Qwen/Qwen2-72B-Instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 12 + comment: "Does not support VLLMs with quantization bitsandbytes" + +- name: "gemma2-it-9b" + model_path: "google/gemma-2-9b-it" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: false + use_vllms_for_evaluation: false + max_model_len: 4096 + batch_size: 3 + torch_dtype: "bfloat16" + +- name: "gemma2-it-27b" + model_path: "google/gemma-2-27b-it" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: false + use_vllms_for_evaluation: false + max_model_len: 4096 + batch_size: 3 + torch_dtype: "bfloat16" + +- name: "gemma2-it-2b" + model_path: "google/gemma-2-2b-it" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: false + use_vllms_for_evaluation: false + max_model_len: 4096 + batch_size: 3 + torch_dtype: "bfloat16" + +- name: "codellama-it-7b" + model_path: "meta-llama/CodeLlama-7b-Instruct-hf" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "codellama-it-13b" + model_path: "meta-llama/CodeLlama-13b-Instruct-hf" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "codellama-it-34b" + model_path: "meta-llama/CodeLlama-34b-Instruct-hf" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "llama3.1-it-8b" + model_path: "meta-llama/Meta-Llama-3.1-8B-Instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "llama3.1-it-70b" + model_path: "meta-llama/Meta-Llama-3.1-70B-Instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "tinyllama-1.1b" + model_path: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 2048 + batch_size: 3 + +- name: "phi3-small-it-7.3b" + model_path: "microsoft/Phi-3-small-128k-instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "phi3-medium-it-14b" + model_path: "microsoft/Phi-3-medium-128k-instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "phi3-mini-it-3.8b" + model_path: "microsoft/Phi-3-mini-128k-instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "phi3.5-mini-it-3.8b" + model_path: "microsoft/Phi-3.5-mini-instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "phi3.5-moe-it-41.9b" + model_path: 
"microsoft/Phi-3.5-MoE-instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "mixtral-v0.1-it-8x22b" + model_path: "mistralai/Mixtral-8x22B-Instruct-v0.1" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + comment: "Does not work with 1 GPU" + +- name: "mixtral-v0.1-it-8x7b" + model_path: "mistralai/Mixtral-8x7B-Instruct-v0.1" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "mistral-v0.3-it-7b" + model_path: "mistralai/Mistral-7B-Instruct-v0.3" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "mistral-nemo-it-2407-12.2b" + model_path: "mistralai/Mistral-Nemo-Instruct-2407" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "mistral-large-it-2407-123b" + model_path: "mistralai/Mistral-Large-Instruct-2407" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "codestral-v0.1-22b" + model_path: "mistralai/Codestral-22B-v0.1" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 4000 + batch_size: 50 + +- name: "qwen2.5-Coder-7B-Instruct" + model_path: "Qwen/Qwen2.5-Coder-7B-Instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 4000 + batch_size: 50 + +- name: "finetuned-codestral-v0.1-22b-without-any" + model_path: "rbharmal/finetuned_Codestral-22B-v0.1" + quantization: "bitsandbytes" + lora_repo: "/home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/finetunellms/finetuned_without_any_Codestral-22B-v0.1" + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 4000 + batch_size: 40 + +- name: "finetuned-qwen2.5-Coder-7B-Instruct-without-any" + model_path: "rbharmal/finetuned-qwen-2.5-Coder-7B-Instruct" + quantization: "bitsandbytes" + lora_repo: "/home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/finetunellms/finetuned_without_any_qwen2.5-Coder-7B-Instruct" + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 4000 + batch_size: 50 + +- name: "qwen2.5-Coder-14B-Instruct" + model_path: "Qwen/Qwen2.5-Coder-14B-Instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "qwen2.5-Coder-32B-Instruct" + model_path: "Qwen/Qwen2.5-Coder-32B-Instruct" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 8 + +- name: "llama-3.1-Nemotron-70B-Instruct-HF" + model_path: "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: 8192 + batch_size: 3 + +- name: "deepSeek-R1-Distill-Qwen-14B" + model_path: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B" + quantization: "bitsandbytes" + lora_repo: null + use_system_prompt: true + 
use_vllms_for_evaluation: false + max_model_len: 4000 + batch_size: 15 + +# MODELS NOT SUPPORTED CURRENTLY! + +# - name: "codellama-python-7b" +# model_path: "meta-llama/CodeLlama-7b-Python-hf" +# quantization: "bitsandbytes" +# lora_repo: null +# use_system_prompt: true +# use_vllms_for_evaluation: false +# max_model_len: 8192 +# batch_size: 12 + +# - name: "codellama-python-13b" +# model_path: "meta-llama/CodeLlama-13b-Python-hf" +# quantization: "bitsandbytes" +# lora_repo: null +# use_system_prompt: true +# use_vllms_for_evaluation: false +# max_model_len: 8192 +# batch_size: 12 + +# - name: "codellama-python-34b" +# model_path: "meta-llama/CodeLlama-34b-Python-hf" +# quantization: "bitsandbytes" +# lora_repo: null +# use_system_prompt: true +# use_vllms_for_evaluation: false +# max_model_len: 8192 +# batch_size: 12 + +custom_models: + - name: "tinyllama-ft-1.1b" + model_path: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + quantization: "bitsandbytes" + lora_repo: "/home/ssegpu/fine-tuning-apsv/models_single_label/TinyLlama-TinyLlama-1-1B-Chat-v1-0-single-label-v1" + use_system_prompt: true + use_vllms_for_evaluation: true + max_model_len: 2048 + batch_size: 12 + + +openai_models: +- name: "gpt-3.5-turbo" + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: null + +- name: "gpt-4o" + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: null + +- name: "gpt-4o-mini" + use_system_prompt: true + use_vllms_for_evaluation: false + max_model_len: null diff --git a/src/target_tools/real-world-llms/src/openai_helpers.py b/src/target_tools/real-world-llms/src/openai_helpers.py new file mode 100644 index 000000000..c64aceed6 --- /dev/null +++ b/src/target_tools/real-world-llms/src/openai_helpers.py @@ -0,0 +1,56 @@ +from langchain_openai import ChatOpenAI +import time +import random +from concurrent.futures import ThreadPoolExecutor, as_completed + + +def get_response(model, prompt): + time.sleep(random.randint(2, 4)) # Avoid rate limiting + try: + output = model.invoke(prompt) + except Exception as e: + print(f"Failed to process prompt: {prompt}") + print(e) + return "" + + return output.content + + +def process_requests( + model_name, + prompts, + openai_api_key, + temperature=0, + print_responses: bool = False, + max_new_tokens: int = 128, + max_workers: int = 1, +): + """Continuously process a list of prompts and handle the outputs.""" + model = ChatOpenAI( + model_name=model_name, + temperature=temperature, + openai_api_key=openai_api_key, + ) + responses = {} + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_article = { + executor.submit( + get_response, + model, + prompt, + ): req_id + for req_id, prompt in enumerate(prompts) + } + total_prompts = len(prompts) + completed_prompts = 0 + + for future in as_completed(future_to_article): + result = future.result() + req_id = future_to_article[future] + responses[req_id] = result + completed_prompts += 1 + print(f"Processed {completed_prompts}/{total_prompts}") + + # sort dict based on keys + responses = dict(sorted(responses.items())) + return responses.values() diff --git a/src/target_tools/real-world-llms/src/prompts.py b/src/target_tools/real-world-llms/src/prompts.py new file mode 100644 index 000000000..355a8603a --- /dev/null +++ b/src/target_tools/real-world-llms/src/prompts.py @@ -0,0 +1,662 @@ +typeevalpy_prompt_1 = """ +You will be provided with the following information: +1. Python code. The sample is delimited with triple backticks. +2. 
Sample JSON containing type inference information for the Python code in a specific format. +3. Examples of Python code and their inferred types. The examples are delimited with triple backticks. These examples are to be used as training data. + +Perform the following tasks: +1. Infer the types of various Python elements like function parameters, local variables, and function return types according to the given JSON format with the highest probability. +2. Provide your response in a valid JSON array of objects according to the training sample given. Do not provide any additional information except the JSON object. + + + +Python code: +``` +def id_func ( arg ): + x = arg + return x + +result = id_func (" String ") +result = id_func (1) +``` + +inferred types in JSON: +[ + { + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "type": [ + "int", + "str" + ] + }, + { + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "parameter": "arg", + "type": [ + "int", + "str" + ] + }, + { + "file": "simple_code.py", + "function": "id_func", + "line_number": 2, + "type":[ + "int", + "str" + ], + "variable": "x" + }, + { + "file": "simple_code.py", + "line_number": 5, + "type": [ + "str" + ], + "variable": "result" + }, + { + "file": "simple_code.py", + "line_number": 6, + "type":[ + "int" + ], + "variable": "result" + } +] + +Python code: +``` +def func(x): + return x + +a = func(2) +b = func(1.0) +b = 10 +c = 1.0 +``` + +The JSON object: +""" + +typeevalpy_prompt_2 = """ +You will be provided with the following information: +1. Python code. The sample is delimited with triple backticks. +2. Sample JSON containing type inference information for the Python code in a specific format. +3. Examples of Python code and their inferred types. The examples are delimited with triple backticks. These examples are to be used as training data. + +Perform the following tasks: +1. Infer the types of various Python elements like function parameters, local variables, and function return types according to the given JSON format with the highest probability. +2. Note that each element can be of more than one type according to the context of the program. +3. Provide your response in a JSON format according to the training sample given. Do not provide any additional information except the JSON. + +Python Code Sample: +``` +def id_func(arg): + x = arg + return x + +result = id_func("String") +result = id_func(1) +``` + +Inferred Types in JSON (Example): +``` +[ + { + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "type": [ + "int", + "str" + ] + }, + { + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "parameter": "arg", + "type": [ + "int", + "str" + ] + }, + { + "file": "simple_code.py", + "function": "id_func", + "line_number": 2, + "type":[ + "int", + "str" + ], + "variable": "x" + }, + { + "file": "simple_code.py", + "line_number": 5, + "type": [ + "str" + ], + "variable": "result" + }, + { + "file": "simple_code.py", + "line_number": 6, + "type":[ + "int" + ], + "variable": "result" + } +] +``` + +Next Python Code: + +``` +def func(x): + return x + +a = func(2) +b = func(1.0) +b = 10 +c = 1.0 +``` + +Your Task: Provide inferred types in JSON format for the above Python code, by adding the appropriate 'type' keys to the following JSON. 
+``` +[ + { + "file": "simple_code.py", + "function": "func", + "line_number": 1, + }, + { + "file": "simple_code.py", + "function": "func", + "line_number": 1, + "parameter": "x", + }, + { + "file": "simple_code.py", + "line_number": 4, + "variable": "a" + }, + { + "file": "simple_code.py", + "line_number": 5, + "variable": "b" + }, + { + "file": "simple_code.py", + "line_number": 6, + "variable": "b" + }, + { + "file": "simple_code.py", + "line_number": 7, + "variable": "c" + } +] +``` +""" + + +typeevalpy_prompt_2_template = """ +You will be provided with the following information: +1. Python code. The sample is delimited with triple backticks. +2. Sample JSON containing type inference information for the Python code in a specific format. +3. Examples of Python code and their inferred types. The examples are delimited with triple backticks. These examples are to be used as training data. + +Perform the following tasks: +1. Infer the types of various Python elements like function parameters, local variables, and function return types according to the given JSON format with the highest probability. +2. Provide your response in a valid JSON array of objects according to the training sample given. Do not provide any additional information except the JSON object. +3. {format_instructions} + + +Python code: +``` +def id_func ( arg ): + x = arg + return x + +result = id_func ("String") +result = id_func (1) +``` + +The JSON object: +``` +[ + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "type": [ + "int", + "str" + ] + }}, + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "parameter": "arg", + "type": [ + "int", + "str" + ] + }}, + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 2, + "type":[ + "int", + "str" + ], + "variable": "x" + }}, + {{ + "file": "simple_code.py", + "line_number": 5, + "type": [ + "str" + ], + "variable": "result" + }}, + {{ + "file": "simple_code.py", + "line_number": 6, + "type":[ + "int" + ], + "variable": "result" + }} +] +``` + +Python code: +``` +{code} +``` + +Your Task: Provide inferred types in JSON format for the above Python code, by adding the appropriate 'type' keys to the following JSON object: +``` +{json} +``` +""" + +json_based_1 = """ +## Task Description + +You are required to analyze Python code samples and infer the types of different elements (e.g., function parameters, local variables, function return types) based on provided JSON schema. Your responses should adhere strictly to the specified JSON format. + +## Information Provided + +1. Python Code: Enclosed within triple backticks (```). +2. JSON Schema: Outlines the format for type inference. +3. Training Data: Examples of Python code with their inferred types, also within triple backticks. + +## Tasks to Perform + +1. Type Inference: Based on the provided JSON format, infer the types of various elements in the Python code (function parameters, local variables, function return types) with high accuracy. +2. JSON Response: Provide your inferences in a JSON array of objects, strictly following the training sample format. Exclude any additional information outside of this JSON object. +3. Adherence to JSON Schema: Ensure the output is a valid JSON instance conforming to the provided JSON schema. 
+ +## Example: + +Python Code with filename 'simple_code.py': +``` +def id_func (arg): + x = arg + return x + +result = id_func ("String") +result = id_func (1) +``` + +Answer as JSON object: +``` +[ + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "type": [ + "int", + "str" + ] + }}, + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "parameter": "arg", + "type": [ + "int", + "str" + ] + }}, + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 2, + "type":[ + "int", + "str" + ], + "variable": "x" + }}, + {{ + "file": "simple_code.py", + "line_number": 5, + "type": [ + "str" + ], + "variable": "result" + }}, + {{ + "file": "simple_code.py", + "line_number": 6, + "type":[ + "int" + ], + "variable": "result" + }} +] +``` + +## Your Current Task + +Infer types for the following Python code and provide a concise JSON response based on the given schema. + +{format_instructions} + +Python Code with filename '{filename}': +``` +{code} +``` + +Answer as JSON object: +""" + +json_based_2 = """ +You will be provided with the following information: +1. Python code. The sample is delimited with triple backticks. +2. Sample JSON containing type inference information for the Python code in a specific format. +3. Examples of Python code and their inferred types. The examples are delimited with triple backticks. These examples are to be used as training data. + +Perform the following tasks: +1. Infer the types of various Python elements like function parameters, local variables, and function return types according to the given JSON format with the highest probability. +2. Provide your response in a valid JSON array of objects according to the training sample given. Do not provide any additional information except the JSON object. +3. {format_instructions} + + +Python Code with filename 'simple_code.py': +``` +def id_func ( arg ): + x = arg + return x + +result = id_func ("String") +result = id_func (1) +``` + +The JSON object: +``` +[ + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "type": [ + "int", + "str" + ] + }}, + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 1, + "parameter": "arg", + "type": [ + "int", + "str" + ] + }}, + {{ + "file": "simple_code.py", + "function": "id_func", + "line_number": 2, + "type":[ + "int", + "str" + ], + "variable": "x" + }}, + {{ + "file": "simple_code.py", + "line_number": 5, + "type": [ + "str" + ], + "variable": "result" + }}, + {{ + "file": "simple_code.py", + "line_number": 6, + "type":[ + "int" + ], + "variable": "result" + }} +] +``` + +Python Code with filename '{filename}': +``` +{code} +``` + +The JSON object: +""" + + +questions_based_1 = """ +## Task Description + +Analyze the provided Python code and determine the types of various elements. Answer the following questions based on your analysis. + +Python Code: +{code} + +Questions: +{questions} + +Your Answers: +{answers} +""" + +questions_based_2_system = ( + "You will examine and identify the data types of various elements such as function" + " parameters, local variables, and function return types in the given Python code." +) + +questions_based_2 = """ +## Task Description + +**Objective**: Examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code. + +**Instructions**: +1. For each question below, provide a concise, one-word answer indicating the data type. +2. 
For arguments and variables inside a function, list every data type they take within the current program context as a comma separated list. +3. Do not include additional explanations or commentary in your answers. +4. If a type's nested level exceeds 2, replace all components at that level and beyond with Any + +**Python Code Provided**: + +{code} + + +**Questions**: +{questions} + +**Format for Answers**: +- Provide your answer next to each question number, using only one word. +- Example: + 1. int + 2. float + 3. str + 4. List[List[Any]] + +**Your Answers**: +{answers}""" + +questions_based_2_ft = """[INST] <> You will examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code. <> + +## Task Description + +**Objective**: Examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code. + +**Instructions**: +1. For each question below, provide a concise, one-word answer indicating the data type. +2. For arguments and variables inside a function, list every data type they take within the current program context as a comma separated list. +3. Do not include additional explanations or commentary in your answers. + +**Python Code Provided**: +```python +{code} +``` + +**Questions**: +{questions} + +**Format for Answers**: +- Provide your answer next to each question number, using only one word. +- Example: + 1. int + 2. float + 3. str + +**Your Answers**: +{answers} +[/INST] """ + + +questions_based_3 = """ +## Task Description + +**Objective**: Examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code. + +**Instructions**: +1. For each question below, provide a concise, one-word answer indicating the data type. +2. For arguments and variables inside a function, list every data type they take according to the inputs in the current program context as a comma separated list. +3. Do not include additional explanations or commentary in your answers. +4. Example of Python code, questions, and answers are given below. These examples are to be used as training data. + +**Format for Answers**: +- Provide your answer next to each question number, using only one word. +- Example: + 1. int + 2. float + 3. str + +**Example Python Code**: +```python +a = 1 +b = 1.0 +c = "hello" +``` + +**Example Questions**: +1. What is the type of the variable 'a' at line 1, column 1? Reply in one word. +2. What is the type of the variable 'b' at line 2, column 1? Reply in one word. +3. What is the type of the variable 'c' at line 3, column 1? Reply in one word. + +**Example Answers**: +1. int +2. float +3. str + +**Python Code**: +```python +{code} +``` + +**Questions**: +{questions} + +**Answers**: +{answers} +""" + +questions_based_4_system = ( + "You will examine and identify the data types of various elements such as function" + " parameters, local variables, and function return types in the given Python code." +) + +questions_based_4 = """[INST] +Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + +### Instruction: +**Objective**: Examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code. + +1. 
For each question below, provide a concise, one-word answer indicating the data type. +2. For arguments and variables inside a function, list every data type they take within the current program context as a comma separated list. +3. Do not include additional explanations or commentary in your answers. + +### Input: +**Python Code Provided**: +```python +{code} +``` + +**Questions**: +{questions} + +[/INST] +""" + +# Prompt templates vllms + +prompt_template_questions_based_2 = [ + { + "role": "system", + "content": "You will examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code.", + }, + { + "role": "user", + "content": questions_based_2, + }, +] + +prompt_template_questions_based_2_no_sys = [ + { + "role": "user", + "content": "You will examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code." + + "\n\n" + + questions_based_2, + }, +] + +prompt_template_masked_code_based_1 = [ + { + "role": "system", + "content": "You will examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code.", + }, + { + "role": "user", + "content": "## Task Description\n\n", + }, +] + +prompt_template_masked_code_based_1_no_sys = [ + { + "role": "user", + "content": "## Task Description\n\n", + }, +] \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/result_translator.py b/src/target_tools/real-world-llms/src/result_translator.py new file mode 100644 index 000000000..cf5b0474c --- /dev/null +++ b/src/target_tools/real-world-llms/src/result_translator.py @@ -0,0 +1,229 @@ +import ast +import json +import os +from typing import List, Dict, Any + +def parse_annotation(annotation_node: ast.AST) -> str: + """ + Convert an AST annotation node into its original source code string. + + Args: + annotation_node (ast.AST): The AST node representing a type annotation. + + Returns: + str: The string representation of the type annotation, or "None" if missing. + """ + if annotation_node is None: + return "None" + return ast.unparse(annotation_node) # Converts AST back to source code. + + +def get_type_annotations_from_content(source: str, filename: str) -> List[Dict[str, Any]]: + """ + Extract all type annotations from a Python source code string. + + Args: + source (str): The source code content as a string. + filename (str): The filename of the source file (used for metadata). + + Returns: + List[Dict[str, Any]]: A list of annotation metadata including file, line number, column, + function or variable name, and type string. + """ + try: + tree = ast.parse(source) + except SyntaxError as e: + print(f"File-level SyntaxError in {filename}: {e}") + return [] # Skip file if it contains a top-level syntax error + + annotations = [] + source_lines = source.splitlines() + + class TypeAnnotationVisitor(ast.NodeVisitor): + """ + AST visitor that extracts function parameter types, return types, + and annotated variable types. 
+ """ + def __init__(self, filename): + self.filename = os.path.basename(filename).replace("_gt.json", ".py") + self.current_class = None + self.current_function = None + self.processed_variables = set() # Track already extracted variables to prevent duplicates + + def visit_ClassDef(self, node: ast.ClassDef): + self.current_class = node.name + self.generic_visit(node) + self.current_class = None + + def visit_FunctionDef(self, node: ast.FunctionDef): + self.current_function = node.name + function_name = f"{self.current_class}.{self.current_function}" if self.current_class else self.current_function + + try: + line = source_lines[node.lineno - 1] + name_col_offset = line.index(node.name) + 1 + + # Parameters: normal + keyword-only + for arg in node.args.args + node.args.kwonlyargs: + if arg.annotation: + param_id = (node.lineno, arg.arg) + if param_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": arg.lineno, + "col_offset": arg.col_offset + 1, + "parameter": arg.arg, + "function": function_name, + "type": [parse_annotation(arg.annotation)] + }) + self.processed_variables.add(param_id) + + # *args + if node.args.vararg and node.args.vararg.annotation: + vararg_id = (node.args.vararg.lineno, node.args.vararg.arg) + if vararg_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": node.args.vararg.lineno, + "col_offset": node.args.vararg.col_offset + 1, + "parameter": f"*{node.args.vararg.arg}", + "function": function_name, + "type": [parse_annotation(node.args.vararg.annotation)] + }) + self.processed_variables.add(vararg_id) + + # **kwargs + if node.args.kwarg and node.args.kwarg.annotation: + kwarg_id = (node.args.kwarg.lineno, node.args.kwarg.arg) + if kwarg_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": node.args.kwarg.lineno, + "col_offset": node.args.kwarg.col_offset + 1, + "parameter": f"**{node.args.kwarg.arg}", + "function": function_name, + "type": [parse_annotation(node.args.kwarg.annotation)] + }) + self.processed_variables.add(kwarg_id) + + # Return type + if node.returns: + func_id = (node.lineno, function_name) + if func_id not in self.processed_variables: + annotations.append({ + "file": self.filename, + "line_number": node.lineno, + "col_offset": name_col_offset, + "function": function_name, + "type": [parse_annotation(node.returns)] + }) + self.processed_variables.add(func_id) + + self.generic_visit(node) + + except Exception as e: + print(f"Error in function '{function_name}' in {filename}: {e}") + + self.current_function = None + + def visit_AnnAssign(self, node: ast.AnnAssign): + """ + Visit annotated assignments (e.g., x: int = 5) and extract variable type annotations. 
+ """ + try: + if isinstance(node.target, ast.Name): + variable_name = node.target.id + function_name = self.current_function if self.current_function else None + var_id = (node.lineno, variable_name) + + if var_id not in self.processed_variables: + # If it's a class variable (not in a function), prepend class name + if self.current_class and not function_name: + variable_name = f"{self.current_class}.{variable_name}" + + annotation_entry = { + "file": self.filename, + "line_number": node.lineno, + "col_offset": node.col_offset + 1, + "variable": variable_name, + "type": [parse_annotation(node.annotation)] + } + + if function_name: + annotation_entry["function"] = function_name + + annotations.append(annotation_entry) + self.processed_variables.add(var_id) + + except Exception as e: + print(f"Error processing annotated assignment in {self.filename}: {e}") + + visitor = TypeAnnotationVisitor(filename) + visitor.visit(tree) + return annotations + + +def format_annotations_for_ground_truth(annotations: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Format raw annotation records into a simplified structure for ground truth comparisons. + + Args: + annotations (List[Dict[str, Any]]): Raw extracted annotation dictionaries. + + Returns: + List[Dict[str, Any]]: Formatted annotation dictionaries with only necessary keys. + """ + formatted_annotations = [] + for annotation in annotations: + formatted_annotation = { + "file": annotation["file"], + "line_number": annotation["line_number"], + "col_offset": annotation["col_offset"], + "type": annotation["type"], + } + + # Add relevant identifier fields + if "function" in annotation: + formatted_annotation["function"] = annotation["function"] + if "variable" in annotation: + formatted_annotation["variable"] = annotation["variable"] + if "parameter" in annotation: + formatted_annotation["parameter"] = annotation["parameter"] + + formatted_annotations.append(formatted_annotation) + + return formatted_annotations + + +def translate_output_to_annotations(source: str, filename: str) -> str: + """ + Convert source code into a JSON-formatted list of type annotations. + + Args: + source (str): Python source code as a string. + filename (str): Path to the file (used for metadata). + + Returns: + str: JSON string representing extracted and formatted type annotations. + """ + annotations = get_type_annotations_from_content(source, filename) + formatted_annotations = format_annotations_for_ground_truth(annotations) + return json.dumps(formatted_annotations, indent=4) + + +def main(): + """ + Entry point for testing the annotation extraction tool. + Reads a Python file and prints extracted annotations in JSON format. 
+ """ + test_file = "/home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/.scrapy/main.py" # Replace with your test file path + + with open(test_file, "r") as f: + source_code = f.read() + + output_json = translate_output_to_annotations(source_code, test_file) + print(output_json) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/runner.py b/src/target_tools/real-world-llms/src/runner.py new file mode 100644 index 000000000..c5d5a677a --- /dev/null +++ b/src/target_tools/real-world-llms/src/runner.py @@ -0,0 +1,714 @@ +import argparse +import json +import logging +import multiprocessing +import os +import re +import shutil +import sys +import time +import traceback +from pathlib import Path +from sys import stdout +from typing import List, Optional +import psutil # Add this import + +import prompts +import translator +import utils +import vllm_helpers +import transformers_helpers +import openai_helpers + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest +import gc +import torch +from tqdm import tqdm +import result_translator # Import the translation module +from datetime import datetime # Import datetime for timestamp generation + + +AUTOFIX_WITH_OPENAI = False +REQUEST_TIMEOUT = 60 +USE_MULTIPROCESSING_FOR_TERMINATION = True +MAX_TOKENS = 64 +TEMPARATURE = 0.001 +MAX_NEW_TOKENS = 1024 + +PROMPTS_MAP = { + "json_based_1": prompts.json_based_1, + "json_based_2": prompts.json_based_2, + "questions_based_1": prompts.questions_based_1, + "questions_based_2": prompts.questions_based_2, + "questions_based_3": prompts.questions_based_3, + "questions_based_4": prompts.questions_based_4, + "questions_based_2_ft": prompts.questions_based_2_ft, +} + +# Set max_split_size_mb to avoid memory fragmentation +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" + +script_dir = Path(__file__).parent +# Create a logger +logger = logging.getLogger("runner") +logger.setLevel(logging.INFO) + +if utils.is_running_in_docker(): + file_handler = logging.FileHandler("/tmp/llm_log.log", mode="w") +else: + file_handler = logging.FileHandler("llm_log.log", mode="w") + +file_handler.setLevel(logging.INFO) + +console_handler = logging.StreamHandler(stdout) +console_handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +file_handler.setFormatter(formatter) +console_handler.setFormatter(formatter) +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +# Set the PYTORCH_CUDA_ALLOC_CONF environment variable +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" + + +def log_memory_usage(): + """Logs the current memory usage of the system and CUDA.""" + process = psutil.Process(os.getpid()) + mem_info = process.memory_info() + logger.debug(f"System memory usage: {mem_info.rss / (1024 ** 3):.2f} GB") + + if torch.cuda.is_available(): + for i in range(torch.cuda.device_count()): + logger.debug(f"CUDA memory usage for device {i}:") + logger.debug( + f" Allocated: {torch.cuda.memory_allocated(i) / (1024 ** 3):.2f} GB" + ) + logger.debug( + f" Cached: {torch.cuda.memory_reserved(i) / (1024 ** 3):.2f} GB" + ) + + +def get_available_gpu_memory(): + """Returns the available GPU memory in GB.""" + if torch.cuda.is_available(): + available_memory = torch.cuda.get_device_properties( + 0 + ).total_memory - torch.cuda.memory_allocated(0) + return available_memory / (1024**3) + return 0 + + +def get_prompt_mapping( + 
result_dir, prompt_template, use_system_prompt=False, token_limit=4096 +): + """ + Traverse the directory structure, pair .json and _gt.json files, + and generate a combined id_mapping using get_prompt_mapping logic. + """ + base_path = Path(result_dir) + id_mapping = {} + idx = 0 + + # Walk through train, test, valid folders + for subfolder in ["test"]: + folder_path = base_path / subfolder + if not folder_path.exists(): + continue + + # Get .json and _gt.json files + code_json_files = sorted(folder_path.glob("*.json")) + gt_json_files = sorted(folder_path.glob("*_gt.json")) + + # Ensure files are paired correctly + json_pairs = {} + for code_json in code_json_files: + # Find the corresponding _gt.json + gt_json = folder_path / f"{code_json.stem}_gt.json" + if gt_json.exists(): + json_pairs[code_json] = gt_json + + # Process each pair + for code_json, gt_json in json_pairs.items(): + result_json_path = code_json.with_name(code_json.stem + "_result.json") + existing_results = {} + + # Check if the result JSON file exists and load existing results + if result_json_path.exists(): + with open(result_json_path, "r") as result_file: + existing_results = json.load(result_file) + + # Create a set of existing file paths for quick lookup + existing_files = {entry["file"] for entry in existing_results} + + with open(code_json, "r") as code_file: + code_data = json.load(code_file) + + with open(gt_json, "r") as gt_file: + gt_data = json.load(gt_file) + + # Create gt_mapping + gt_mapping = {} + for entry in gt_data: + file_path = entry["file"] + if file_path not in gt_mapping: + gt_mapping[file_path] = [] + gt_mapping[file_path].append(entry) + + # Process the code and generate prompts + for project_name, project_info in code_data.items(): + src_files = project_info.get("src_files", {}) + + for file_path, file_info in src_files.items(): + source_code = file_info.get("source_code", "") + + if source_code == "": + continue + + # Skip if the file is already present in existing results + if file_path in existing_files: + continue + + # Fetch the typing information for the file from gt_mapping + type_info_list = gt_mapping.get(file_path, []) + if len(type_info_list) == 0: + continue + + # Generate the prompt + prompt = utils.get_prompt( + prompt_template, + source_code, + type_info_list, + use_system_prompt=use_system_prompt, + file_path=file_path, + token_limit=token_limit, + ) + + if prompt is None: + continue + + # Store the result in id_mapping + id_mapping[idx] = { + "project_name": project_name, + "file_path": file_path, + "json_filepath": str(gt_json), + "result_filepath": str(result_json_path), + "result_dump_filepath": str( + code_json.with_name(code_json.stem + "_result_dump.txt") + ), + "prompt": prompt, + } + idx += 1 + + return id_mapping + + +def create_result_json_file_from_answers(file_info, output_raw, prompt_template): + # Clean the raw output + output = re.sub(r"```json", "", output_raw) + output = re.sub(r"```", "", output) + output = re.sub(r"<\|assistant\|>\\n", "", output) + + # Append raw output to the dump file for debugging + with open(file_info["result_dump_filepath"], "a") as f: + f.write(output_raw + "\n") + + # Generate and translate content based on the prompt template + if prompt_template in [ + "prompt_template_questions_based_2", + ]: + answers_json = utils.generate_json_from_answers( + file_info["file_path"], file_info["json_filepath"], output + ) + translated_json = translator.translate_content(answers_json) + else: + translated_json = 
translator.translate_content(output) + + # Load existing results if the file exists + existing_results = [] + if os.path.exists(file_info["result_filepath"]): + with open(file_info["result_filepath"], "r") as f: + try: + existing_results = json.load(f) + except json.JSONDecodeError: + logger.warning( + f"Invalid JSON in {file_info['result_filepath']}. Starting fresh." + ) + + # Append new results to the existing ones + if isinstance(translated_json, list): + existing_results.extend(translated_json) + else: + existing_results.append(translated_json) + + # Save the combined results back to the result file + is_valid_json = utils.generate_json_file( + file_info["result_filepath"], existing_results + ) + + if not is_valid_json: + logger.error(f"{file_info['file_path']} failed: Not a valid JSON") + raise utils.JsonException("json") + + logger.debug(f"Accessed {file_info['file_path']} successfully.") + logger.debug(f"Results appended to {file_info['result_filepath']} successfully.") + + +def create_result_json_file_from_code(file_info, output_raw, prompt_template): + # Ensure output_raw is a string to prevent TypeError + if not isinstance(output_raw, str): + output_raw = str(output_raw) if output_raw is not None else "" + + # Clean up the output by removing unnecessary formatting + code = re.search(r"```(?:.*?)\n(.*?)```", output_raw, re.DOTALL) + + if code: + output_cleaned = code.group(1).strip() + else: + output_cleaned = re.sub(r"```json|```|<\|assistant\|>\\n", "", output_raw) + + # Save the raw output to the result dump filepath + with open(file_info["result_dump_filepath"], "w") as f: + f.write(output_raw) + + # Determine the filename, falling back if "filename" key is missing + filename = file_info["json_filepath"] + if filename is None: + # Generate a fallback filename with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + fallback_dir = "outputs" # You may specify any preferred directory + os.makedirs(fallback_dir, exist_ok=True) + filename = os.path.join(fallback_dir, f"output_{timestamp}.json") + logger.warning( + f"'filename' key missing in file_info; saving output to {filename}" + ) + + # Directly translate the cleaned source code output to JSON annotations + translated_json = result_translator.translate_output_to_annotations( + output_cleaned, filename + ) + + # Validate and save the translated JSON to the final result file + result_filepath = file_info.get("result_filepath", filename) + if utils.generate_json_file(result_filepath, translated_json): + logger.debug( + f"Processed file: {file_info.get('file_path', filename)} successfully." 
+ ) + else: + logger.error(f"{file_info.get('file_path', filename)} failed: Not a valid JSON") + raise utils.JsonException("json") + + +def list_json_files(folder_path): + json_files = sorted(Path(folder_path).rglob("*.json")) + return json_files + + +def model_evaluation_vllm( + model_name, + prompt_template, + python_files, + engine, + results_dst, + use_system_prompt=False, + lora_request=None, + sampling_params=None, +): + + id_mapping = get_prompt_mapping(prompt_template, python_files, use_system_prompt) + + prompts = [x["prompt"] for x in id_mapping.values()] + + processed_prompts = engine.tokenizer.tokenizer.apply_chat_template( + prompts, tokenize=False, add_generation_template=True + ) + + request_outputs = vllm_helpers.process_requests( + engine, processed_prompts, sampling_params, lora_request + ) + + for r_output in request_outputs: + file_info = id_mapping[int(r_output.request_id)] + + output_raw = r_output.outputs[0].text + create_result_json_file(file_info, output_raw, prompt_template) + + +def model_evaluation_transformers( + model_name, + prompt_template, + json_files, + pipe, + results_dst, + use_system_prompt=False, + initial_batch_size=32, + token_limit=3000, + max_memory=65, +): + id_mapping = get_prompt_mapping( + results_dst, prompt_template, use_system_prompt, token_limit + ) + + prompt_id_mapping_pairs = [(x["prompt"], x) for x in id_mapping.values()] + sorted_prompt_id_mapping_pairs = sorted( + prompt_id_mapping_pairs, key=lambda x: utils.get_token_count(x[0]) + ) + sorted_prompts = [pair[0] for pair in sorted_prompt_id_mapping_pairs] + sorted_id_mapping = [pair[1] for pair in sorted_prompt_id_mapping_pairs] + + progress_bar = tqdm(total=len(sorted_prompts), desc="Processing Prompts") + + i = 0 + batch_size = initial_batch_size + + while i < len(sorted_prompts): + current_batch = [] + current_token_count = 0 + + while ( + i < len(sorted_prompts) + and current_token_count + utils.get_token_count(sorted_prompts[i]) + <= token_limit * batch_size + ): + prompt_token_count = utils.get_token_count(sorted_prompts[i]) + logger.debug(f"Prompt {sorted_id_mapping[i]['file_path']}: {prompt_token_count} tokens") + current_batch.append(sorted_prompts[i]) + current_token_count += prompt_token_count + i += 1 + + log_memory_usage() + + try: + request_outputs = transformers_helpers.process_requests( + pipe, + current_batch, + max_new_tokens=MAX_NEW_TOKENS, + batch_size=len(current_batch), + ) + + for id, r_output in enumerate(request_outputs): + file_info = sorted_id_mapping[id + i - len(current_batch)] + output_raw = r_output[0]["generated_text"][-1]["content"] + create_result_json_file_from_answers( + file_info, output_raw, prompt_template + ) + + progress_bar.update(len(current_batch)) + del request_outputs, current_batch + gc.collect() + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + logger.error(f"CUDA out of memory error while processing batch starting at index {i - len(current_batch)}") + logger.error(f"Error details: {e}") + # logger.error(f"Batch details: {[sorted_id_mapping[i - len(current_batch) + j]['file_path'] for j in range(len(current_batch))]}") + log_memory_usage() + torch.cuda.empty_cache() + + batch_size = max(1, batch_size - 3) # Reduce batch size but ensure it's at least 1 + i -= len(current_batch) # Roll back the index to retry from unprocessed prompts + print(f"Reducing batch size to {batch_size}") + continue # Retry with a smaller batch size + + log_memory_usage() + progress_bar.close() + + +def model_evaluation_openai( + 
model_name,
+    prompt_template,
+    openai_key,
+    python_files,
+    results_dst,
+    use_system_prompt=False,
+):
+
+    id_mapping = get_prompt_mapping(results_dst, prompt_template, use_system_prompt)
+
+    prompts = [x["prompt"] for x in id_mapping.values()]
+
+    utils.get_prompt_cost(prompts)
+    utils.dump_ft_jsonl(id_mapping, f"{results_dst}/ft_dataset_{model_name}.jsonl")
+    utils.dump_batch_prompt_jsonl(
+        id_mapping,
+        f"{results_dst}/batch_prompt_{model_name}.jsonl",
+        model=model_name,
+    )
+
+    request_outputs = openai_helpers.process_requests(
+        model_name,
+        prompts,
+        openai_key,
+        max_new_tokens=MAX_NEW_TOKENS,
+        max_workers=1,
+    )
+
+    for id, r_output in enumerate(request_outputs):
+        file_info = id_mapping[id]
+
+        # Store raw output from LLM
+        output_raw = r_output
+        create_result_json_file_from_answers(file_info, output_raw, prompt_template)
+
+
+def main_runner(args, runner_config, models_to_run, openai_models_models_to_run):
+    runner_start_time = time.time()
+    for model in models_to_run:
+        error_count = 0
+        timeout_count = 0
+        json_count = 0
+        files_analyzed = 0
+        model_start_time = 0  # Initialize model_start_time here
+
+        # Create result folder for model specific results
+        bechmark_path = Path(args.bechmark_path)
+        results_src = bechmark_path
+        if args.results_dir is None:
+            results_dst = bechmark_path.parent / model["name"] / bechmark_path.name
+        else:
+            results_dst = Path(args.results_dir) / model["name"] / bechmark_path.name
+        os.makedirs(results_dst, exist_ok=True)
+
+        utils.copy_folder(results_src, results_dst)
+
+        json_files = list_json_files(results_dst)
+
+        if model["use_vllms_for_evaluation"]:
+            engine = vllm_helpers.initialize_engine(
+                model["model_path"],
+                model["quantization"],
+                model["lora_repo"],
+                model["max_model_len"],
+            )
+            lora_request = None
+            if model["lora_repo"] is not None:
+                lora_request = LoRARequest(
+                    f"{model['name']}-lora", 1, model["lora_repo"]
+                )
+
+            sampling_params = SamplingParams(
+                temperature=TEMPARATURE, top_p=0.95, max_tokens=MAX_TOKENS
+            )
+            model_start_time = time.time()
+            model_evaluation_vllm(
+                model["name"],
+                args.prompt_id,
+                json_files,
+                engine,
+                results_dst,
+                use_system_prompt=model["use_system_prompt"],
+                lora_request=lora_request,
+                sampling_params=sampling_params,
+            )
+
+            del engine
+            gc.collect()
+            torch.cuda.empty_cache()
+        else:
+            if model["lora_repo"] is None:
+                model_path = model["model_path"]
+                lora_repo = None
+            else:
+                model_path = model["model_path"]
+                lora_repo = model["lora_repo"]
+
+            pipe = None
+            try:
+                pipe = transformers_helpers.load_model_and_configurations(
+                    args.hf_token, model_path, model["quantization"], TEMPARATURE, lora_repo
+                )
+                model_start_time = time.time()
+                model_evaluation_transformers(
+                    model["name"],
+                    args.prompt_id,
+                    json_files,
+                    pipe,
+                    results_dst,
+                    use_system_prompt=model["use_system_prompt"],
+                    initial_batch_size=model["batch_size"],
+                    token_limit=model["max_model_len"],
+                )
+
+            except Exception as e:
+                logger.error(f"Error in model {model['name']}: {e}")
+                error_count += 1
+                traceback.print_exc()
+            finally:
+                if pipe is not None:
+                    del pipe
+                gc.collect()
+                torch.cuda.empty_cache()
+
+        logger.info(
+            f"Model {model['name']} finished in {time.time()-model_start_time:.2f} seconds"
+        )
+        logger.info("Running translator")
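+        # Normalize the inferred type strings in every result JSON produced for this model +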
translator.main_translator(results_dst) + + # running gpt models + for model in openai_models_models_to_run: + error_count = 0 + timeout_count = 0 + json_count = 0 + files_analyzed = 0 + + # Create result folder for model specific results + bechmark_path = Path(args.bechmark_path) + results_src = bechmark_path + if args.results_dir is None: + results_dst = bechmark_path.parent / model["name"] / bechmark_path.name + else: + results_dst = Path(args.results_dir) / model["name"] / bechmark_path.name + os.makedirs(results_dst, exist_ok=True) + + utils.copy_folder(results_src, results_dst) + + python_files = list_json_files(results_dst) + + python_files = python_files[:2] + + model_start_time = time.time() + model_evaluation_openai( + model["name"], + args.prompt_id, + args.openai_key, + python_files, + results_dst, + use_system_prompt=model["use_system_prompt"], + ) + + logger.info( + f"Model {model['name']} finished in {time.time()-model_start_time:.2f} seconds" + ) + logger.info("Running translator") + translator.main_translator(results_dst) + + logger.info( + f"Runner finished in {time.time()-runner_start_time:.2f} seconds, with errors:" + f" {error_count} | JSON errors: {json_count}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--bechmark_path", + help="Specify the benchmark path", + default="/tmp/micro-benchmark", + ) + + parser.add_argument( + "--results_dir", + help="Specify the benchmark path", + default=None, + ) + + parser.add_argument("--hf_token", help="Specify the hf token", required=True) + + parser.add_argument( + "--openai_key", help="Specify the OpenAI Auth Key", required=False + ) + + parser.add_argument("--prompt_id", help="Specify the prompt ID", required=True) + + parser.add_argument( + "--models_config", + type=str, + default=f"{script_dir}/models_config.yaml", + ) + + parser.add_argument( + "--models", + nargs="+", + type=str, + help="Space-separated list of models", + ) + + parser.add_argument( + "--custom_models", + nargs="+", + type=str, + help="Space-separated list of custom models", + ) + + parser.add_argument( + "--openai_models", + nargs="+", + type=str, + help="Space-separated list of openai models", + ) + + parser.add_argument( + "--enable_streaming", + help="If LLM response should be streamed", + type=bool, + default=False, + ) + + args = parser.parse_args() + + # Set HF token + os.environ["HF_TOKEN"] = args.hf_token + + models_config = utils.load_models_config(parser.parse_args().models_config) + runner_config = utils.load_runner_config(parser.parse_args().models_config) + + models_to_run = [] + openai_models_models_to_run = [] + # check if args.models are in models_config + + if args.models: + for model in args.models: + if model not in models_config["models"]: + logger.error(f"Model {model} not found in models_config") + sys.exit(-1) + else: + models_to_run.append(models_config["models"][model]) + + # check if args.custom_models are in models_config + if args.custom_models: + for model in args.custom_models: + if model not in models_config["custom_models"]: + logger.error(f"Model {model} not found in models_config") + sys.exit(-1) + else: + models_to_run.append(models_config["custom_models"][model]) + + if args.openai_models: + for model in args.openai_models: + if model not in models_config["openai_models"]: + logger.error(f"Model {model} not found in models_config") + sys.exit(-1) + else: + openai_models_models_to_run.append( + models_config["openai_models"][model] + ) + + torch.cuda.empty_cache() + 
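    # Run the evaluation for all selected Hugging Face, custom, and OpenAI models +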
main_runner(args, runner_config, models_to_run, openai_models_models_to_run) + +# example usage: +""" +python3.10 runner.py \ +--bechmark_path /mnt/hf_cache/rashida_manytype4py/many-types-4-py-dataset/rw-benchmark \ +--prompt_id prompt_template_questions_based_2 \ +--models codestral-v0.1-22b qwen2.5-Coder-7B-Instruct \ +--hf_token \ +--openai_key token \ +--enable_streaming True \ +--models_config /home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/models_config.yaml \ +--results_dir /home/ssegpu/rashida/TypeEvalPy/results + +python runner.py \ +--bechmark_path /home/ssegpu/TypeEvalPy/TypeEvalPy/autogen_typeevalpy_benchmark \ +--prompt_id prompt_template_questions_based_2 \ +--models llama3.1-it:8b \ +--hf_token \ +--openai_key token \ +--enable_streaming True \ +--models_config /home/ssegpu/TypeEvalPy/TypeEvalPy/src/target_tools/llms/src/models_config.yaml \ +--results_dir /home/ssegpu/TypeEvalPy/TypeEvalPy/.scrapy/results_full_1 +""" diff --git a/src/target_tools/real-world-llms/src/syntax_checker.py b/src/target_tools/real-world-llms/src/syntax_checker.py new file mode 100644 index 000000000..318b47439 --- /dev/null +++ b/src/target_tools/real-world-llms/src/syntax_checker.py @@ -0,0 +1,41 @@ +import os +import ast + +def check_syntax(file_path): + """ + Check the syntax of a Python file. + Returns None if the file is syntactically correct, otherwise returns the error message. + """ + try: + with open(file_path, "r", encoding="utf-8") as file: + source_code = file.read() + ast.parse(source_code, filename=file_path) + return None # No syntax errors + except SyntaxError as e: + return f"SyntaxError in {file_path}: {e}" + +def find_files_with_syntax_errors(root_directory): + """ + Recursively walk through all .py files in a directory and check for syntax errors. + Prints out the files with syntax errors. 
+ """ + files_with_errors = [] + + for subdir, _, files in os.walk(root_directory): + for file_name in files: + if file_name.endswith('.py'): + file_path = os.path.join(subdir, file_name) + error_message = check_syntax(file_path) + if error_message: + files_with_errors.append(error_message) + + if files_with_errors: + print("Files with syntax errors:") + for error in files_with_errors: + print(error) + else: + print("No syntax errors found in any files.") + +if __name__ == "__main__": + root_directory = "/home/ssegpu/rashida/TypeEvalPy/src/target_tools/real-world-llms/src/runner.py" + find_files_with_syntax_errors(root_directory) \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/test_annotator.py b/src/target_tools/real-world-llms/src/test_annotator.py new file mode 100644 index 000000000..64983eda7 --- /dev/null +++ b/src/target_tools/real-world-llms/src/test_annotator.py @@ -0,0 +1,99 @@ +import libcst as cst +import os + +# Function to check if all annotations in a file are MASK +def check_annotations_with_mask(file_path): + try: + with open(file_path, "r") as modified_file: + modified_code = modified_file.read() + except Exception as e: + print(f"Could not read file {file_path}: {e}") + return False + + try: + # Parse the code to check annotations + modified_tree = cst.parse_module(modified_code) + except cst.ParserSyntaxError as e: + print(f"Skipping {file_path}: Syntax error detected at {e}.") + return False # Indicate failure to parse, treated as a failed check + except Exception as e: + print(f"Unexpected error while parsing {file_path}: {e}") + return False + + # Define a visitor class to collect all annotations and assignments + class AnnotationChecker(cst.CSTVisitor): + def __init__(self): + self.all_annotations_masked = True # Default assumption: all are masked + + def visit_Param(self, node: cst.Param): + # Check if the parameter annotation is MASK + if not node.annotation or not isinstance(node.annotation.annotation, cst.Name) or node.annotation.annotation.value != "MASK": + self.all_annotations_masked = False + print(f"Annotation check failed in {file_path}: Parameter '{node.name.value}' is missing MASK annotation.") + + def visit_AnnAssign(self, node: cst.AnnAssign): + # Check if the variable annotation is MASK (for variables with explicit annotations) + if not isinstance(node.annotation.annotation, cst.Name) or node.annotation.annotation.value != "MASK": + self.all_annotations_masked = False + if isinstance(node.target, cst.Name): + print(f"Annotation check failed in {file_path}: Variable '{node.target.value}' is missing MASK annotation.") + + def visit_FunctionDef(self, node: cst.FunctionDef): + # Check if the return type annotation is MASK, even if not explicitly present + if node.returns: + if not isinstance(node.returns.annotation, cst.Name) or node.returns.annotation.value != "MASK": + self.all_annotations_masked = False + print(f"Annotation check failed in {file_path}: Return type of function '{node.name.value}' not masked with MASK.") + else: + # Log an error if the function does not have a return type at all + self.all_annotations_masked = False + print(f"Annotation check failed in {file_path}: Function '{node.name.value}' is missing return type annotation with MASK.") + + # Check each parameter within the function + for param in node.params.params: + if not param.annotation or not isinstance(param.annotation.annotation, cst.Name) or param.annotation.annotation.value != "MASK": + self.all_annotations_masked = False + print(f"Annotation 
check failed in {file_path}: Parameter '{param.name.value}' in function '{node.name.value}' is missing MASK annotation.") + + def visit_Assign(self, node: cst.Assign): + # Check unannotated variables, logging if an annotation is missing + for target in node.targets: + if isinstance(target.target, cst.Name): + # Only log if no annotation is present for standalone assignments + self.all_annotations_masked = False + print(f"Annotation check failed in {file_path}: Variable '{target.target.value}' is missing MASK annotation.") + + # Use the visitor to verify annotations + checker = AnnotationChecker() + modified_tree.visit(checker) + + # Return whether all annotations were masked + if checker.all_annotations_masked: + print(f"File {file_path} passed the annotation check.") + return checker.all_annotations_masked + +# Main function to test all .py files in a directory for MASK annotations +def main(): + root_directory = '/home/ssegpu/rashida/TypeEvalPy/micro-benchmark/python_features' + all_files_passed = True + + for subdir, _, files in os.walk(root_directory): + for file_name in files: + if file_name.endswith('.py'): + file_path = os.path.join(subdir, file_name) + + # Check if all annotations were masked with MASK + if not check_annotations_with_mask(file_path): + print(f"Test failed or skipped: Not all annotations were replaced with MASK in {file_path}.") + all_files_passed = False + else: + print(f"Test passed: All annotations were correctly replaced with MASK in {file_path}.") + + # Final result + if all_files_passed: + print("All files passed: All annotations were correctly replaced with MASK.") + else: + print("Some files failed or were skipped due to errors: Not all annotations were replaced with MASK.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/transformers_helpers.py b/src/target_tools/real-world-llms/src/transformers_helpers.py new file mode 100644 index 000000000..8734caf72 --- /dev/null +++ b/src/target_tools/real-world-llms/src/transformers_helpers.py @@ -0,0 +1,118 @@ +import transformers +import torch +from tqdm import tqdm +from torch.utils.data import Dataset +from peft import PeftModel + +class ListDataset(Dataset): + + def __init__(self, original_list): + self.original_list = original_list + + def __len__(self): + return len(self.original_list) + + def __getitem__(self, i): + return self.original_list[i] + + +DEFAULT_CHAT_TEMPLATE = ( + "{% if messages[0]['role'] == 'system' %}" + "{% set loop_messages = messages[1:] %}" # Extract system message if it's present + "{% set system_message = messages[0]['content'] %}" + "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" + "{% set loop_messages = messages %}" # Or use the default system message if the flag is set + "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" + "{% else %}" + "{% set loop_messages = messages %}" + "{% set system_message = false %}" + "{% endif %}" + "{% for message in loop_messages %}" # Loop over all non-system messages + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message + "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" + "{% else %}" + "{% set content = message['content'] %}" + "{% endif %}" + "{% if message['role'] == 'user' %}" # After all of 
that, handle messages/roles in a fairly normal way + "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" + "{% elif message['role'] == 'system' %}" + "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" + "{% elif message['role'] == 'assistant' %}" + "{{ ' ' + content.strip() + ' ' + eos_token }}" + "{% endif %}" + "{% endfor %}" +) + + +# Load model, tokenizer and create pipeline +def load_model_and_configurations( + HF_TOKEN, model_name, use_quantized_model=True, temperature=0.001, lora_repo=None +): + temperature = temperature + + bnb_config = transformers.BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + ) + + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name, + device_map="auto", + quantization_config=bnb_config if use_quantized_model else None, + token=HF_TOKEN, + trust_remote_code=True, + torch_dtype="bfloat16" if model_name.startswith("google") else "auto", + attn_implementation=( + "flash_attention_2" if model_name.startswith("microsoft") else None + ), + ) + + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_name, token=HF_TOKEN, trust_remote_code=True, padding_side="left", truncation=True + ) + + # padding should be padding_side='left' for llama models + + if lora_repo: + model = PeftModel.from_pretrained(model, lora_repo) + + + if not tokenizer.chat_template: + tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE + print("Default Chat template set") + + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + + pipe = transformers.pipeline( + "text-generation", + model=model, + tokenizer=tokenizer, + pad_token_id=tokenizer.eos_token_id, + # temperature=temperature, + ) + + return pipe + + +def process_requests( + pipe, + prompts, + print_responses: bool = False, + max_new_tokens: int = 128, + batch_size: int = 1, +): + """Continuously process a list of prompts and handle the outputs.""" + # dataset = ListDataset(prompts) # TODO: issues making this work, mainly to show progress bar + # print(f"Processing {len(prompts)} prompts for model {pipe.model.name_or_path}") + responses = [ + i for i in pipe(prompts, max_new_tokens=max_new_tokens, batch_size=batch_size) + ] + + return responses diff --git a/src/target_tools/real-world-llms/src/translator.py b/src/target_tools/real-world-llms/src/translator.py new file mode 100644 index 000000000..07bfe2342 --- /dev/null +++ b/src/target_tools/real-world-llms/src/translator.py @@ -0,0 +1,200 @@ +import argparse +import json +import os +import re +from pathlib import Path +import utils + +def normalize_type(type_str, nested_level=0): + """ + Normalize the type string by removing module prefixes and simplifying typing constructs. + Example: 'builtins.str' -> 'str', + 'typing.Tuple[builtins.str, builtins.float]' -> 'Tuple[str, float]', + 'musictaxonomy.spotify.models.spotifyuser' -> 'SpotifyUser', + 'List[List[Tuple[str]]]' -> 'List[List[Any]]' if nested level > 2. 
+ """ + + if type_str is None: + return None + + # Remove extra quotes if present + if type_str.startswith('"') and type_str.endswith('"'): + type_str = type_str.strip('"') + + # Mapping of module prefixes to remove + type_mappings = { + "builtins.": "", + "typing.": "", + } + # Additional type mappings + additional_type_mappings = { + "integer": "int", + "string": "str", + "dictonary": "dict", + "method": "Callable", + "func": "Callable", + "function": "Callable", + "none": "None", + "Nonetype": "None", + "nonetype": "None", + "NoneType": "None", + } + + if type_str is None: + return None + + # Replace module prefixes + for prefix, replacement in type_mappings.items(): + type_str = type_str.replace(prefix, replacement) + + # Apply additional type mappings + type_str = additional_type_mappings.get(type_str, type_str) + + # Handle generic types (e.g., Tuple[], List[], Dict[]) + if "[" in type_str and "]" in type_str: + base_type, generic_content = type_str.split("[", 1) + generic_content = generic_content.rsplit("]", 1)[0] + # Process the generic parameters recursively + generic_params = [] + bracket_level = 0 + param = "" + for char in generic_content: + if char == "[": + bracket_level += 1 + param += char + elif char == "]": + bracket_level -= 1 + param += char + elif char == "," and bracket_level == 0: + generic_params.append(param.strip()) + param = "" + else: + param += char + if param: + generic_params.append(param.strip()) + + # If nested level is greater than 0, replace with Any + if nested_level > 0: + normalized_params = ["Any"] + else: + normalized_params = [normalize_type(param, nested_level + 1) for param in generic_params] + + return f"{base_type}[{', '.join(normalized_params)}]" + + + # Handle fully qualified names by extracting the last segment + if "." in type_str: + return type_str.split(".")[-1] + + # Return the simplified type + return type_str + + +def translate_pipeline(text, functions): + for func in functions: + text = func(text) + return text + + +def extract_class_name(text): + match = re.search(r"", text) + return match.group(1) if match else text + + +def extract_common_patterns(text): + try: + # Adjusted regular expression to match the specific patterns as instructed + pattern = ( + r"\|\|Return type of `.*?`:" + r" (\w+)|Return type: (\w+)|The return type of '.*?' is (\w+)|The type of" + r" '.*?' 
is a (\w+)|Type of `.*?`: (\w+)|Type of `.*?` is `(\w+)`|`.*?`" + r" return type: `(\w+)`|`.*?` is a function call that returns an (\w+)" + r" value|`(\w+)`: `\w+`|column \d+: `(\w+)`|column \d+ is '(\w+)'|type of" + r" '.*?': `(\w+)`" + ) + + # Extracting the types with the adjusted pattern + extracted_types = [] + matches = re.findall(pattern, text) + if matches: + for match in matches: + # Filter out empty matches and add the found type + found_types = [m for m in match if m] + if found_types: + return found_types[0] + else: + return text + + except Exception as e: + return text + + +# Create a list of functions to apply +functions = [extract_class_name, extract_common_patterns] + + +def translate_content(data): + type_mapping = { + "integer": "int", + "string": "str", + "dictonary": "dict", + "method": "Callable", + "func": "Callable", + "function": "Callable", + "none": "None", + "Nonetype": "None", + "nonetype": "None", + "NoneType": "None", + } + + try: + if isinstance(data, str): + data = json.loads(data) + except Exception as e: + print(f"Not a valid JSON: {e}") + raise utils.JsonException + + for entry in data: + if "type" in entry: + entry["type"] = [normalize_type(entry["type"][0])] + else: + entry["type"] = [] + + return data + + +def list_json_files(folder_path): + python_files = sorted(Path(folder_path).rglob("*.json")) + return python_files + + +def main_translator(benchmark_path): + json_files = list_json_files(benchmark_path) + error_count = 0 + for file in json_files: + try: + with open(file) as f: + data = json.load(f) + + # Run the inference here and gather results in /tmp/results + translated = translate_content(data) + + json_data = json.dumps(translated, indent=4) + with open(file, "w") as file: + file.write(json_data) + + except Exception as e: + print(f"Command returned non-zero exit status: {e} for file: {file}") + error_count += 1 + + print(f"Translator finished with errors:{error_count}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--benchmark_path", help="Specify the benchmark path", required=True + ) + + args = parser.parse_args() + main_translator(args.benchmark_path) \ No newline at end of file diff --git a/src/target_tools/real-world-llms/src/utils.py b/src/target_tools/real-world-llms/src/utils.py new file mode 100644 index 000000000..3d4b56e3f --- /dev/null +++ b/src/target_tools/real-world-llms/src/utils.py @@ -0,0 +1,538 @@ +import json +import os +import re +import shutil +import sys +import yaml + +import requests +import logging +import prompts +import copy +import tiktoken +import csv +from multiprocessing import Pool +from tqdm import tqdm + +logger = logging.getLogger("runner") +logger.setLevel(logging.INFO) + +# Initialize counters +exceeded_limit_count = 0 +within_limit_count = 0 + + +class JsonException(Exception): + pass + + +class TimeoutException(Exception): + pass + + +def is_ollama_online(server_url): + try: + res = requests.get(server_url) + # Check if the request was successful + if res.status_code == 200: + # Check the content of the response + if res.text == "Ollama is running": + return True + return False + except requests.exceptions.RequestException as e: + # Handle any exceptions that occur during the request + print(f"An error occurred: {e}") + return False + + +def copy_folder(src, dst): + """ + Copies a folder from the source (src) to the destination (dst). + If the destination folder exists, it retains its content and continues. 
+ If the destination folder does not exist, it is created. + + :param src: Source folder path + :param dst: Destination folder path + """ + # Check if the source directory exists + if not os.path.exists(src): + print(f"Source folder {src} does not exist.") + return + + # Copy the folder, keeping existing contents in destination if it exists + if os.path.exists(dst): + print(f"Destination folder {dst} already exists. Retaining its contents.") + else: + print(f"Destination folder {dst} does not exist. Creating it.") + + # Copy contents from source to destination + shutil.copytree(src, dst, dirs_exist_ok=True) + print(f"Folder copied from {src} to {dst}.") + + +def is_running_in_docker(): + """Check if Python is running inside a Docker container.""" + return ( + os.path.exists("/.dockerenv") + or os.environ.get( # Check if the /.dockerenv file exists + "DOCKER_CONTAINER", False + ) + or os.environ.get( # Check if DOCKER_CONTAINER environment variable is set + "DOCKER_IMAGE_NAME", False + ) # Check if DOCKER_IMAGE_NAME environment variable is set + ) + + +def generate_json_file(filename, type_info): + # Generate JSON file with type information + try: + if isinstance(type_info, list): + pass + else: + type_info = json.loads(type_info) + is_valid_json = True + except Exception as e: + is_valid_json = False + print(f"Not a valid JSON: {e}") + + json_data = json.dumps(type_info, indent=4) + with open(filename, "w") as file: + file.write(json_data) + + return is_valid_json + + +def generate_json_from_answers(repo, gt_json_file, answers): + try: + with open(gt_json_file, "r") as file: + gt_data = json.load(file) + + if isinstance(answers, str) and not re.search( + r"^\s*\d+\.\s+", answers, re.MULTILINE + ): + # Extract type from the answers string + type_match = re.search(r"^[^\n]+", answers) + if type_match: + extracted_type = type_match.group(0).strip() + parsed_answers = {0: extracted_type} + else: + parsed_answers = {0: answers.strip()} + else: + pattern = re.compile(r"^\s*(\d+)\.\s*(.+)\s*$", re.MULTILINE) + parsed_answers = pattern.findall(answers) + parsed_answers = {int(x) - 1: y for x, y in parsed_answers} + + # if len(gt_data) != len(parsed_answers): + # return [] + + # Filter gt_data to only include instances where the file name matches the repo + repo_gt_data = [entry for entry in gt_data if entry.get("file") == repo] + + answers_json_data = [] + for fact in range(len(repo_gt_data)): + _result = repo_gt_data[fact] + _result.pop("type") + if fact in parsed_answers: + _result["type"] = [parsed_answers[fact].strip()] + answers_json_data.append(_result) + + return answers_json_data + except Exception as e: + print("Error generating json from questions") + print(e) + return [] + + +def generate_answers_for_fine_tuning(json_data, file_path): + # Read and parse the JSON file + + repo_data = [entry for entry in json_data if entry.get("file") == file_path] + counter = 1 + answers = [] + for fact in repo_data: + answers.append(f"{counter}. 
{', '.join(fact['type'])}") + counter += 1 + + return "\n".join(answers) + + +def generate_questions_from_json(json_file): + # Read and parse the JSON file + with open(json_file, "r") as file: + data = json.load(file) + + questions = [] + + for entry in data: + file = entry["file"] + line_number = entry["line_number"] + col_offset = entry["col_offset"] + + # Generate different questions based on the content of each entry + # Function Return type + if "function" in entry and "parameter" not in entry and "variable" not in entry: + question = ( + "What is the return type of the function" + f" '{entry['function']}' at line {line_number}, column" + f" {col_offset}?" + ) + # Function Parameter type + elif "parameter" in entry: + question = ( + f"What is the type of the parameter '{entry['parameter']}' at line" + f" {line_number}, column {col_offset}, within the function" + f" '{entry['function']}'?" + ) + # Variable in a function type + elif "variable" in entry and "function" not in entry: + question = ( + f"What is the type of the variable '{entry['variable']}' at line" + f" {line_number}, column {col_offset}?" + ) + elif "variable" in entry and "function" in entry: + question = ( + f"What is the type of the variable '{entry['variable']}' at line" + f" {line_number}, column {col_offset}, within the function" + f" '{entry['function']}'?" + ) + else: + print("ERROR! Type could not be converted to types") + questions.append(question) + + if len(data) != len(questions): + print("ERROR! Type questions length does not match json length") + sys.exit(-1) + + questions = [f"{x}. {y}" for x, y in zip(range(1, len(questions) + 1), questions)] + return questions + + +def load_models_config(config_path): + models_config = {"models": {}, "custom_models": {}, "openai_models": {}} + with open(config_path, "r") as file: + config_data = yaml.safe_load(file) + for model_data in config_data["models"]: + models_config["models"][model_data["name"]] = model_data + for model_data in config_data["custom_models"]: + models_config["custom_models"][model_data["name"]] = model_data + for model_data in config_data["openai_models"]: + models_config["openai_models"][model_data["name"]] = model_data + + return models_config + + +def load_runner_config(config_path): + with open(config_path, "r") as file: + config_data = yaml.safe_load(file) + + return config_data["runner_config"] + + +def gather_code_files_from_test_folder(test_folder, language_extension="py"): + """Recursively gathers all code files with the specified language extension.""" + code_files = [] + for root, _, files in os.walk(test_folder): + for file in files: + if file.endswith(f".{language_extension}"): + code_files.append(os.path.join(root, file)) + return code_files + + +def get_token_count(text, prompt_id=None): + """ + Calculate the token count for the input text. + Supports strings, lists of strings, and lists of dictionaries with 'content' keys. 
+ """ + # Load the tokenizer + encoding = tiktoken.encoding_for_model("gpt-4o") + + if isinstance(text, list): + # If text is a list of dictionaries or strings + if all(isinstance(item, dict) and "content" in item for item in text): + # Extract 'content' field from dictionaries + text = " ".join(item["content"] for item in text) + else: + # Join list of strings + text = " ".join(str(item) for item in text) + + # Encode and count tokens + tokens = encoding.encode(text) + token_count = len(tokens) + + return token_count + + +def generate_questions_from_metadata(metadata): + """ + Generates questions based on the metadata passed (previously read from JSON). + """ + questions = [] + + for entry in metadata: + file = entry.get("file") + line_number = entry.get("line_number", "unknown") + col_offset = entry.get("col_offset", "unknown") + + # Ensure we have either 'function', 'parameter', or 'variable' + if "function" in entry and "parameter" not in entry and "variable" not in entry: + question = f"What is the return type of the function '{entry['function']}' at line {line_number}, column {col_offset}?" + elif "parameter" in entry: + question = f"What is the type of the parameter '{entry['parameter']}' at line {line_number}, column {col_offset}, within the function '{entry['function']}'?" + elif "variable" in entry and "function" not in entry: + question = f"What is the type of the variable '{entry['variable']}' at line {line_number}, column {col_offset}?" + elif "variable" in entry and "function" in entry: + question = f"What is the type of the variable '{entry['variable']}' at line {line_number}, column {col_offset}, within the function '{entry['function']}'?" + else: + print(f"ERROR! Type could not be converted to types for entry: {entry}") + continue + + questions.append(question) + + # Number the questions + questions = [f"{x}. {y}" for x, y in zip(range(1, len(questions) + 1), questions)] + + return questions + + +def truncate_prompt( + prompt, token_limit, tokenizer=tiktoken.encoding_for_model("gpt-4o") +): + """ + Truncate the prompt to fit within the specified token limit using a tokenizer. + :param prompt: The original prompt (list of dictionaries). + :param token_limit: The maximum number of tokens allowed. + :param tokenizer: The tokenizer to count tokens. + :return: The truncated prompt. + """ + total_tokens = 0 + truncated_prompt = [] + + for message in prompt: + # Tokenize the content of the message + try: + message_tokens = tokenizer.encode( + message["content"] + ) # Adjust tokenizer usage as needed + except TypeError: + # Fallback if the tokenizer doesn't have encode + message_tokens = tokenizer.tokenize(message["content"]) + + token_count = len(message_tokens) + + if total_tokens + token_count > token_limit: + # Calculate remaining tokens and truncate the message + remaining_tokens = token_limit - total_tokens + truncated_message_tokens = message_tokens[:remaining_tokens] + truncated_message = tokenizer.decode(truncated_message_tokens) + truncated_prompt.append( + {"role": message["role"], "content": truncated_message} + ) + break + else: + truncated_prompt.append(message) + total_tokens += token_count + + return truncated_prompt + + +def get_prompt( + prompt_id, + source_code, + metadata=None, + answers_placeholders=True, + use_system_prompt=True, + file_path=None, + token_limit=8192, +): + """ + Generates a prompt based on the given prompt_id, metadata, and file path. + + Args: + prompt_id (str): Identifier for the prompt template. 
+ file_path (str): Path to the file associated with the prompt. + metadata (list): Metadata used to generate questions (if applicable). + answers_placeholders (bool): Whether to include placeholder answers. + use_system_prompt (bool): Whether to include the system prompt. + + Returns: + dict: The generated prompt. + """ + if prompt_id in ["prompt_template_questions_based_2"]: + # Use metadata to generate questions + if metadata is None: + raise ValueError("Metadata is required for this prompt template.") + + questions_from_metadata = generate_questions_from_metadata(metadata) + prompt_data = { + "code": f"```{source_code}```", + "questions": "\n".join(questions_from_metadata), + "answers": ( + "\n".join([f"{x}." for x in range(1, len(questions_from_metadata) + 1)]) + if answers_placeholders + else "" + ), + } + + if use_system_prompt: + prompt = copy.deepcopy(eval(f"prompts.{prompt_id}")) + prompt[1]["content"] = prompt[1]["content"].format(**prompt_data) + else: + prompt = copy.deepcopy(eval(f"prompts.{prompt_id}_no_sys")) + prompt[0]["content"] = prompt[0]["content"].format(**prompt_data) + + elif prompt_id in ["prompt_template_masked_code_based_1"]: + json_filepath = str(source_code).replace(".py", "_gt.json") + test_dir = os.path.dirname(json_filepath) + code_files = gather_code_files_from_test_folder(test_dir) + + # Concatenate code contents with masked file input + code = "" + for code_file in code_files: + try: + with open(code_file, "r") as file: + masked_code_content = ( + file.read() + ) # Assuming files are already masked + relative_path = os.path.relpath(code_file, test_dir) + # Add filename to the code content for context + code += f"```{relative_path}\n{masked_code_content}```\n\n" + except FileNotFoundError: + logger.warning(f"Code file {code_file} not found. Skipping.") + + prompt_data = { + "code": code, + "instructions": ( + "You are given a Python code snippet where all type annotations are currently represented by the placeholder '[MASK]'. " + "Your task is to replace '[MASK]' with the most appropriate Python type annotations, such as 'str', 'int', 'callable', etc., " + "for all function return types, variable annotations, and function parameters. " + "\n\nStrict Requirements:\n" + "1. Maintain the exact same structure, formatting, and indentation as in the input code.\n" + "2. Do not alter the line numbers or remove existing blank lines.\n" + "3. Do not add any additional blank lines or comments.\n" + "4. Do not add any explanations or extra information in the output.\n" + "5. Only return the annotated version of the code.\n" + "6. Ensure proper and consistent type annotations wherever applicable." 
+ ), + } + + if use_system_prompt: + prompt = copy.deepcopy(eval(f"prompts.{prompt_id}")) + prompt[1]["content"] = "{instructions}\n\n{code}".format(**prompt_data) + else: + prompt = copy.deepcopy(eval(f"prompts.{prompt_id}_no_sys")) + prompt[0]["content"] = "{instructions}\n\n{code}".format(**prompt_data) + + else: + raise ValueError(f"Unknown prompt_id: {prompt_id}") + + # Calculate token count + token_counts = get_token_count(prompt, prompt_id) + + if token_counts > token_limit: + global exceeded_limit_count + exceeded_limit_count += 1 + # Load existing data if the file exists + if os.path.exists("exceeded_token_limit_files.json"): + with open("exceeded_token_limit_files.json", "r") as file: + data = json.load(file) + else: + data = {"file_paths": []} + + # Append the new file path + data["file_paths"].append(file_path) + + # Write the updated data back to the file + with open("exceeded_token_limit_files.json", "w") as file: + json.dump(data, file, indent=4) + return None + else: + global within_limit_count + within_limit_count += 1 + + return prompt + + +def process_mapping(mapping): + assistant_message = { + "role": "assistant", + "content": generate_answers_for_fine_tuning( + mapping["json_data"], mapping["file_path"] + ), + } + mapping["prompt"].append(assistant_message) + return mapping["prompt"] + + +def dump_ft_jsonl(id_mapping, output_file): + # Load the first mapping's JSON file + first_mapping = next(iter(id_mapping.values())) + with open(first_mapping["json_filepath"], "r") as file: + json_data = json.load(file) + + mappings = copy.deepcopy(id_mapping) + with Pool() as pool: + with tqdm(total=len(mappings), desc="Processing mappings") as pbar: + prompts = [] + for mapping in mappings.values(): + mapping["json_data"] = json_data + result = process_mapping(mapping) + prompts.append(result) + pbar.update() + + with open(output_file, "w") as output: + for _m in prompts: + message = {"messages": _m} + output.write(json.dumps(message, ensure_ascii=False)) + output.write("\n") + + +def dump_batch_prompt_jsonl( + id_mapping, output_file, id_prefix="types", model="gpt-4o-mini" +): + with open(output_file, "w") as output: + for idx, _m in id_mapping.items(): + prompt_dict = { + "custom_id": f"request-{id_prefix}-{idx}", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": model, + "messages": _m["prompt"], + "max_tokens": 250, + }, + } + output.write(json.dumps(prompt_dict)) + output.write("\n") + + +def get_prompt_cost(prompts): + """ + Retrieves the token count of the given text. + + Args: + text (str): The text to be tokenized. + + Returns: + int: The token count. 
+ """ + + prices_per_token = { + "gpt-4o": 0.000005, + "gpt-4o-mini": 0.00000015, + } + + for model, price in prices_per_token.items(): + encoding = tiktoken.encoding_for_model(model) + number_of_tokens = len(encoding.encode(str(prompts))) + logger.info( + f"Number of tokens for model `{model}`: {number_of_tokens}" + + f" Cost: {number_of_tokens * price:.5f}" + ) + + +# Example usage: +# loader = ConfigLoader("models_config.yaml") +# loader.load_config() +# models = loader.get_models() +# for model in models: +# print(model.name, model.model_path) diff --git a/src/target_tools/real-world-llms/src/vllm_helpers.py b/src/target_tools/real-world-llms/src/vllm_helpers.py new file mode 100644 index 000000000..b0be99fe9 --- /dev/null +++ b/src/target_tools/real-world-llms/src/vllm_helpers.py @@ -0,0 +1,173 @@ +""" +This example shows how to use LoRA with different quantization techniques +for offline inference. + +Requires HuggingFace credentials for access. +""" + +import gc +from typing import List, Optional, Tuple + +import torch +from huggingface_hub import snapshot_download + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest +import utils + + +def create_test_prompts( + lora_path: str, +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + return [ + # this is an example of using quantization without LoRA + ( + "My name is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + None, + ), + # the next three examples use quantization with LoRA + ( + "my name is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-1", 1, lora_path), + ), + ( + "The capital of USA is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-2", 1, lora_path), + ), + ( + "The capital of France is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-3", 1, lora_path), + ), + ] + + +def process_requests( + engine: LLMEngine, + prompts: List[Tuple[str, SamplingParams, Optional[LoRARequest]]], + sampling_params: SamplingParams, + lora_request: Optional[LoRARequest], + print_responses: bool = False, +): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + responses = {} + total_prompts = len(prompts) + + while prompts or engine.has_unfinished_requests(): + if prompts: + prompt = prompts.pop(0) + engine.add_request( + str(request_id), prompt, sampling_params, lora_request=lora_request + ) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + for request_output in request_outputs: + if request_output.finished: + if responses.get(request_output.request_id) is None: + responses[request_output.request_id] = request_output + print(f"Processed {len(responses)}/{total_prompts}") + if print_responses: + print("----------------------------------------------------") + print(f"Prompt: {request_output.prompt}") + print(f"Output: {request_output.outputs[0].text}") + + return responses.values() + + +def initialize_engine( + model: str, quantization: str, lora_repo: Optional[str], max_model_len: int = 8192 +) -> LLMEngine: + """Initialize the LLMEngine.""" + + if quantization == "bitsandbytes": + # https://docs.vllm.ai/en/stable/quantization/bnb.html + # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique. 
+ # It quantizes the model when loading, with some config info from the + # LoRA adapter repo. So need to set the parameter of load_format and + # qlora_adapter_name_or_path as below. + if lora_repo: + engine_args = EngineArgs( + model=model, + quantization=quantization, + qlora_adapter_name_or_path=lora_repo, + load_format="bitsandbytes", + enable_lora=True, + max_lora_rank=64, + max_model_len=max_model_len, + # set it only in GPUs of limited memory + enforce_eager=True, + trust_remote_code=True, + ) + else: + engine_args = EngineArgs( + model=model, + quantization=quantization, + load_format="bitsandbytes", + max_model_len=max_model_len, + enforce_eager=True, + trust_remote_code=True, + ) + else: + engine_args = EngineArgs( + model=model, + max_model_len=max_model_len, + enforce_eager=True, + trust_remote_code=True, + ) + return LLMEngine.from_engine_args(engine_args) + + +def main(): + """Main function that sets up and runs the prompt processing.""" + + test_configs = [ + { + "name": "qlora_inference_example", + "model": "huggyllama/llama-7b", + "quantization": "bitsandbytes", + "lora_repo": "timdettmers/qlora-flan-7b", + }, + { + "name": "AWQ_inference_with_lora_example", + "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "quantization": "awq", + "lora_repo": "jashing/tinyllama-colorist-lora", + }, + { + "name": "GPTQ_inference_with_lora_example", + "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + "quantization": "gptq", + "lora_repo": "jashing/tinyllama-colorist-lora", + }, + ] + + for test_config in test_configs: + print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~") + engine = initialize_engine( + test_config["model"], test_config["quantization"], test_config["lora_repo"] + ) + lora_path = snapshot_download(repo_id=test_config["lora_repo"]) + test_prompts = create_test_prompts(lora_path) + process_requests(engine, test_prompts) + + # Clean up the GPU memory for the next test + del engine + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == "__main__": + main() diff --git a/src/target_tools/type4py/Dockerfile b/src/target_tools/type4py/Dockerfile index c782f09fa..d9a4c1736 100644 --- a/src/target_tools/type4py/Dockerfile +++ b/src/target_tools/type4py/Dockerfile @@ -8,3 +8,6 @@ COPY src /tmp/src RUN pip install --upgrade pip #RUN pip install . 
RUN pip install -r /app/requirements.txt + +# Run app.py when the container launches +ENTRYPOINT ["python", "/tmp/src/runner.py"] \ No newline at end of file diff --git a/src/target_tools/type4py/src/runner.py b/src/target_tools/type4py/src/runner.py index b076f5ff8..d28feafca 100644 --- a/src/target_tools/type4py/src/runner.py +++ b/src/target_tools/type4py/src/runner.py @@ -56,20 +56,28 @@ def process_file(file_path): def main_runner(args): + logger.info("args: %s", args) python_files = list_python_files(args.bechmark_path) + logger.info(f"Found {len(python_files)} python files") error_count = 0 - for file in python_files: + for i, file in enumerate(python_files): try: + logger.info(f"Processing file {i+1}/{len(python_files)}: {file}") # logger.debug(file) inferred = process_file(file) translated = translator.translate_content(inferred) + logger.info(f"Translated: {translated}") json_file_path = str(file).replace(".py", "_result.json") + result_dump_path = str(file).replace(".py", "_dump.json") with open(json_file_path, "w") as json_file: json.dump(translated, json_file, indent=4) + with open(result_dump_path, "w") as json_file: + json.dump(inferred, json_file, indent=4) + except Exception as e: logger.info(f"Command returned non-zero exit status: {e} for file: {file}") error_count += 1
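# --- Quick sanity check (illustrative sketch, not part of the patch above): the
# expected behaviour of the normalize_type helper introduced in translator.py,
# following its docstring and body. Assumes translator.py is importable from the
# working directory.
from translator import normalize_type

assert normalize_type("builtins.str") == "str"
assert normalize_type("typing.Tuple[builtins.str, builtins.float]") == "Tuple[str, float]"
# Generics nested deeper than one level collapse to Any:
assert normalize_type("List[List[Tuple[str]]]") == "List[List[Any]]"
# Fully qualified names keep only their last dotted segment, with case unchanged:
assert normalize_type("musictaxonomy.spotify.models.spotifyuser") == "spotifyuser"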
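# --- Illustrative sketch (not part of the patch): the numbered answer format that
# utils.generate_json_from_answers expects from a model reply, and how its regex maps
# answers back to 0-indexed question positions. The reply string below is hypothetical.
import re

model_reply = "1. int\n2. str\n3. List[int]"
pattern = re.compile(r"^\s*(\d+)\.\s*(.+)\s*$", re.MULTILINE)
parsed = {int(n) - 1: t.strip() for n, t in pattern.findall(model_reply)}
assert parsed == {0: "int", 1: "str", 2: "List[int]"}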
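# --- Illustrative sketch (not part of the patch): the token accounting behind
# utils.get_token_count and the token_limit gate in get_prompt, using the same
# tiktoken "gpt-4o" encoding the utilities request. The question text is made up.
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")
question = "What is the return type of the function 'main' at line 10, column 4?"
num_tokens = len(encoding.encode(question))
print(num_tokens)  # prompts whose total exceeds token_limit (default 8192) are skipped and logged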