Skip to content
Snippets Groups Projects
Commit 68ad38c8 authored by s200545's avatar s200545
Browse files

More changes in the main.py, and updated README.md

parent a49ba3ad
No related branches found
No related tags found
No related merge requests found
# formast
A AST formatting tool
An AST formatting tool to run on java files, with different formatting options. Available options are:
- tokenize
- ast
- relative ast
- sorted compressed ast
## Develop
To run the executable:
Remember to download tree-sitter-java. When you are running formast for the first time, also run the following command (uncomment it in `__main__.py`) to generate the language file.
```console
pdm run python -m formast <args>
Language.build_library(
# Store the library in the `build` directory
'build/my-languages.so',
# Include one or more languages
[
'../../vendor/tree-sitter-java'
]
)
```
To see the available options:
```console
pdm run formast --help
```
How to run:
To run the executable:
```console
pdm run python -m formast <args>
```
2>&1 | tee log.txt
pdm run regit -vv --repo "C:\Users\boran\OneDrive\DTU\BSc Thesis\babyrepos\onlinebookstore" -o "C:\Users\boran\OneDrive\DTU\BSc Thesis\babyrepos\onlinebookstore_ast10" -p '**\*.java' -m mapping.csv -- formast --writeast --overwrite {}
pdm run python .\formast\commitdiff.py -w "C:\Users\boran\OneDrive\DTU\BSc Thesis\babyrepos\onlinebookstore" output_nws.csv
```
......@@ -9,6 +9,7 @@ from pathlib import Path
log = logging.getLogger(__name__)
## Uncomment this block if you don't have the language file yet, first time running formast
# Language.build_library(
# # Store the library in the `build` directory
# 'build/my-languages.so',
......@@ -19,6 +20,7 @@ log = logging.getLogger(__name__)
# ]
# )
# Load the language (assuming the language is created, if not run the commented code above)
language_file = Path(__file__).absolute().parent.parent.parent / 'build' / 'my-languages.so'
JAVA_LANGUAGE = Language(str(language_file), 'java')
......@@ -26,23 +28,18 @@ JAVA_LANGUAGE = Language(str(language_file), 'java')
@click.option("--writetoken", is_flag=True, help="Write the new file with the tokenized content")
@click.option("--writeast", is_flag=True, help="Write the new file with the AST content")
@click.option("--writerelativeast", is_flag=True, help="Write the AST content with relative positions")
@click.option("--writecompast", is_flag=True, help="Write the new file with the compressed AST content and hashing")
@click.option("--writecompastsort", is_flag=True, help="Write the new file with the compressed AST content and sorted hashing")
@click.option("--overwrite", is_flag=True, help="Overwrite the original .java file with the new content")
@click.option("-v", "--verbose", count=True, help="Increase output verbosity")
@click.argument("file_path", type=str)
## Formast
def formast(file_path, writetoken, writeast, writerelativeast, writecompast, writecompastsort, overwrite, verbose):
def formast(file_path, writetoken, writeast, writerelativeast, writecompastsort, overwrite, verbose):
# initialize logging
logging.basicConfig(level=verbose)
# Load the language (assuming the language is created, if not run the commented code above)
language_file = Path(__file__).absolute().parent.parent.parent / 'build' / 'my-languages.so'
log.debug("Using language file: %s", language_file)
JAVA_LANGUAGE = Language(str(language_file), 'java')
parser = Parser()
parser.set_language(JAVA_LANGUAGE)
......@@ -51,14 +48,14 @@ def formast(file_path, writetoken, writeast, writerelativeast, writecompast, wri
line = sys.stdin.readline()
if not line:
break
process(Path(line.strip()), parser, overwrite, writetoken, writeast, writerelativeast, writecompast, writecompastsort)
process(Path(line.strip()), parser, overwrite, writetoken, writeast, writerelativeast, writecompastsort)
log.info("processed %s" % line)
sys.stdout.write("ok\n")
sys.stdout.flush()
else:
process(Path(file_path), parser, overwrite, writetoken, writeast, writerelativeast, writecompast, writecompastsort)
process(Path(file_path), parser, overwrite, writetoken, writeast, writerelativeast, writecompastsort)
def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast, writecompast, writecompastsort):
def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast, writecompastsort):
log.info(f"Processing {file_path}...")
......@@ -73,8 +70,6 @@ def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast
if node.child_count == 0:
f.write(node.text.decode('utf-8'))
f.write("\n")
#tree = parse_java_file(parser, file_path)
#save_tokenized_file(file_path, tree)
# AST based
elif writeast:
with open(file_path, "rb") as f:
......@@ -83,14 +78,6 @@ def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast
ast_code = process_tree_ast(tree)
with open(file_path.with_suffix(".ast"), 'w', encoding='utf-8') as f:
f.write(ast_code)
# Compressed AST based
elif writecompast:
with open(file_path, "rb") as f:
code = f.read()
tree = parser.parse(code)
ast_code = process_tree_comp(tree)
with open(file_path.with_suffix(".ast"), 'w', encoding='utf-8') as f:
f.write(ast_code)
# Sorted and compressed AST based
elif writecompastsort:
with open(file_path, "rb") as f:
......@@ -204,50 +191,7 @@ def process_tree_ast_relatively(tree):
process_node(tree.root_node)
return '\n'.join(lines)
## Processes a tree, compressed AST with hashing
def process_tree_comp(tree):
    """Serialize a parsed tree as a compressed, hash-addressed AST listing.

    Each distinct node is emitted exactly once as ``<id> <line>``, where
    ``<line>`` is ``L <text>`` for a node without children or
    ``B <type> <child ids...>`` for a node with children, and ``<id>`` is the
    unpadded URL-safe base64 encoding of the first 8 bytes of the SHA-256
    digest of ``<line>``. Lines are listed in the order nodes finish
    processing, so children always precede the node that references them.

    Args:
        tree: Object exposing ``root_node``; every node must expose ``type``,
            ``children`` and ``text`` (bytes).

    Returns:
        The listing as a single newline-joined string.

    Raises:
        ValueError: If ``tree`` or any visited node is ``None``, or if the
            text of a childless node is not valid UTF-8.
    """
    if tree is None:
        raise ValueError("The tree object must not be None")

    lookup = {}  # serialized line -> id, so identical subtrees are emitted once
    lines = []

    def process_node(node):
        if node is None:
            raise ValueError("The tree object does not have the expected structure")

        if node.children:
            # Children are resolved first so the branch line can refer to their ids.
            child_ids = [process_node(child) for child in node.children]
            line = 'B {} {}'.format(node.type, ' '.join(child_ids))
        else:
            try:
                text = node.text.decode('utf-8')
            except UnicodeDecodeError as err:
                raise ValueError(
                    "The text of the leaf nodes must be encoded using utf-8"
                ) from err
            line = 'L {}'.format(text)

        idx = lookup.get(line)
        if idx is not None:
            return idx

        # The first 8 digest bytes are encoded directly; converting them to a
        # signed integer and back yields the very same bytes.
        digest_prefix = hashlib.sha256(line.encode('utf-8')).digest()[:8]
        idx = base64.urlsafe_b64encode(digest_prefix).rstrip(b'=').decode('ascii')
        lines.append('{} {}'.format(idx, line))
        lookup[line] = idx
        return idx

    process_node(tree.root_node)
    return '\n'.join(lines)
#Same as before, but this time sort hash values in the output
# Show every instance of the program just once as an AST, and hash the values
def process_tree_comp_sorted(tree):
if tree is None:
raise ValueError("The tree object must not be None")
......@@ -291,27 +235,6 @@ def process_tree_comp_sorted(tree):
sorted_lines = sorted(lines, key=lambda x: x.split()[0])
return '\n'.join(sorted_lines)
## Parse the java file and return the tree
def parse_java_file(parser, file_path):
    """Read a ``.java`` file as UTF-8 text and return the parser's result.

    Args:
        parser: Object whose ``parse`` method accepts the source as bytes.
        file_path: Path of the file to read; it must pass ``is_java_file``.

    Returns:
        Whatever ``parser.parse`` returns for the UTF-8 encoded source.

    Raises:
        ValueError: If ``file_path`` is rejected by ``is_java_file``.
    """
    if is_java_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as source_file:
            # The parser is fed bytes, so the decoded text is re-encoded.
            source_bytes = source_file.read().encode("utf8")
        return parser.parse(source_bytes)
    raise ValueError('File must have a .java extension')
## Save tokenized file
def save_tokenized_file(file_path, tree):
    """Write the text of every childless node to ``<file_path>.ast``.

    The output file sits next to ``file_path`` with the suffix replaced by
    ``.ast`` and receives one line per node whose ``child_count`` is 0, in the
    order produced by ``traverse(tree)``.

    Args:
        file_path: Object providing ``with_suffix`` (e.g. ``pathlib.Path``).
        tree: Parsed tree handed to ``traverse``.
    """
    target = file_path.with_suffix(".ast")
    with open(target, 'w', encoding='utf-8') as out:
        # NOTE(review): assumes the newline is written once per leaf token,
        # not once per visited node - confirm against the original layout.
        leaves = (node for node in traverse(tree) if node.child_count == 0)
        for leaf in leaves:
            out.write(leaf.text.decode('utf-8') + "\n")
## Check if the file is a java file
def is_java_file(file_path):
    """Return True when the extension of ``file_path`` is exactly ``.java``.

    The comparison is case-sensitive and uses ``os.path.splitext``.
    """
    _stem, extension = os.path.splitext(file_path)
    return extension == '.java'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment