More changes in the main.py, and updated README.md

68ad38c8 · s200545 · a49ba3ad · 68ad38c8 · 68ad38c8
Commit 68ad38c8 authored 2 years ago by s200545
--- a/README.md
+++ b/README.md
 # formast

-A AST formatting tool
+An AST formatting tool to run on java files, with different formatting options. Available options are: 
+    - tokenize
+    - ast 
+    - relative ast
+    - sorted compressed ast


 ## Develop

-To run the executable:
+Remember to download tree-sitter-java and run the following command (uncomment it in the __main__.py) to generate the language file, when you are running formastfor the first time.

 ```console
-pdm run python -m formast <args>
+ Language.build_library(
+     # Store the library in the `build` directory
+     'build/my-languages.so',
+
+     # Include one or more languages
+     [
+         '../../vendor/tree-sitter-java'
+     ]
+ )
+ ´´´
+
+To see the available options:
+```console
+pdm run formast --help
 ```

-How to run: 
+To run the executable:
+```console
+pdm run python -m formast <args>
 ```
-2>&1 | tee log.txt 
-
-pdm run regit -vv --repo "C:\Users\boran\OneDrive\DTU\BSc Thesis\babyrepos\onlinebookstore" -o "C:\Users\boran\OneDrive\DTU\BSc Thesis\babyrepos\onlinebookstore_ast10" -p '**\*.java' -m mapping.csv -- formast --writeast --overwrite {}


-pdm run python .\formast\commitdiff.py -w "C:\Users\boran\OneDrive\DTU\BSc Thesis\babyrepos\onlinebookstore" output_nws.csv
-
-```
--- a/src/formast/__main__.py
+++ b/src/formast/__main__.py
@@ -9,6 +9,7 @@ from pathlib import Path

 log = logging.getLogger(__name__)

+## Uncomment this block if you don't have the language file yet, first time running formast
 # Language.build_library(
 #     # Store the library in the `build` directory
 #     'build/my-languages.so',
@@ -19,6 +20,7 @@ log = logging.getLogger(__name__)
 #     ]
 # )

+# Load the language (assuming the language is created, if not run the commented code above)
 language_file = Path(__file__).absolute().parent.parent.parent / 'build' / 'my-languages.so'
 JAVA_LANGUAGE = Language(str(language_file), 'java')

@@ -26,23 +28,18 @@ JAVA_LANGUAGE = Language(str(language_file), 'java')
 @click.option("--writetoken", is_flag=True, help="Write the new file with the tokenized content")
 @click.option("--writeast", is_flag=True, help="Write the new file with the AST content")
 @click.option("--writerelativeast", is_flag=True, help="Write the AST content with relative positions")
-@click.option("--writecompast", is_flag=True, help="Write the new file with the compressed AST content and hashing")
 @click.option("--writecompastsort", is_flag=True, help="Write the new file with the compressed AST content and sorted hashing")
 @click.option("--overwrite", is_flag=True, help="Overwrite the original .java file with the new content")
 @click.option("-v", "--verbose", count=True, help="Increase output verbosity")
 @click.argument("file_path", type=str) 

-
 ## Formast 
-def formast(file_path, writetoken, writeast, writerelativeast, writecompast, writecompastsort, overwrite, verbose):
+def formast(file_path, writetoken, writeast, writerelativeast, writecompastsort, overwrite, verbose):

    # initialize logging
    logging.basicConfig(level=verbose)

-    # Load the language (assuming the language is created, if not run the commented code above)
-    language_file = Path(__file__).absolute().parent.parent.parent / 'build' / 'my-languages.so'
    log.debug("Using language file: %s", language_file)
-    JAVA_LANGUAGE = Language(str(language_file), 'java')
    parser = Parser()
    parser.set_language(JAVA_LANGUAGE)

@@ -51,14 +48,14 @@ def formast(file_path, writetoken, writeast, writerelativeast, writecompast, wri
            line = sys.stdin.readline()
            if not line:
                break
-            process(Path(line.strip()), parser, overwrite, writetoken, writeast, writerelativeast, writecompast, writecompastsort)
+            process(Path(line.strip()), parser, overwrite, writetoken, writeast, writerelativeast, writecompastsort)
            log.info("processed %s" % line)
            sys.stdout.write("ok\n")
            sys.stdout.flush()
    else:
-        process(Path(file_path), parser, overwrite, writetoken, writeast, writerelativeast, writecompast, writecompastsort)
+        process(Path(file_path), parser, overwrite, writetoken, writeast, writerelativeast, writecompastsort)
        
-def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast, writecompast, writecompastsort):
+def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast, writecompastsort):

    log.info(f"Processing {file_path}...")
    
@@ -73,8 +70,6 @@ def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast
                if node.child_count == 0:
                    f.write(node.text.decode('utf-8'))
                    f.write("\n")
-        #tree = parse_java_file(parser, file_path)
-        #save_tokenized_file(file_path, tree)
    # AST based
    elif writeast:
        with open(file_path, "rb") as f:
@@ -83,14 +78,6 @@ def process(file_path, parser, overwrite, writetoken, writeast, writerelativeast
        ast_code = process_tree_ast(tree)
        with open(file_path.with_suffix(".ast"), 'w', encoding='utf-8') as f:
            f.write(ast_code)
-    # Compressed AST based
-    elif writecompast:
-        with open(file_path, "rb") as f:
-            code = f.read()
-        tree = parser.parse(code)
-        ast_code = process_tree_comp(tree)
-        with open(file_path.with_suffix(".ast"), 'w', encoding='utf-8') as f:
-            f.write(ast_code)
    # Sorted and compressed AST based
    elif writecompastsort:
        with open(file_path, "rb") as f:
@@ -204,50 +191,7 @@ def process_tree_ast_relatively(tree):
    process_node(tree.root_node)
    return '\n'.join(lines)

-## Processes a tree, compressed AST with hashing
-def process_tree_comp(tree):
-    if tree is None:
-        raise ValueError("The tree object must not be None")
-
-    lookup = {}
-    lines = []
-
-    def process_node(node):
-        if node is None:
-            raise ValueError("The tree object does not have the expected structure")
-
-        line = ""
-
-        if node.children:
-            children = []
-            for child in node.children:
-                child_result = process_node(child)
-                if child_result is not None:
-                    children.append(child_result)
-            line = 'B {} {}'.format(node.type, ' '.join(map(str, children)))
-        else:
-            try:
-                text = node.text.decode('utf-8')
-            except UnicodeDecodeError:
-                raise ValueError("The text of the leaf nodes must be encoded using utf-8")
-            line = 'L {}'.format(text)
-
-        idx = lookup.get(line)
-        if idx is not None:
-            return idx
-
-        hash_object = hashlib.sha256(line.encode('utf-8'))
-        hash_value = int.from_bytes(hash_object.digest()[:8], byteorder='big', signed=True)
-        idx = base64.urlsafe_b64encode(hash_value.to_bytes(8, byteorder='big', signed=True)).rstrip(b'=').decode('ascii')
-        lines.append('{} {}'.format(idx, line))
-        lookup[line] = idx
-
-        return idx
-
-    process_node(tree.root_node)
-    return '\n'.join(lines)
-
-#Same as before, but this time sort hash values in the output
+# Show every instance of the program just once as an AST, and hash the values
 def process_tree_comp_sorted(tree):
    if tree is None:
        raise ValueError("The tree object must not be None")
@@ -291,27 +235,6 @@ def process_tree_comp_sorted(tree):
    sorted_lines = sorted(lines, key=lambda x: x.split()[0])
    return '\n'.join(sorted_lines)    

-## Parse the java file and return the tree
-def parse_java_file(parser, file_path):
-    if not is_java_file(file_path):
-        raise ValueError('File must have a .java extension')
-    
-    with open(file_path, 'r', encoding='utf-8') as f:
-        src = f.read()
-
-    tree_word = parser.parse(src.encode("utf8"))
-    #tree_word = parser.parse(src)
-
-    return tree_word 
-
-## Save tokenized file
-def save_tokenized_file(file_path, tree):
-    with open(file_path.with_suffix(".ast"),'w', encoding='utf-8') as f:
-        for node in traverse(tree):
-            if node.child_count == 0:
-                    f.write(node.text.decode('utf-8'))
-                    f.write("\n")
-
 ## Check if the file is a java file
 def is_java_file(file_path):
    return os.path.splitext(file_path)[1] == '.java'