Commit 58b3ee4e authored by chrg

Major cleanup

parent e15aaab0
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Callable
+from typing import *
 import subprocess
 import csv
 import logging
@@ -18,122 +17,100 @@ from . import utils
 log = logging.getLogger("regit")
-@dataclass
-class BlobHandler:
-    from subprocess import Popen
-
-    repo: git.Repo
-    blobs_handled: dict[int, int] = field(default_factory=dict)
-    is_relevant: Callable[[Path], bool] = lambda _: True
-    transform: Callable[[Path, bytes], bytes] = lambda _, b: b
-    filter: fr.RepoFilter | None = None
-    gitcat: Popen | None = None
-    bar: tqdm.tqdm | None = None
-
-    def __enter__(self) -> "BlobHandler":
-        from subprocess import PIPE
-
-        log.debug("Starting blob handler")
-        self.gitcat = utils.popen(
-            ["git", "-C", self.repo.working_dir, "cat-file", "--batch"],
-            stdin=PIPE,
-            stdout=PIPE,
-        )
-        if self.gitcat.stdin is None or self.gitcat.stdout is None:
-            raise RuntimeError("Could not start git cat-file")
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.gitcat is None:
-            return
-        if self.gitcat.stdin is not None:
-            self.gitcat.stdin.close()
-        self.gitcat.wait()
-        if self.bar:
-            self.bar.close()
-
-    # Some code borrowed from https://github.com/newren/git-filter-repo/blob/main/contrib/filter-repo-demos/lint-history
-    def __call__(self, commit: fr.Commit, metadata):
-        assert self.filter is not None
-        for change in commit.file_changes:
-            filename = Path(change.filename.decode("utf-8"))
-            if change.type == b"D" or not self.is_relevant(filename):
-                continue
-            if self.bar:
-                self.bar.update(1)
-            if change.blob_id not in self.blobs_handled:
-                content = self.read_blob(change.blob_id)
-                blob = fr.Blob(self.transform(filename, content))
-                self.filter.insert(blob)
-                self.blobs_handled[change.blob_id] = blob.id
-            change.blob_id = self.blobs_handled[change.blob_id]
-
-    def read_blob(self, blob_id: bytes):
-        log.debug("Reading blob %s", blob_id)
-        assert self.gitcat is not None
-        # To get the typecheck to pass
-        stdin, stdout = self.gitcat.stdin, self.gitcat.stdout
-        assert stdin is not None and stdout is not None
-        stdin.write(blob_id + b"\n")
-        stdin.flush()
-        _, _, objsize = stdout.readline().split()
-        return stdout.read(int(objsize) + 1)[:-1]
+# Some code borrowed from https://github.com/newren/git-filter-repo/blob/main/contrib/filter-repo-demos/lint-history
+@contextmanager
+def make_blobreader(repo: git.Repo):
+    with utils.popen(
+        ["git", "-C", repo.working_dir, "cat-file", "--batch"],
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+    ) as gitcat:
+        if gitcat.stdin is None or gitcat.stdout is None:
+            raise RuntimeError("Could not start git cat-file")
+
+        stdin, stdout = gitcat.stdin, gitcat.stdout
+
+        def read_blob(blob_id: bytes) -> bytes:
+            log.debug("Reading blob %s", blob_id)
+            # To get the typecheck to pass
+            assert stdin is not None and stdout is not None
+            stdin.write(blob_id + b"\n")  # type: ignore
+            stdin.flush()
+            _, _, objsize = stdout.readline().split()
+            return stdout.read(int(objsize) + 1)[:-1]  # type: ignore
+
+        yield read_blob
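
Note: `git cat-file --batch` answers each object id written to its stdin with a header line of the form `<oid> <type> <size>`, followed by the raw object contents and one trailing newline; that is why `read_blob` reads `objsize + 1` bytes and drops the last one. A minimal standalone sketch of the same exchange (one-shot rather than the persistent process above, assuming the current directory is a git repository):

import subprocess

def read_one_blob(blob_id: bytes) -> bytes:
    # One-shot equivalent of read_blob: spawn cat-file, request a single object.
    proc = subprocess.Popen(
        ["git", "cat-file", "--batch"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    assert proc.stdin is not None and proc.stdout is not None
    proc.stdin.write(blob_id + b"\n")
    proc.stdin.close()
    # Header is b"<oid> <type> <size>\n"; the body is followed by one newline.
    _, _, objsize = proc.stdout.readline().split()
    content = proc.stdout.read(int(objsize) + 1)[:-1]
    proc.wait()
    return content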
-@contextmanager
-def mktransformer(program, args, batch, on_error):
-    with tempfile.TemporaryDirectory() as folder:
-        folder = Path(folder)
-        pargs = list(args)
-
-        if batch:
-            formatproc = utils.popen(
-                [program] + pargs,
-                stdin=subprocess.PIPE,
-                stdout=subprocess.PIPE,
-                universal_newlines=False,
-            )
-            if formatproc.stdin is None or formatproc.stdout is None:
-                raise RuntimeError("Could not start the formatting program")
-            fin, fout = formatproc.stdin, formatproc.stdout
+def run_filter(
+    repo: git.Repo,
+    update: Callable[[Path, fr.RepoFilter, bytes], bytes] = lambda _, _n, x: x,
+    is_relevant: Callable[[Path], bool] = lambda x: True,
+    options: Optional[fr.FilteringOptions] = None,
+) -> fr.RepoFilter:
+    """Run a RepoFilter over the repo, applying `update` to every relevant blob."""
+
+    blobs_handled = dict()
+
+    def commit_callback(commit: fr.Commit, metadata):
+        nonlocal filter
+        for change in commit.file_changes:
+            filename = Path(change.filename.decode("utf-8"))
+            if change.type == b"D" or not is_relevant(filename):
+                continue
+            if change.blob_id not in blobs_handled:
+                blobs_handled[change.blob_id] = update(filename, filter, change.blob_id)
+            if change.blob_id != blobs_handled[change.blob_id]:
+                change.blob_id = blobs_handled[change.blob_id]
+
+    if options is None:
+        options = fr.FilteringOptions.parse_args(  # type: ignore
+            ["--prune-empty", "never", "--quiet"],
+            error_on_empty=False,
+        )
+
+    filter = fr.RepoFilter(
+        options,
+        commit_callback=commit_callback,
+    )
+    with utils.chdir(repo.working_dir):
+        filter.run()
+    return filter
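
Note: `run_filter` hands its own `RepoFilter` instance back into the `update` callback, so the callback can `insert` replacement blobs and return the new blob id; returning the id unchanged leaves the commit untouched. A sketch of the call shape (the predicate here is illustrative, not part of this commit):

# Pass-through run: visits every non-deleted, relevant change without
# rewriting anything, because update returns each blob id unchanged.
run_filter(
    repo=repo,
    update=lambda filename, filter, blob_id: blob_id,
    is_relevant=lambda path: path.suffix == ".c",  # hypothetical predicate
)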
-            def transformer(file: Path, content: bytes) -> bytes:
-                with utils.tfile(folder, file.name, content) as tmp_file:
-                    try:
-                        with utils.timeit(file.name):
-                            fin.write(str(tmp_file).encode("utf-8") + b"\n")  # type: ignore
-                            fin.flush()
-                            log.debug("Waiting for respond.")
-                            reps = fout.readline().decode()  # type:ignore
-                            log.debug("Process respond %s:", reps)
-                    except subprocess.CalledProcessError as e:
-                        if on_error in "fail":
-                            file = Path(file.name).with_suffix(".input").absolute()
-                            log.error("Writing argument to %s", file)
-                            with open(file, "wb") as f:
-                                f.write(content)
-                            raise
-                        elif on_error == "warn":
-                            log.warn("Process failed: ", e.cmd)
-                        elif on_error == "revert":
-                            return content
-                        elif on_error == "ignore":
-                            log.warn("Process failed: ", e.cmd)
-                        else:
-                            log.error("Unknown 'on error' value")
-                            raise KeyError()
-                    with open(tmp_file, "rb") as f:
-                        return f.read()
-
-            yield transformer
+@contextmanager
+def make_stream_transformer(program, args, config):
+    def transformer(file: Path, content: bytes) -> bytes:
+        try:
+            return utils.run_stdout([program] + list(args), input=content)
+        except subprocess.CalledProcessError as e:
+            on_error = config.get("on_error", "fail")
+            if on_error == "fail":
+                raise
+            elif on_error == "warn":
+                log.warning("Process failed: %s", e.cmd)
+                return content
+            elif on_error == "revert":
+                return content
+            elif on_error == "ignore":
+                return content
+            else:
+                log.error("Unknown 'on error' value")
+                raise KeyError()
+
+    yield transformer
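
Note: stream mode assumes the program is a plain stdin-to-stdout filter. `utils.run_stdout` is not part of this diff; it is assumed to behave roughly like the following stdlib wrapper:

import subprocess

def run_stdout(cmd: list[str], input: bytes) -> bytes:
    # Assumed behaviour: feed `input` on stdin, capture stdout, and raise
    # CalledProcessError on a non-zero exit (check=True).
    return subprocess.run(cmd, input=input, stdout=subprocess.PIPE, check=True).stdout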
-            formatproc.stdin.close()
-            formatproc.wait()
-
-        elif "{}" in pargs:
-            ix = pargs.index("{}")
-
-            def transformer(file: Path, content: bytes) -> bytes:
+@contextmanager
+def make_file_transformer(program, args, config):
+    with tempfile.TemporaryDirectory() as folder:
+        folder = Path(folder)
+        pargs = list(args)
+        ix = pargs.index("{}")
+
+        def transformer(file: Path, content: bytes) -> bytes:
@@ -141,23 +118,83 @@ def mktransformer(program, args, batch, on_error):
-                pargs[ix] = str(tmp_file)
-                try:
-                    utils.run([program] + pargs)
-                except subprocess.CalledProcessError:
-                    file = Path(file.name).with_suffix(".input").absolute()
-                    log.error("Writing argument to %s", file)
-                    with open(file, "wb") as f:
-                        f.write(content)
-                    raise
-                with open(tmp_file, "rb") as f:
-                    return f.read()
-
-            yield transformer
+            pargs[ix] = str(tmp_file)
+            try:
+                utils.run([program] + pargs)
+            except subprocess.CalledProcessError as e:
+                on_error = config.get("on_error", "fail")
+                if on_error == "fail":
+                    file = Path(file.name).with_suffix(".input").absolute()
+                    log.error("Writing argument to %s", file)
+                    with open(file, "wb") as f:
+                        f.write(content)
+                    raise
+                elif on_error == "warn":
+                    log.warning("Process failed: %s", e.cmd)
+                elif on_error == "revert":
+                    return content
+                elif on_error == "ignore":
+                    log.warning("Process failed: %s", e.cmd)
+                else:
+                    log.error("Unknown 'on error' value")
+                    raise KeyError()
+            with open(tmp_file, "rb") as f:
+                return f.read()
+
+        yield transformer
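
Note: file mode writes the blob to a temporary file, substitutes the file's path for the `{}` placeholder in the argument list, runs the program, and reads the possibly rewritten file back; this suits in-place tools such as `clang-format -i {}`. `utils.tfile` is also not shown in this diff; a plausible reading of it:

from contextlib import contextmanager
from pathlib import Path

@contextmanager
def tfile(folder: Path, name: str, content: bytes):
    # Assumed behaviour: materialize `content` under `folder` using the
    # original file name, yield the path, and clean up afterwards.
    tmp = folder / name
    tmp.write_bytes(content)
    try:
        yield tmp
    finally:
        tmp.unlink(missing_ok=True)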
-        else:
-
-            def transformer(file: Path, content: bytes) -> bytes:
-                return utils.run_stdout([program] + pargs, input=content)
-
-            yield transformer
+@contextmanager
+def make_batch_transformer(program, args, config):
+    with (
+        tempfile.TemporaryDirectory() as folder,
+        utils.popen(
+            [program] + list(args),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            universal_newlines=False,
+        ) as formatproc,
+    ):
+        folder = Path(folder)
+        if formatproc.stdin is None or formatproc.stdout is None:
+            raise RuntimeError("Could not start the formatting program")
+        fin, fout = formatproc.stdin, formatproc.stdout
+
+        def transformer(file: Path, content: bytes) -> bytes:
+            with utils.tfile(folder, file.name, content) as tmp_file:
+                with utils.timeit(file.name):
+                    fin.write(str(tmp_file).encode("utf-8") + b"\n")  # type: ignore
+                    fin.flush()
+                    log.debug("Waiting for response.")
+                    reps = fout.readline().decode()  # type: ignore
+                    log.debug("Process response: %s", reps)
+                with open(tmp_file, "rb") as f:
+                    return f.read()
+
+        yield transformer
+
+        formatproc.stdin.close()
+        formatproc.wait()
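
Note: batch mode keeps a single long-lived worker: the transformer writes one temp-file path per line to the worker's stdin, blocks until the worker prints one reply line, then reads the rewritten file back. A toy worker satisfying that contract (hypothetical, not part of this commit):

# Reads one file path per stdin line, rewrites the file in place, and
# acknowledges each path with one output line.
import sys

for line in sys.stdin:
    path = line.strip()
    with open(path, "rb") as f:
        data = f.read()
    with open(path, "wb") as f:
        f.write(data.upper())  # toy transformation
    print("done", path, flush=True)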
+def countfiles(repo: git.Repo, is_relevant: Callable[[Path], bool]) -> Counter[Path]:
+    """Count the number of files that will be processed."""
+    from collections import Counter
+
+    cnt = Counter()
+
+    def count(f, _, x):
+        cnt.update([f])
+        return x
+
+    run_filter(
+        repo=repo,
+        update=count,
+        is_relevant=is_relevant,
+    )
+    return cnt
 @click.command()
 @click.option(
@@ -178,6 +215,13 @@ def mktransformer(program, args, batch, on_error):
     help="the glob-pattern to match files.",
     type=str,
 )
+@click.option(
+    "-t",
+    "--type",
+    help="the type of transformation to apply.",
+    type=click.Choice(["batch", "stream", "file", "auto"], case_sensitive=False),
+    default="auto",
+)
 @click.option(
     "-m",
     "--mapping",
@@ -203,11 +247,11 @@ def regit(
     pattern: str | None,
     output: Path,
     mapping,
+    type: str,
     program: Path,
-    on_error: str,
     args: tuple[str],
     verbose: int,
-    batch: bool,
+    **config,
 ):
     """A simple program that runs a command on every commit on a repo."""
@@ -224,6 +268,18 @@ def regit(
     repo = git.Repo.clone_from(url=repo, to_path=output, no_local=True)
     log.info("Cloned repo to %s", output)
 
+    if type == "auto":
+        if "{}" in args:
+            type = "file"
+        else:
+            type = "stream"
+
+    transformer = {
+        "file": make_file_transformer,
+        "stream": make_stream_transformer,
+        "batch": make_batch_transformer,
+    }[type]
+
     def is_relevant(file: Path):
         if pattern is None:
             return True
@@ -231,49 +287,27 @@ def regit(
         log.debug("Check if %s matched pattern %s", file, match)
         return match
 
-    if True:
-        options = fr.FilteringOptions.parse_args(
-            ["--prune-empty", "never", "--quiet"],
-            error_on_empty=False,
-        )
-
-        from collections import Counter
-
-        cnt = Counter()
-
-        def find_files(commit, metadata):
-            for change in commit.file_changes:
-                filename = Path(change.filename.decode("utf-8"))
-                if change.type == b"D" or not is_relevant(filename):
-                    continue
-                cnt.update([filename])
-
-        filter = fr.RepoFilter(options, commit_callback=find_files)
-        with utils.timeit("prefilter", log.info), utils.chdir(repo.working_dir):
-            filter.run()
-
-        log.debug(f"Continue to format {cnt.total()} files")
-        for f, c in cnt.most_common():
-            log.debug(f"{f}: {c}")
-
-    with mktransformer(program, args, batch, on_error) as transformer:
-        handler = BlobHandler(
-            repo,
-            is_relevant=is_relevant,
-            transform=transformer,
-            bar=tqdm.tqdm(total=cnt.total()),
-        )
-        log.debug("Starting handler")
-        with handler:
-            options = fr.FilteringOptions.parse_args(
-                ["--prune-empty", "never"],
-                error_on_empty=False,
-            )
-            filter = fr.RepoFilter(options, commit_callback=handler)
-            handler.filter = filter
-            with utils.timeit("git filter", log.info), utils.chdir(repo.working_dir):
-                filter.run()
+    with utils.timeit("counting files", logfn=log.info):
+        cnt = countfiles(repo, is_relevant)
+
+    with (
+        transformer(program, args, config) as transform,
+        make_blobreader(repo) as blobreader,
+        tqdm.tqdm(total=cnt.total(), unit="files", desc="formatting files") as bar,
+    ):
+
+        def update(filename: Path, filter: fr.RepoFilter, blob_id: bytes) -> bytes:
+            bar.update()
+            content = blobreader(blob_id)
+            blob = fr.Blob(transform(filename, content))
+            filter.insert(blob)
+            return blob.id  # type: ignore
+
+        filter = run_filter(
+            repo=repo,
+            update=update,
+            is_relevant=is_relevant,
+        )
 
     if mapping:
         log.debug("Writing mapping...")
...