# cli.py
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import dulwich.repo
import os
import sys
from functools import partial
from urllib.parse import urlparse
from swh.model import hashutil
from swh.model.identifiers import (
origin_identifier,
snapshot_identifier,
parse_swhid,
swhid,
SWHID,
CONTENT,
DIRECTORY,
)
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
# Click context settings: accept the short "-h" flag in addition to "--help".
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}

# Translation table from dulwich object type names (bytes) to the matching
# Software Heritage object type names. Consulted when computing snapshot IDs.
_DULWICH_TYPES = {
    b"blob": "content",
    b"tree": "directory",
    b"commit": "revision",
    b"tag": "release",
}
class SWHIDParamType(click.ParamType):
    """Click parameter type that only accepts syntactically valid SWHIDs."""

    name = "persistent identifier"

    def convert(self, value, param, ctx):
        """Check that ``value`` parses as a SWHID and hand it back unchanged.

        The value is deliberately returned as the original string rather than
        as the parsed object, because callers only need the textual form.
        Parsing failures are reported through ``self.fail``.
        """
        try:
            parse_swhid(value)
        except ValidationError as error:
            self.fail("%s is not a valid SWHID. %s." % (value, error), param, ctx)
        else:
            return value
def swhid_of_file(path):
    """Return the content SWHID of the file located at ``path``."""
    return swhid(CONTENT, Content.from_file(path=path).get_data())
def swhid_of_file_content(data):
    """Return the content SWHID of an in-memory blob.

    Args:
        data: raw bytes of the content to identify.

    Returns:
        The SWHID produced by ``swhid(CONTENT, ...)`` for that content.
    """
    # Permission bits must be written as an octal literal: the decimal value
    # 644 is 0o1204, not the intended rw-r--r-- (0o644).
    content_data = Content.from_bytes(mode=0o644, data=data).get_data()
    return swhid(CONTENT, content_data)
def swhid_of_dir(path):
    """Return the directory SWHID of the on-disk tree rooted at ``path``."""
    return swhid(DIRECTORY, Directory.from_disk(path=path).get_data())
def swhid_of_origin(url):
    """Return, as a string, the origin SWHID corresponding to ``url``."""
    origin_id = origin_identifier({"url": url})
    return str(SWHID(object_type="origin", object_id=origin_id))
def swhid_of_git_repo(path):
    """Return, as a string, the snapshot SWHID of the git repository at ``path``.

    Every reference of the repository becomes a snapshot branch: plain
    references point to their target object (typed through
    ``_DULWICH_TYPES``), symbolic references are recorded as aliases.

    Args:
        path: location of the git repository, as accepted by
            ``dulwich.repo.Repo``.

    Returns:
        The string form of the snapshot SWHID.
    """
    branches = {}
    # Open the repository as a context manager so that the files it holds
    # open are released as soon as the references have been read.
    with dulwich.repo.Repo(path) as repo:
        for ref, target in repo.refs.as_dict().items():
            obj = repo[target]
            if obj:
                branches[ref] = {
                    "target": hashutil.bytehex_to_hash(target),
                    "target_type": _DULWICH_TYPES[obj.type_name],
                }
            else:
                branches[ref] = None

        # Symbolic refs are processed last on purpose: they overwrite the
        # resolved entry recorded above with an alias to the ref they name.
        for ref, target in repo.refs.get_symrefs().items():
            branches[ref] = {
                "target": target,
                "target_type": "alias",
            }

    snapshot_id = snapshot_identifier({"branches": branches})
    return str(SWHID(object_type="snapshot", object_id=snapshot_id))
def identify_object(obj_type, follow_symlinks, obj):
    """Compute the SWHID of one object given on the command line.

    Args:
        obj_type: "auto", "content", "directory", "origin" or "snapshot".
            With "auto" the type is guessed: "-" and regular files are
            contents, directories are directories, and anything else that
            parses as a URL with a scheme is an origin.
        follow_symlinks: when true and ``obj`` is a symlink, identify the
            link target instead of the link itself (content and directory
            objects only).
        obj: the object to identify: "-" for standard input, a file system
            path, an origin URL, or the path of a git repository.

    Returns:
        A ``(obj, swhid)`` tuple; ``obj`` is the argument as given by the
        user, so that the original name can be shown in the output.

    Raises:
        click.BadParameter: if the type cannot be detected automatically or
            is not one of the supported types.
    """
    if obj_type == "auto":
        if obj == "-" or os.path.isfile(obj):
            obj_type = "content"
        elif os.path.isdir(obj):
            obj_type = "directory"
        else:
            # urlparse may itself raise ValueError on malformed input; treat
            # that the same way as a missing scheme.
            try:
                has_scheme = bool(urlparse(obj).scheme)
            except ValueError:
                has_scheme = False
            if not has_scheme:
                raise click.BadParameter("cannot detect object type for %s" % obj)
            obj_type = "origin"

    if obj == "-":
        computed = swhid_of_file_content(sys.stdin.buffer.read())
    elif obj_type in ("content", "directory"):
        # os.fsencode round-trips file names that are not valid in the file
        # system encoding (surrogateescape), where a plain str.encode fails.
        path = os.fsencode(obj)
        if follow_symlinks and os.path.islink(path):
            # Resolve on the bytes path so that the helpers below always
            # receive bytes, whether or not a symlink was followed.
            path = os.path.realpath(path)
        if obj_type == "content":
            computed = swhid_of_file(path)
        else:
            computed = swhid_of_dir(path)
    elif obj_type == "origin":
        computed = swhid_of_origin(obj)
    elif obj_type == "snapshot":
        computed = swhid_of_git_repo(obj)
    else:  # shouldn't happen, due to option validation
        raise click.BadParameter("invalid object type: " + obj_type)

    # note: we return original obj instead of path here, to preserve user-given
    # file name in output
    return (obj, computed)
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option(
    "--dereference/--no-dereference",
    "follow_symlinks",
    default=True,
    help="follow (or not) symlinks for OBJECTS passed as arguments "
    "(default: follow)",
)
@click.option(
    "--filename/--no-filename",
    "show_filename",
    default=True,
    help="show/hide file name (default: show)",
)
@click.option(
    "--type",
    "-t",
    "obj_type",
    default="auto",
    type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
    help="type of object to identify (default: auto)",
)
@click.option(
    "--verify",
    "-v",
    metavar="SWHID",
    type=SWHIDParamType(),
    help="reference identifier to be compared with computed one",
)
@click.argument("objects", nargs=-1, required=True)
def identify(obj_type, verify, show_filename, follow_symlinks, objects):
    """Compute the Software Heritage persistent identifier (SWHID) for the given
    source code object(s).
    For more details about SWHIDs see:
    \b
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    Tip: you can pass "-" to identify the content of standard input.
    \b
    Examples:
    \b
    $ swh identify fork.c kmod.c sched/deadline.c
    swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
    swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
    swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c
    \b
    $ swh identify --no-filename /usr/src/linux/kernel/
    swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
    \b
    $ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
    $ swh identify --type snapshot helloworld.git/
    swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git
    """  # NoQA  # overlong lines in shell examples are fine
    # A reference SWHID can only be compared against exactly one object.
    if verify and len(objects) != 1:
        raise click.BadParameter("verification requires a single object")

    # Lazily identify each object; nothing is computed until it is consumed.
    results = (
        identify_object(obj_type, follow_symlinks, item) for item in objects
    )

    if verify:
        # Verification mode: report the comparison through the exit status
        # (0 on match, 1 on mismatch) and stop.
        _, computed = next(results)
        if verify != computed:
            click.echo("SWHID mismatch: %s != %s" % (verify, computed))
            sys.exit(1)
        click.echo("SWHID match: %s" % computed)
        sys.exit(0)

    # Listing mode: one line per object, optionally followed by its name.
    for obj, computed in results:
        if show_filename:
            line = "%s\t%s" % (computed, os.fsdecode(obj))
        else:
            line = computed
        click.echo(line)
# Run the ``identify`` command when this module is executed as a script.
if __name__ == "__main__":
    identify()