Revision e7cb0b4455c85b53aeba40f88ffddcf6d4002498 authored by Johannes Schindelin on 11 May 2018, 14:03:54 UTC, committed by Jeff King on 22 May 2018, 03:50:11 UTC
When we started to catch NTFS short names that clash with .git, we only
looked for GIT~1. This is sufficient because we only ever clone into an
empty directory, so .git is guaranteed to be the first subdirectory or
file in that directory.

However, even with a fresh clone, .gitmodules is *not* necessarily the
first file to be written that would want the NTFS short name GITMOD~1: a
malicious repository can add .gitmodul0000 and friends, which sorts
before `.gitmodules` and is therefore checked out *first*. For that
reason, we have to test not only for ~1 short names, but for others,
too.

It's hard to just adapt the existing checks in is_ntfs_dotgit(): since
Windows 2000 (i.e., in all Windows versions still supported by Git),
NTFS short names are only generated in the <prefix>~<number> form up to
number 4. After that, a *different* prefix is used, calculated from the
long file name using an undocumented, but stable algorithm.

For example, the short name of .gitmodules would be GITMOD~1, but if it
is taken, and all of ~2, ~3 and ~4 are taken, too, the short name
GI7EBA~1 will be used. From there, collisions are handled by
incrementing the number, shortening the prefix as needed (until ~9999999
is reached, in which case NTFS will not allow the file to be created).

We'd also want to handle .gitignore and .gitattributes, which suffer
from a similar problem, using the fall-back short names GI250A~1 and
GI7D29~1, respectively.

To accommodate for that, we could reimplement the hashing algorithm, but
it is just safer and simpler to provide the known prefixes. This
algorithm has been reverse-engineered and described at
https://usn.pw/blog/gen/2015/06/09/filenames/, which is defunct but
still available via https://web.archive.org/.

These can be recomputed by running the following Perl script:

-- snip --
use warnings;
use strict;

sub compute_short_name_hash ($) {
        my $checksum = 0;
        foreach (split('', $_[0])) {
                $checksum = ($checksum * 0x25 + ord($_)) & 0xffff;
        }

        $checksum = ($checksum * 314159269) & 0xffffffff;
        $checksum = 1 + (~$checksum & 0x7fffffff) if ($checksum & 0x80000000);
        $checksum -= (($checksum * 1152921497) >> 60) * 1000000007;

        return scalar reverse sprintf("%x", $checksum & 0xffff);
}

print compute_short_name_hash($ARGV[0]);
-- snap --

E.g., running that with the argument ".gitignore" will
result in "250a" (which then becomes "gi250a" in the code).

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Jeff King <peff@peff.net>
1 parent 0fc333b
Raw File
base85.c
#include "cache.h"

#undef DEBUG_85

#ifdef DEBUG_85
#define say(a) fprintf(stderr, a)
#define say1(a,b) fprintf(stderr, a, b)
#define say2(a,b,c) fprintf(stderr, a, b, c)
#else
#define say(a) do { /* nothing */ } while (0)
#define say1(a,b) do { /* nothing */ } while (0)
#define say2(a,b,c) do { /* nothing */ } while (0)
#endif

static const char en85[] = {
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
	'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
	'U', 'V', 'W', 'X', 'Y', 'Z',
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
	'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
	'u', 'v', 'w', 'x', 'y', 'z',
	'!', '#', '$', '%', '&', '(', ')', '*', '+', '-',
	';', '<', '=', '>', '?', '@', '^', '_',	'`', '{',
	'|', '}', '~'
};

static char de85[256];
static void prep_base85(void)
{
	int i;
	if (de85['Z'])
		return;
	for (i = 0; i < ARRAY_SIZE(en85); i++) {
		int ch = en85[i];
		de85[ch] = i + 1;
	}
}

int decode_85(char *dst, const char *buffer, int len)
{
	prep_base85();

	say2("decode 85 <%.*s>", len / 4 * 5, buffer);
	while (len) {
		unsigned acc = 0;
		int de, cnt = 4;
		unsigned char ch;
		do {
			ch = *buffer++;
			de = de85[ch];
			if (--de < 0)
				return error("invalid base85 alphabet %c", ch);
			acc = acc * 85 + de;
		} while (--cnt);
		ch = *buffer++;
		de = de85[ch];
		if (--de < 0)
			return error("invalid base85 alphabet %c", ch);
		/* Detect overflow. */
		if (0xffffffff / 85 < acc ||
		    0xffffffff - de < (acc *= 85))
			return error("invalid base85 sequence %.5s", buffer-5);
		acc += de;
		say1(" %08x", acc);

		cnt = (len < 4) ? len : 4;
		len -= cnt;
		do {
			acc = (acc << 8) | (acc >> 24);
			*dst++ = acc;
		} while (--cnt);
	}
	say("\n");

	return 0;
}

void encode_85(char *buf, const unsigned char *data, int bytes)
{
	say("encode 85");
	while (bytes) {
		unsigned acc = 0;
		int cnt;
		for (cnt = 24; cnt >= 0; cnt -= 8) {
			unsigned ch = *data++;
			acc |= ch << cnt;
			if (--bytes == 0)
				break;
		}
		say1(" %08x", acc);
		for (cnt = 4; cnt >= 0; cnt--) {
			int val = acc % 85;
			acc /= 85;
			buf[cnt] = en85[val];
		}
		buf += 5;
	}
	say("\n");

	*buf = 0;
}

#ifdef DEBUG_85
int main(int ac, char **av)
{
	char buf[1024];

	if (!strcmp(av[1], "-e")) {
		int len = strlen(av[2]);
		encode_85(buf, av[2], len);
		if (len <= 26) len = len + 'A' - 1;
		else len = len + 'a' - 26 - 1;
		printf("encoded: %c%s\n", len, buf);
		return 0;
	}
	if (!strcmp(av[1], "-d")) {
		int len = *av[2];
		if ('A' <= len && len <= 'Z') len = len - 'A' + 1;
		else len = len - 'a' + 26 + 1;
		decode_85(buf, av[2]+1, len);
		printf("decoded: %.*s\n", len, buf);
		return 0;
	}
	if (!strcmp(av[1], "-t")) {
		char t[4] = { -1,-1,-1,-1 };
		encode_85(buf, t, 4);
		printf("encoded: D%s\n", buf);
		return 0;
	}
}
#endif
back to top