Revision a4e7771b2bda7fb77cab0e25ecf1c04d945915f8 authored by Jenkins for Software Heritage on 05 April 2019, 14:43:27 UTC, committed by Jenkins for Software Heritage on 05 April 2019, 14:43:27 UTC
with Debian dir 7e55ec456893f5a7d8dd65a779c8417ea5afcd0f
2 parents eb66a47 + de805da
Raw File
087.sql
-- SWH DB schema upgrade
-- from_version: 86
-- to_version: 87
-- description: indexer: Add indexer's new content properties table

-- Register schema version 87 in dbversion, stamped with the time returned
-- by now(). The description is the placeholder text carried by this upgrade.
insert into dbversion (
    version,
    release,
    description
)
values (
    87,
    now(),
    'Work In Progress'
);

-- Content properties (mimetype, encoding, etc...)
--
-- One row per raw content: the row is keyed by the content's sha1, which
-- must already exist in content(sha1). The primary key makes id non-null,
-- so no separate "not null" is spelled out on that column.
create table content_mimetype (
  id        sha1,
  mimetype  bytea not null,
  encoding  bytea not null,
  primary key (id),
  foreign key (id) references content (sha1)
);

comment on table content_mimetype
  is 'Metadata associated to a raw content';
comment on column content_mimetype.mimetype
  is 'Raw content Mimetype';
comment on column content_mimetype.encoding
  is 'Raw content encoding';

-- List the ids staged in tmp_bytea that have no row yet in content_mimetype.
--
-- Bulk protocol: 0. swh_mktemp_bytea(), 1. COPY the candidate ids into
-- tmp_bytea, 2. call this function.
--
-- Returns one sha1 per tmp_bytea row whose id matches no content_mimetype.id.
-- NOTE(review): presumably kept in plpgsql because tmp_bytea is a temporary
-- table that may not exist when the function is created -- confirm.
create or replace function swh_mimetype_missing()
    returns setof sha1
    language plpgsql
as $$
begin
    return query
        select cast(candidate.id as sha1)
        from tmp_bytea as candidate
        where not exists (
            select 1
            from content_mimetype as known
            where known.id = candidate.id
        );
end
$$;

comment on function swh_mimetype_missing() is 'Filter missing mimetype';

-- Copy the rows staged in tmp_content_mimetype into content_mimetype.
-- Rows that would conflict with an existing entry are silently skipped.
--
-- Bulk protocol: 0. swh_mktemp(content_mimetype), 1. COPY the new rows into
-- tmp_content_mimetype, 2. call this function.
create or replace function swh_mimetype_add()
    returns void
    language plpgsql
as $$
begin
    insert into content_mimetype (id, mimetype, encoding)
        select
            staged.id,
            staged.mimetype,
            staged.encoding
        from tmp_content_mimetype as staged
        on conflict do nothing;
end
$$;

comment on function swh_mimetype_add() is 'Add new content mimetype';

-- Closed set of language labels; this is the type of content_language.lang.
--
-- Labels are lower-case, listed alphabetically, with the catch-all
-- 'unknown' as the last label.
-- PostgreSQL orders enum values by their position in this list, so labels
-- must not be reordered.
-- NOTE(review): the labels look like dash-separated Pygments lexer names --
-- confirm the origin before adding or renaming entries.
create type languages as enum (
  'abap',
  'abnf',
  'actionscript',
  'actionscript-3',
  'ada',
  'adl',
  'agda',
  'alloy',
  'ambienttalk',
  'antlr',
  'antlr-with-actionscript-target',
  'antlr-with-c#-target',
  'antlr-with-cpp-target',
  'antlr-with-java-target',
  'antlr-with-objectivec-target',
  'antlr-with-perl-target',
  'antlr-with-python-target',
  'antlr-with-ruby-target',
  'apacheconf',
  'apl',
  'applescript',
  'arduino',
  'aspectj',
  'aspx-cs',
  'aspx-vb',
  'asymptote',
  'autohotkey',
  'autoit',
  'awk',
  'base-makefile',
  'bash',
  'bash-session',
  'batchfile',
  'bbcode',
  'bc',
  'befunge',
  'blitzbasic',
  'blitzmax',
  'bnf',
  'boo',
  'boogie',
  'brainfuck',
  'bro',
  'bugs',
  'c',
  'c#',
  'c++',
  'c-objdump',
  'ca65-assembler',
  'cadl',
  'camkes',
  'cbm-basic-v2',
  'ceylon',
  'cfengine3',
  'cfstatement',
  'chaiscript',
  'chapel',
  'cheetah',
  'cirru',
  'clay',
  'clojure',
  'clojurescript',
  'cmake',
  'cobol',
  'cobolfree',
  'coffeescript',
  'coldfusion-cfc',
  'coldfusion-html',
  'common-lisp',
  'component-pascal',
  'coq',
  'cpp-objdump',
  'cpsa',
  'crmsh',
  'croc',
  'cryptol',
  'csound-document',
  'csound-orchestra',
  'csound-score',
  'css',
  'css+django/jinja',
  'css+genshi-text',
  'css+lasso',
  'css+mako',
  'css+mozpreproc',
  'css+myghty',
  'css+php',
  'css+ruby',
  'css+smarty',
  'cuda',
  'cypher',
  'cython',
  'd',
  'd-objdump',
  'darcs-patch',
  'dart',
  'debian-control-file',
  'debian-sourcelist',
  'delphi',
  'dg',
  'diff',
  'django/jinja',
  'docker',
  'dtd',
  'duel',
  'dylan',
  'dylan-session',
  'dylanlid',
  'earl-grey',
  'easytrieve',
  'ebnf',
  'ec',
  'ecl',
  'eiffel',
  'elixir',
  'elixir-iex-session',
  'elm',
  'emacslisp',
  'embedded-ragel',
  'erb',
  'erlang',
  'erlang-erl-session',
  'evoque',
  'ezhil',
  'factor',
  'fancy',
  'fantom',
  'felix',
  'fish',
  'fortran',
  'fortranfixed',
  'foxpro',
  'fsharp',
  'gap',
  'gas',
  'genshi',
  'genshi-text',
  'gettext-catalog',
  'gherkin',
  'glsl',
  'gnuplot',
  'go',
  'golo',
  'gooddata-cl',
  'gosu',
  'gosu-template',
  'groff',
  'groovy',
  'haml',
  'handlebars',
  'haskell',
  'haxe',
  'hexdump',
  'html',
  'html+cheetah',
  'html+django/jinja',
  'html+evoque',
  'html+genshi',
  'html+handlebars',
  'html+lasso',
  'html+mako',
  'html+myghty',
  'html+php',
  'html+smarty',
  'html+twig',
  'html+velocity',
  'http',
  'hxml',
  'hy',
  'hybris',
  'idl',
  'idris',
  'igor',
  'inform-6',
  'inform-6-template',
  'inform-7',
  'ini',
  'io',
  'ioke',
  'irc-logs',
  'isabelle',
  'j',
  'jade',
  'jags',
  'jasmin',
  'java',
  'java-server-page',
  'javascript',
  'javascript+cheetah',
  'javascript+django/jinja',
  'javascript+genshi-text',
  'javascript+lasso',
  'javascript+mako',
  'javascript+mozpreproc',
  'javascript+myghty',
  'javascript+php',
  'javascript+ruby',
  'javascript+smarty',
  'jcl',
  'json',
  'json-ld',
  'julia',
  'julia-console',
  'kal',
  'kconfig',
  'koka',
  'kotlin',
  'lasso',
  'lean',
  'lesscss',
  'lighttpd-configuration-file',
  'limbo',
  'liquid',
  'literate-agda',
  'literate-cryptol',
  'literate-haskell',
  'literate-idris',
  'livescript',
  'llvm',
  'logos',
  'logtalk',
  'lsl',
  'lua',
  'makefile',
  'mako',
  'maql',
  'mask',
  'mason',
  'mathematica',
  'matlab',
  'matlab-session',
  'minid',
  'modelica',
  'modula-2',
  'moinmoin/trac-wiki-markup',
  'monkey',
  'moocode',
  'moonscript',
  'mozhashpreproc',
  'mozpercentpreproc',
  'mql',
  'mscgen',
  'msdos-session',
  'mupad',
  'mxml',
  'myghty',
  'mysql',
  'nasm',
  'nemerle',
  'nesc',
  'newlisp',
  'newspeak',
  'nginx-configuration-file',
  'nimrod',
  'nit',
  'nix',
  'nsis',
  'numpy',
  'objdump',
  'objdump-nasm',
  'objective-c',
  'objective-c++',
  'objective-j',
  'ocaml',
  'octave',
  'odin',
  'ooc',
  'opa',
  'openedge-abl',
  'pacmanconf',
  'pan',
  'parasail',
  'pawn',
  'perl',
  'perl6',
  'php',
  'pig',
  'pike',
  'pkgconfig',
  'pl/pgsql',
  'postgresql-console-(psql)',
  'postgresql-sql-dialect',
  'postscript',
  'povray',
  'powershell',
  'powershell-session',
  'praat',
  'prolog',
  'properties',
  'protocol-buffer',
  'puppet',
  'pypy-log',
  'python',
  'python-3',
  'python-3.0-traceback',
  'python-console-session',
  'python-traceback',
  'qbasic',
  'qml',
  'qvto',
  'racket',
  'ragel',
  'ragel-in-c-host',
  'ragel-in-cpp-host',
  'ragel-in-d-host',
  'ragel-in-java-host',
  'ragel-in-objective-c-host',
  'ragel-in-ruby-host',
  'raw-token-data',
  'rconsole',
  'rd',
  'rebol',
  'red',
  'redcode',
  'reg',
  'resourcebundle',
  'restructuredtext',
  'rexx',
  'rhtml',
  'roboconf-graph',
  'roboconf-instances',
  'robotframework',
  'rpmspec',
  'rql',
  'rsl',
  'ruby',
  'ruby-irb-session',
  'rust',
  's',
  'sass',
  'scala',
  'scalate-server-page',
  'scaml',
  'scheme',
  'scilab',
  'scss',
  'shen',
  'slim',
  'smali',
  'smalltalk',
  'smarty',
  'snobol',
  'sourcepawn',
  'sparql',
  'sql',
  'sqlite3con',
  'squidconf',
  'stan',
  'standard-ml',
  'supercollider',
  'swift',
  'swig',
  'systemverilog',
  'tads-3',
  'tap',
  'tcl',
  'tcsh',
  'tcsh-session',
  'tea',
  'termcap',
  'terminfo',
  'terraform',
  'tex',
  'text-only',
  'thrift',
  'todotxt',
  'trafficscript',
  'treetop',
  'turtle',
  'twig',
  'typescript',
  'urbiscript',
  'vala',
  'vb.net',
  'vctreestatus',
  'velocity',
  'verilog',
  'vgl',
  'vhdl',
  'viml',
  'x10',
  'xml',
  'xml+cheetah',
  'xml+django/jinja',
  'xml+evoque',
  'xml+lasso',
  'xml+mako',
  'xml+myghty',
  'xml+php',
  'xml+ruby',
  'xml+smarty',
  'xml+velocity',
  'xquery',
  'xslt',
  'xtend',
  'xul+mozpreproc',
  'yaml',
  'yaml+jinja',
  'zephir',
  'unknown'
);

-- Language metadata
--
-- One row per raw content: the row is keyed by the content's sha1, which
-- must already exist in content(sha1). The primary key makes id non-null,
-- so no separate "not null" is spelled out on that column. lang is
-- restricted to the labels of the languages enum.
create table content_language (
  id    sha1,
  lang  languages not null,
  primary key (id),
  foreign key (id) references content (sha1)
);

comment on table content_language
  is 'Language information on a raw content';
comment on column content_language.lang
  is 'Language information';

-- List the ids staged in tmp_bytea that have no row yet in content_language.
--
-- Bulk protocol: 0. swh_mktemp_bytea(), 1. COPY the candidate ids into
-- tmp_bytea, 2. call this function.
--
-- Returns one sha1 per tmp_bytea row whose id matches no content_language.id.
-- NOTE(review): presumably kept in plpgsql because tmp_bytea is a temporary
-- table that may not exist when the function is created -- confirm.
create or replace function swh_language_missing()
    returns setof sha1
    language plpgsql
as $$
begin
    return query
        select cast(candidate.id as sha1)
        from tmp_bytea as candidate
        where not exists (
            select 1
            from content_language as known
            where known.id = candidate.id
        );
end
$$;

comment on function swh_language_missing() is 'Filter missing content language';


-- Copy the rows staged in tmp_content_language into content_language.
-- Rows that would conflict with an existing entry are silently skipped.
--
-- Bulk protocol: 0. swh_mktemp(content_language), 1. COPY the new rows into
-- tmp_content_language, 2. call this function.
create or replace function swh_language_add()
    returns void
    language plpgsql
as $$
begin
    insert into content_language (id, lang)
        select
            staged.id,
            staged.lang
        from tmp_content_language as staged
        on conflict do nothing;
end
$$;

comment on function swh_language_add() is 'Add new content language';
back to top