https://github.com/bartoszzielinski/deep-fbanks
Revision 7b91f2919d63064cad53cf733f334d9de5c08bc9 authored by Bartosz Zieliński on 22 January 2018, 15:40:09 UTC, committed by Bartosz Zieliński on 22 January 2018, 15:40:09 UTC
1 parent 7520cd0
Tip revision: 7b91f2919d63064cad53cf733f334d9de5c08bc9 authored by Bartosz Zieliński on 22 January 2018, 15:40:09 UTC
Why should I trust you analysis
Why should I trust you analysis
Tip revision: 7b91f29
get_dcnn_features.m
function [code, codeLoc] = get_dcnn_features(net, im, regions, varargin)
% GET_DCNN_FEATURES Get convolutional features for an image region
% This function extracts the DCNN (CNN+FV) for one or more regions in an image.
% These can be used as SIFT replacement in e.g. a Fisher Vector.
%
% MASK should be an array that has the same spatial dimensions of the
% imge IM, but one or more planes. Each plane specifies one or more
% non-overlapping image regions by associating to each pixel a
% corresponding region ID. IDs should be contiguous intergers starting
% from 1. Any pixel assogined the label 0 does not belong to any region
% in that plane.
%
% MASK is a relatively efficient manner of specifying multiple
% segmentations of the image.
%
% Note that both IM and MASK are cell arrays, allowing to process
% a number of images at the same time.
opts.useSIFT = false ;
opts.crop = true ;
%opts.scales = [0.5 0.75 1] ; %CUB
opts.scales = 2.^(1.5:-.5:-3); % as in CVPR14 submission
opts.encoder = [];
opts.numSpatialSubdivisions = 1 ;
opts.maxNumLocalDescriptorsReturned = +inf ;
opts = vl_argparse(opts, varargin) ;
if (numel(opts.numSpatialSubdivisions) == 1)
opts.numSpatialSubdivisions = opts.numSpatialSubdivisions * [1 1];
end
% Find geometric parameters of the representation. x is set to the
% leftmost pixel of the receptive field of the lefmost feature at
% the last level of the network. This is computed by backtracking
% from the last layer. Then we obtain a map
%
% x(u) = offset + stride * u
%
% from a feature index u to pixel coordinate v of the center of the
% receptive field.
if opts.useSIFT
binSize = 8;
offset = 1 + 3/2 * binSize ;
stride = 4;
border = binSize*2 ;
else
info = vl_simplenn_display(net) ;
%vl_simplenn_display(net) ;
x=1 ;
%
for l=numel(net.layers):-1:1
x=(x-1)*info.stride(2,l)-info.pad(2,l)+1 ;
end
offset = round(x + info.receptiveFieldSize(end)/2 - 0.5);
stride = prod(info.stride(1,:)) ;
border = ceil(info.receptiveFieldSize(end)/2 + 1);
averageColour = mean(mean(net.normalization.averageImage,1),2) ;
end
if ~iscell(im)
im = {im} ;
regions = {regions} ;
end
numNull = 0 ;
numReg = 0 ;
% for each image
for k=1:numel(im)
% crop region
im_w = size(im{k}, 1);
im_h = size(im{k}, 2);
min_ratio = min(im_w / (border + 2), im_h / (border + 2));
if (min_ratio < 1)
im{k} = imresize(im{k}, 1 / min_ratio);
regions{k}.basis = imresize(regions{k}.basis, 1/min_ratio, 'nearest');
end
[im_cropped, regions_cropped] = crop(opts, single(im{k}), regions{k}, border) ;
crop_h = size(im_cropped,1) ;
crop_w = size(im_cropped,2) ;
psi = cell(1, numel(regions_cropped.labels)) ;
loc = cell(1, numel(regions_cropped.labels)) ;
res = [] ;
% for each scale
for s=1:numel(opts.scales)
% 30 was added to prevent crash on 2nd conv layer for VGG-M; crashed in VOC07
% and MIT Indoor for images which had one edge under 100px
if min(crop_h,crop_w) * opts.scales(s) < max(border, 30), continue ; end
%if sqrt(crop_h*crop_w) * opts.scales(s) > 1640, continue ; end
if sqrt(crop_h*crop_w) * opts.scales(s) > 1024, continue ; end
% resize the cropped image and extract features everywhere
im_resized = imresize(im_cropped, opts.scales(s)) ;
if opts.useSIFT
[frames,descrs] = vl_dsift(mean(im_resized,3), ...
'size', binSize, ...
'step', stride, ...
'fast', 'floatdescriptors') ;
ur = unique(frames(1,:)) ;
vr = unique(frames(2,:)) ;
[u,v] = meshgrid(ur,vr) ;
%assert(isequal([u(:)';v(:)'], frames)) ;
else
im_resized = bsxfun(@minus, im_resized, averageColour) ;
if net.useGpu
im_resized = gpuArray(im_resized) ;
end
res = vl_simplenn(net, im_resized, [], res, ...
'conserveMemory', true, 'sync', true) ;
w = size(res(end).x,2) ;
h = size(res(end).x,1) ;
descrs = permute(gather(res(end).x), [3 1 2]) ;
descrs = reshape(descrs, size(descrs,1), []) ;
[u,v] = meshgrid(...
offset + (0:w-1) * stride, ...
offset + (0:h-1) * stride) ;
end
u_ = (u - 1) / opts.scales(s) + 1 ;
v_ = (v - 1) / opts.scales(s) + 1 ;
loc_ = [u_(:)';v_(:)'] ;
% for each region
for r = 1:numel(regions_cropped.labels)
mask_cropped = ismember(regions_cropped.basis, regions_cropped.labels{r}) ;
mask_resized = imresize(mask_cropped, opts.scales(s), 'nearest') ;
mask_features = mask_resized(sub2ind(size(mask_resized), v, u)) ;
psi{r}{s} = descrs(:, mask_features) ;
loc{r}{s} = loc_(:, mask_features) ;
if 0
figure(100) ; clf ;
imagesc(vl_imsc(im_resized)) ; hold on ;
plot(u,v,'g.') ;
plot(u(mask_features),v(mask_features),'ro') ;
axis equal ;
drawnow ;
end
end
end
for r = 1:numel(psi)
code{k}{r} = cat(2, psi{r}{:}) ;
codeLoc{k}{r} = cat(2, zeros(2,0), loc{r}{:}) ;
numReg = numReg + 1 ;
numNull = numNull + isempty(code{k}{r}) ;
end
end
if numNull > 0
fprintf('%s: %d out of %d regions with null DCNN descriptor\n', ...
mfilename, numNull, numReg) ;
end
% at this point code{i}{r} contains all local featrues for region r in
% image i
if isempty(opts.encoder)
% no gmm: return the local descriptors, but not too many!
rng(0) ;
if (~isinf(opts.maxNumLocalDescriptorsReturned))
for k=1:numel(code)
for r = 1:numel(code{k})
code{k}{r} = vl_colsubset(code{k}{r}, ...
opts.maxNumLocalDescriptorsReturned) ;
end
end
end
else
numSelDescr = 250000;
% encoding (supports BoVW, VLAD and FV)
for k=1:numel(code)
for r = 1:numel(code{k})
descrs = opts.encoder.projection * bsxfun(@minus, code{k}{r}, ...
opts.encoder.projectionCenter) ;
if opts.encoder.renormalize
descrs = bsxfun(@times, descrs, 1./max(1e-12, sqrt(sum(descrs.^2)))) ;
end
tmp = {} ;
break_u = get_intervals(codeLoc{k}{r}(1,:), opts.numSpatialSubdivisions(1)) ;
break_v = get_intervals(codeLoc{k}{r}(2,:), opts.numSpatialSubdivisions(2)) ;
for spu = 1:opts.numSpatialSubdivisions(1)
for spv = 1:opts.numSpatialSubdivisions(2)
sel = ...
break_u(spu) <= codeLoc{k}{r}(1,:) & codeLoc{k}{r}(1,:) < break_u(spu+1) & ...
break_v(spv) <= codeLoc{k}{r}(2,:) & codeLoc{k}{r}(2,:) < break_v(spv+1);
z = [];
switch (opts.encoder.encoderType)
case {'fv'}
sel_descrs = descrs(:, sel);
if (size(sel_descrs, 2) > numSelDescr)
sel_descrs = descrs(:, vl_colsubset(1: size(sel_descrs, 2), numSelDescr));
end
z = vl_fisher(sel_descrs, ...
opts.encoder.means, ...
opts.encoder.covariances, ...
opts.encoder.priors, ...
'Improved') ;
case {'bovwsq'}
[words, ~] = vl_kdtreequery(opts.encoder.kdtree, opts.encoder.words, ...
descrs, 'MaxComparisons', 100) ;
z = vl_binsum(zeros(opts.encoder.numWords,1), 1, double(words)) ;
z = sign(z) .* sqrt(abs(z));
z = bsxfun(@times, z, 1./max(1e-12, sqrt(sum(z .^ 2))));
case {'bovw'}
[words, ~] = vl_kdtreequery(opts.encoder.kdtree, opts.encoder.words, ...
descrs, 'MaxComparisons', 100) ;
z = vl_binsum(zeros(opts.encoder.numWords,1), 1, double(words)) ;
z = bsxfun(@times, z, 1./max(1e-12, sqrt(sum(z .^ 2))));
case {'vlad'}
[words, ~] = vl_kdtreequery(opts.encoder.kdtree, opts.encoder.words, ...
descrs, 'MaxComparisons', 15) ;
assign = zeros(opts.encoder.numWords, numel(words), 'single') ;
assign(sub2ind(size(assign), double(words), 1:numel(words))) = 1 ;
z = vl_vlad(descrs, opts.encoder.words, assign, ...
'SquareRoot','NormalizeComponents') ;
case {'llc'}
[words, ~] = vl_kdtreequery(opts.encoder.kdtree, ...
single(opts.encoder.words), ...
single(descrs), 'MaxComparisons', 500, 'NumNeighbors', 5);
z = LLCEncodeHelper(double(opts.encoder.words), ...
double(descrs), double(words), double(1e-4), false);
% z = sign(z) .* sqrt(abs(z));
z = bsxfun(@times, z, 1./max(1e-12, sqrt(sum(z .^ 2))));
end
tmp{end+1} = z;
end
end
% normalization keeps norm = 1
code{k}{r} = cat(1, tmp{:}) / (opts.numSpatialSubdivisions(1) * opts.numSpatialSubdivisions(2)) ;
end
code{k} = cat(2, code{k}{:}) ;
end
if nargout == 1
clear codeLoc;
end
end
% here code{i} is an array of FV descripors for each region, with one
% coulmn per region
function breaks = get_intervals(x,n)
if isempty(x)
breaks = ones(1,n+1) ;
else
x = sort(x(:)') ;
breaks = x(round(linspace(1, numel(x), n+1))) ;
end
breaks(end) = +inf ;
% ------------------------------------------------------------------------
function [imCrop, regionsCrop] = crop(opts, im, regions, border)
% -------------------------------------------------------------------------
box = enclosingBox(regions.basis) ;
% include a border around it (feature support)
w = diff(box([1 3])) + border ;
h = diff(box([2 4])) + border ;
bx = mean(box([1 3])) ;
by = mean(box([2 4])) ;
sbox = round([bx - w/2 ; by - h/2 ; bx + w/2 ; by + h/2]) ;
% clip it
sbox = boxclip(sbox, [size(im,2), size(im,1)]) ;
% crop image and mask
sx = sbox(1):sbox(3) ;
sy = sbox(2):sbox(4) ;
imCrop = im(sy, sx, :) ;
regionsCrop = regions ;
regionsCrop.basis = regions.basis(sy, sx, :) ;
% -------------------------------------------------------------------------
function box = enclosingBox(mask)
% -------------------------------------------------------------------------
[x,y] = meshgrid(1:size(mask,2), 1:size(mask,1)) ;
x = x(any(mask,3)) ;
y = y(any(mask,3)) ;
box = [min(x) ; min(y) ; max(x) ; max(y)] ;
Computing file changes ...