Scene Text Detection using CNN

References:

Minghui Liao, Baoguang Shi, Xiang Bai, Xinggang Wang, Wenyu Liu. "TextBoxes: A Fast Text Detector with a Single Deep Neural Network", AAAI 2017. https://arxiv.org/abs/1611.06779

Sources:

Load model

% Build the text detector through the local TextBoxes() helper defined in
% this file, which wraps cv.TextDetectorCNN with the pretrained Caffe model
textSpotter = TextBoxes();

Detect text

% Resolve the sample image on the MATLAB path; which() returns an empty
% result when the file cannot be found
imgPath = which('handicapSign.jpg');
assert(~isempty(imgPath), 'specify input image');

% Load as a color image, leaving the channel order untouched
% ('FlipChannels' is false); the channels are flipped later for display
img = cv.imread(imgPath, 'Color',true, 'FlipChannels',false);

% Run the detector: one bounding box and one confidence per detection
[bboxes, confs] = textSpotter.detect(img);
numDetections = numel(bboxes);
fprintf('%d text detections\n', numDetections);

62 text detections

Post-processing

% Reduce the raw detections with one of two strategies; set the flag to
% false to use plain confidence thresholding instead of NMS
useNMS = true;
if ~useNMS
    % confidence thresholding: logical mask of detections scoring above 0.6
    keep = confs > 0.6;
else
    % non-maximum suppression (score threshold 0.3, overlap threshold 0.4)
    keep = cv.Net.NMSBoxes(bboxes, confs, 0.3, 0.4);
    keep = keep + 1;  % zero to one-based indices
end

% Apply the same selection to the boxes and their confidences
bboxes = bboxes(keep);
confs = confs(keep);
fprintf('%d text detections after filtering\n', numel(bboxes));

6 text detections after filtering

Show results

% Reverse the channel order for display (the image was read with
% 'FlipChannels' set to false)
img = flip(img, 3);

% Draw every remaining detection with its confidence as the label, and
% print the same information to the console
for k = 1:numel(bboxes)
    box = bboxes{k};
    score = confs(k);
    label = sprintf('%.2f', score);
    img = insertAnnotation(img, box, label, ...
        'Color',[255 0 255], 'TextColor',[255 255 255], ...
        'Thickness',2, 'FontScale',0.3);
    fprintf('text box: [%3d %3d %3d %3d], confidence: %3.0f%%\n', ...
        box, score*100);
end

imshow(img);
title('TextBox Demo');

Helper functions

function img = insertAnnotation(img, rect, str, varargin)
    %INSERTANNOTATION  Draw a labelled shape on an image, alpha-blended
    %
    %     img = insertAnnotation(img, rect, str)
    %     img = insertAnnotation(..., 'OptionName',optionValue, ...)
    %
    % ## Input
    % * __img__ image to annotate.
    % * __rect__ region to mark, as `[x,y,w,h]`.
    % * __str__ label text drawn on a filled background next to the
    %   top-left corner of `rect`.
    %
    % ## Output
    % * __img__ blend of the input image and the annotated overlay.
    %
    % ## Options
    % * __Alpha__ weight of the annotated overlay in the blend (the input
    %   image gets `1-Alpha`). default 0.6
    % * __Thickness__ outline thickness of the shape. default 1
    % * __Color__ color of the shape and of the label background.
    %   default [255 255 0]
    % * __TextColor__ color of the label text. default [0 0 0]
    % * __FontFace__ label font. default 'HersheySimplex'
    % * __FontScale__ label font scale. default 0.4
    % * __AntiAlias__ draw with anti-aliased lines. default true
    % * __Shape__ 'rectangle' or 'circle'. default 'rectangle'
    %
    % The label is placed directly above `rect`, left-aligned with it. If
    % it would extend past the top, left or right image border it is
    % shifted just enough to stay inside the image.
    %
    % See also: insertObjectAnnotation, insertShape, insertText

    p = inputParser();
    p.addParameter('Alpha', 0.6);
    p.addParameter('Thickness', 1);
    p.addParameter('Color', [255 255 0]);
    p.addParameter('TextColor', [0 0 0]);
    p.addParameter('FontFace', 'HersheySimplex');
    p.addParameter('FontScale', 0.4);
    p.addParameter('AntiAlias', true);
    p.addParameter('Shape', 'rectangle');
    p.parse(varargin{:});
    opts = p.Results;
    opts.Shape = validatestring(opts.Shape, {'rectangle','circle'});
    thick = 1;  % the label text is always drawn with thickness 1

    % size of the rendered label (sz = [width height]) and its baseline
    [sz,b] = cv.getTextSize(str, 'Thickness',thick, ...
        'FontFace',opts.FontFace, 'FontScale',opts.FontScale);

    % Top-left corner of the label background. The default position is
    % directly above rect; clamp it so that boxes touching the top, left or
    % right border still get a visible label. When the label already fits,
    % x0/y0 equal rect(1) and rect(2)-sz(2)-b, i.e. the unclamped placement.
    % (assumes zero-based pixel coordinates, as used by the cv.* drawing
    % functions)
    x0 = min(max(rect(1), 0), max(size(img,2) - sz(1), 0));
    y0 = max(rect(2) - sz(2) - b, 0);
    txt_rect = [x0, y0, sz(1), sz(2)+b];
    txt_orig = [x0, y0 + sz(2)];  % bottom-left corner of the text

    if opts.AntiAlias
        alias = {'LineType','AA'};
    else
        alias = {'LineType',8};
    end

    % draw everything on a copy, so it can be blended with the original
    overlay = img;
    if strcmp(opts.Shape, 'rectangle')
        overlay = cv.rectangle(overlay, rect, ...
            'Color',opts.Color, 'Thickness',opts.Thickness, alias{:});
    else
        % circle centered on rect, with radius half of its longer side
        c = rect(1:2) + rect(3:4)/2;
        r = max(rect(3:4)/2);
        overlay = cv.circle(overlay, c, r, ...
            'Color',opts.Color, 'Thickness',opts.Thickness, alias{:});
    end

    % filled label background, outlined as well when the shape outline is
    % thicker than one pixel so both have matching borders
    overlay = cv.rectangle(overlay, txt_rect, ...
        'Color',opts.Color, 'Thickness','Filled', alias{:});
    if opts.Thickness > 1
        overlay = cv.rectangle(overlay, txt_rect, ...
            'Color',opts.Color, 'Thickness',opts.Thickness, alias{:});
    end
    overlay = cv.putText(overlay, str, txt_orig, ...
        'FontFace',opts.FontFace, 'FontScale',opts.FontScale, ...
        'Color',opts.TextColor, 'Thickness',thick, alias{:});

    % blend: (1-Alpha)*img + Alpha*overlay
    img = cv.addWeighted(img,1-opts.Alpha, overlay,opts.Alpha, 0);
end

text box: [403 260 272  44], confidence: 100%
text box: [402 434 303  43], confidence:  99%
text box: [451 390 187  40], confidence:  99%
text box: [385 354 166  38], confidence:  79%
text box: [467 618 181  37], confidence:  79%
text box: [466 309 162  39], confidence:  48%
Warning: Image is too big to fit on screen; displaying at 67%

Pretrained models

function dname = get_dnn_dir(dname)
    %GET_DNN_DIR  Path to model files, and show where to get them if missing
    %
    %     dname = get_dnn_dir(dname)
    %
    % ## Input
    % * __dname__ name of the model folder, relative to `test/dnn` under the
    %   mexopencv root.
    %
    % ## Output
    % * __dname__ full path of the model folder.
    %
    % If the folder does not exist, the help text of the calling function
    % is displayed and an error is thrown.

    dname = fullfile(mexopencv.root(), 'test', 'dnn', dname);
    found = isdir(dname);
    if found
        return;
    end

    % Model folder is missing: display help of calling function
    % (assumed to be a local function in current file).
    % dbstack(1) omits this function's own frame, so the first entry is the
    % caller; this lookup must stay inline to keep that stack depth.
    callStack = dbstack(1);
    callerName = callStack(1).name;
    help([mfilename() filemarker() callerName])

    assert(found, 'Missing model: %s', dname);
end

function net = TextBoxes()
    %TEXTBOXES  TextBoxes text detector model [Caffe]
    %
    % homepage = https://github.com/MhLiao/TextBoxes
    %
    % ## Model
    %
    % file = test/dnn/TextBoxes/textbox.prototxt
    % url  = https://github.com/opencv/opencv_contrib/raw/3.4.1/modules/text/samples/textbox.prototxt
    % hash = c294416fe6d156b9383342d62b9158ab707170c0
    %
    % ## Weights
    %
    % file = test/dnn/TextBoxes/TextBoxes_icdar13.caffemodel
    % url  = https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0
    % hash = e4b32aef3db3d66fec0630c96006e10999f785c4
    % size = 90.68 MB
    %

    % NOTE: the help block above is what get_dnn_dir displays when the model
    % folder is missing, so it must keep listing where to get the files.

    % locate the model folder (errors out if it has not been downloaded)
    modelDir = get_dnn_dir('TextBoxes');
    protoFile = fullfile(modelDir, 'textbox.prototxt');
    weightsFile = fullfile(modelDir, 'TextBoxes_icdar13.caffemodel');

    % create the detector from the network definition and trained weights
    net = cv.TextDetectorCNN(protoFile, weightsFile, ...
        'DetectionSizes',[300 300]);
end