https://github.com/ssvb/tinymembench
Raw File
Tip revision: a2cf6d7e382e3aea1eb39173174d9fa28cad15f3 authored by Siarhei Siamashka on 14 February 2017, 21:58:50 UTC
Fix use of uninitialized variable 's' (reported by Coverity)
Tip revision: a2cf6d7
asm-opt.c
/*
 * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "asm-opt.h"

#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)

#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)

#if defined(__i386__) || defined(__amd64__)
#define FEATURES_ID "flags"
#elif defined(__arm__)
#define FEATURES_ID "Features"
#elif defined(__mips__)
#define FEATURES_ID "cpu model"
#else
#define FEATURES_ID "?"
#endif

static int check_feature (char *buffer, const char *feature)
{
  char *p;
  if (*feature == 0)
    return 0;
  if (strncmp(buffer, FEATURES_ID, strlen(FEATURES_ID)) != 0)
    return 0;
  buffer += strlen(FEATURES_ID);
  while (isspace(*buffer))
    buffer++;

  /* Check if 'feature' is present in the buffer as a separate word */
  while ((p = strstr(buffer, feature))) {
    if (p > buffer && !isspace(*(p - 1))) {
      buffer++;
      continue;
    }
    p += strlen(feature);
    if (*p != 0 && !isspace(*p)) {
      buffer++;
      continue;
    }
    return 1;
  }
  return 0;
}

static int parse_proc_cpuinfo(int bufsize, const char *feature)
{
  char *buffer = (char *)malloc(bufsize);
  FILE *fd;
  int feature_support = 0;

  if (!buffer)
    return 0;

  fd = fopen("/proc/cpuinfo", "r");
  if (fd) {
    while (fgets(buffer, bufsize, fd)) {
      if (!strchr(buffer, '\n') && !feof(fd)) {
        /* "impossible" happened - insufficient size of the buffer! */
        fclose(fd);
        free(buffer);
        return -1;
      }
      if (check_feature(buffer, feature))
        feature_support = 1;
    }
    fclose(fd);
  }
  free(buffer);
  return feature_support;
}

#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)

int check_cpu_feature(const char *feature)
{
  int bufsize = 1024;
  int result;
  while ((result = parse_proc_cpuinfo(bufsize, feature)) == -1)
  {
    bufsize *= 2;
    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
      return 0;
  }
  return result;
}

#else

int check_cpu_feature(const char *feature)
{
    return 0;
}

#endif

static bench_info empty[] = { { NULL, 0, NULL } };

#if defined(__i386__) || defined(__amd64__)

#include "x86-sse2.h"

static bench_info x86_sse2[] =
{
    { "MOVSB copy", 0, aligned_block_copy_movsb },
    { "MOVSD copy", 0, aligned_block_copy_movsd },
    { "SSE2 copy", 0, aligned_block_copy_sse2 },
    { "SSE2 nontemporal copy", 0, aligned_block_copy_nt_sse2 },
    { "SSE2 copy prefetched (32 bytes step)", 0, aligned_block_copy_pf32_sse2 },
    { "SSE2 copy prefetched (64 bytes step)", 0, aligned_block_copy_pf64_sse2 },
    { "SSE2 nontemporal copy prefetched (32 bytes step)", 0, aligned_block_copy_nt_pf32_sse2 },
    { "SSE2 nontemporal copy prefetched (64 bytes step)", 0, aligned_block_copy_nt_pf64_sse2 },
    { "SSE2 2-pass copy", 1, aligned_block_copy_sse2 },
    { "SSE2 2-pass copy prefetched (32 bytes step)", 1, aligned_block_copy_pf32_sse2 },
    { "SSE2 2-pass copy prefetched (64 bytes step)", 1, aligned_block_copy_pf64_sse2 },
    { "SSE2 2-pass nontemporal copy", 1, aligned_block_copy_nt_sse2 },
    { "SSE2 fill", 0, aligned_block_fill_sse2 },
    { "SSE2 nontemporal fill", 0, aligned_block_fill_nt_sse2 },
    { NULL, 0, NULL }
};

static bench_info x86_sse2_fb[] =
{
    { "MOVSD copy (from framebuffer)", 0, aligned_block_copy_movsd },
    { "MOVSD 2-pass copy (from framebuffer)", 1, aligned_block_copy_movsd },
    { "SSE2 copy (from framebuffer)", 0, aligned_block_copy_sse2 },
    { "SSE2 2-pass copy (from framebuffer)", 1, aligned_block_copy_sse2 },
    { NULL, 0, NULL }
};

static int check_sse2_support(void)
{
#ifdef __amd64__
    return 1; /* We assume that all 64-bit processors have SSE2 support */
#else
    int cpuid_feature_information;
    __asm__ volatile (
        /* According to Intel manual, CPUID instruction is supported
         * if the value of ID bit (bit 21) in EFLAGS can be modified */
        "pushf\n"
        "movl     (%%esp),   %0\n"
        "xorl     $0x200000, (%%esp)\n" /* try to modify ID bit */
        "popf\n"
        "pushf\n"
        "xorl     (%%esp),   %0\n"      /* check if ID bit changed */
        "jz       1f\n"
        "push     %%eax\n"
        "push     %%ebx\n"
        "push     %%ecx\n"
        "mov      $1,        %%eax\n"
        "cpuid\n"
        "pop      %%ecx\n"
        "pop      %%ebx\n"
        "pop      %%eax\n"
        "1:\n"
        "popf\n"
        : "=d" (cpuid_feature_information)
        :
        : "cc");
    return cpuid_feature_information & (1 << 26);
#endif
}

bench_info *get_asm_benchmarks(void)
{
    if (check_sse2_support())
        return x86_sse2;
    else
        return empty;
}

bench_info *get_asm_framebuffer_benchmarks(void)
{
    if (check_sse2_support())
        return x86_sse2_fb;
    else
        return empty;
}

#elif defined(__arm__)

#include "arm-neon.h"

static bench_info arm_neon[] =
{
    { "NEON read", 0, aligned_block_read_neon },
    { "NEON read prefetched (32 bytes step)", 0, aligned_block_read_pf32_neon },
    { "NEON read prefetched (64 bytes step)", 0, aligned_block_read_pf64_neon },
    { "NEON read 2 data streams", 0, aligned_block_read2_neon },
    { "NEON read 2 data streams prefetched (32 bytes step)", 0, aligned_block_read2_pf32_neon },
    { "NEON read 2 data streams prefetched (64 bytes step)", 0, aligned_block_read2_pf64_neon },
    { "NEON copy", 0, aligned_block_copy_neon },
    { "NEON copy prefetched (32 bytes step)", 0, aligned_block_copy_pf32_neon },
    { "NEON copy prefetched (64 bytes step)", 0, aligned_block_copy_pf64_neon },
    { "NEON unrolled copy", 0, aligned_block_copy_unrolled_neon },
    { "NEON unrolled copy prefetched (32 bytes step)", 0, aligned_block_copy_unrolled_pf32_neon },
    { "NEON unrolled copy prefetched (64 bytes step)", 0, aligned_block_copy_unrolled_pf64_neon },
    { "NEON copy backwards", 0, aligned_block_copy_backwards_neon },
    { "NEON copy backwards prefetched (32 bytes step)", 0, aligned_block_copy_backwards_pf32_neon },
    { "NEON copy backwards prefetched (64 bytes step)", 0, aligned_block_copy_backwards_pf64_neon },
    { "NEON 2-pass copy", 1, aligned_block_copy_neon },
    { "NEON 2-pass copy prefetched (32 bytes step)", 1, aligned_block_copy_pf32_neon },
    { "NEON 2-pass copy prefetched (64 bytes step)", 1, aligned_block_copy_pf64_neon },
    { "NEON unrolled 2-pass copy", 1, aligned_block_copy_unrolled_neon },
    { "NEON unrolled 2-pass copy prefetched (32 bytes step)", 1, aligned_block_copy_unrolled_pf32_neon },
    { "NEON unrolled 2-pass copy prefetched (64 bytes step)", 1, aligned_block_copy_unrolled_pf64_neon },
    { "NEON fill", 0, aligned_block_fill_neon },
    { "NEON fill backwards", 0, aligned_block_fill_backwards_neon },
    { "VFP copy", 0, aligned_block_copy_vfp },
    { "VFP 2-pass copy", 1, aligned_block_copy_vfp },
    { "ARM fill (STRD)", 0, aligned_block_fill_strd_armv5te },
    { "ARM fill (STM with 8 registers)", 0, aligned_block_fill_stm8_armv4 },
    { "ARM fill (STM with 4 registers)", 0, aligned_block_fill_stm4_armv4 },
    { "ARM copy prefetched (incr pld)", 0, aligned_block_copy_incr_armv5te },
    { "ARM copy prefetched (wrap pld)", 0, aligned_block_copy_wrap_armv5te },
    { "ARM 2-pass copy prefetched (incr pld)", 1, aligned_block_copy_incr_armv5te },
    { "ARM 2-pass copy prefetched (wrap pld)", 1, aligned_block_copy_wrap_armv5te },
    { NULL, 0, NULL }
};

static bench_info arm_v5te_vfp[] =
{
    { "VFP copy", 0, aligned_block_copy_vfp },
    { "VFP 2-pass copy", 1, aligned_block_copy_vfp },
    { "ARM fill (STRD)", 0, aligned_block_fill_strd_armv5te },
    { "ARM fill (STM with 8 registers)", 0, aligned_block_fill_stm8_armv4 },
    { "ARM fill (STM with 4 registers)", 0, aligned_block_fill_stm4_armv4 },
    { "ARM copy prefetched (incr pld)", 0, aligned_block_copy_incr_armv5te },
    { "ARM copy prefetched (wrap pld)", 0, aligned_block_copy_wrap_armv5te },
    { "ARM 2-pass copy prefetched (incr pld)", 1, aligned_block_copy_incr_armv5te },
    { "ARM 2-pass copy prefetched (wrap pld)", 1, aligned_block_copy_wrap_armv5te },
    { NULL, 0, NULL }
};

static bench_info arm_v5te[] =
{
    { "ARM fill (STRD)", 0, aligned_block_fill_strd_armv5te },
    { "ARM fill (STM with 8 registers)", 0, aligned_block_fill_stm8_armv4 },
    { "ARM fill (STM with 4 registers)", 0, aligned_block_fill_stm4_armv4 },
    { "ARM copy prefetched (incr pld)", 0, aligned_block_copy_incr_armv5te },
    { "ARM copy prefetched (wrap pld)", 0, aligned_block_copy_wrap_armv5te },
    { "ARM 2-pass copy prefetched (incr pld)", 1, aligned_block_copy_incr_armv5te },
    { "ARM 2-pass copy prefetched (wrap pld)", 1, aligned_block_copy_wrap_armv5te },
    { NULL, 0, NULL }
};

static bench_info arm_v4[] =
{
    { "ARM fill (STM with 8 registers)", 0, aligned_block_fill_stm8_armv4 },
    { "ARM fill (STM with 4 registers)", 0, aligned_block_fill_stm4_armv4 },
    { NULL, 0, NULL }
};

bench_info *get_asm_benchmarks(void)
{
    if (check_cpu_feature("neon"))
        return arm_neon;
    else if (check_cpu_feature("edsp") && check_cpu_feature("vfp"))
        return arm_v5te_vfp;
    else if (check_cpu_feature("edsp"))
        return arm_v5te;
    else
        return arm_v4;
}

static bench_info arm_neon_fb[] =
{
    { "NEON read (from framebuffer)", 0, aligned_block_read_neon },
    { "NEON copy (from framebuffer)", 0, aligned_block_copy_neon },
    { "NEON 2-pass copy (from framebuffer)", 1, aligned_block_copy_neon },
    { "NEON unrolled copy (from framebuffer)", 0, aligned_block_copy_unrolled_neon },
    { "NEON 2-pass unrolled copy (from framebuffer)", 1, aligned_block_copy_unrolled_neon },
    { "VFP copy (from framebuffer)", 0, aligned_block_copy_vfp },
    { "VFP 2-pass copy (from framebuffer)", 1, aligned_block_copy_vfp },
    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
    { NULL, 0, NULL }
};

static bench_info arm_v5te_vfp_fb[] =
{
    { "VFP copy (from framebuffer)", 0, aligned_block_copy_vfp },
    { "VFP 2-pass copy (from framebuffer)", 1, aligned_block_copy_vfp },
    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
    { NULL, 0, NULL }
};

static bench_info arm_v5te_fb[] =
{
    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
    { NULL, 0, NULL }
};

bench_info *get_asm_framebuffer_benchmarks(void)
{
    if (check_cpu_feature("neon"))
        return arm_neon_fb;
    else if (check_cpu_feature("edsp") && check_cpu_feature("vfp"))
        return arm_v5te_vfp_fb;
    else if (check_cpu_feature("edsp"))
        return arm_v5te_fb;
    else
        return empty;
}

#elif defined(__aarch64__)

#include "aarch64-asm.h"

static bench_info aarch64_neon[] =
{
    { "NEON LDP/STP copy", 0, aligned_block_copy_ldpstp_q_aarch64 },
    { "NEON LDP/STP copy pldl2strm (32 bytes step)", 0, aligned_block_copy_ldpstp_q_pf32_l2strm_aarch64 },
    { "NEON LDP/STP copy pldl2strm (64 bytes step)", 0, aligned_block_copy_ldpstp_q_pf64_l2strm_aarch64 },
    { "NEON LDP/STP copy pldl1keep (32 bytes step)", 0, aligned_block_copy_ldpstp_q_pf32_l1keep_aarch64 },
    { "NEON LDP/STP copy pldl1keep (64 bytes step)", 0, aligned_block_copy_ldpstp_q_pf64_l1keep_aarch64 },
    { "NEON LD1/ST1 copy", 0, aligned_block_copy_ld1st1_aarch64 },
    { "NEON STP fill", 0, aligned_block_fill_stp_q_aarch64 },
    { "NEON STNP fill", 0, aligned_block_fill_stnp_q_aarch64 },
    { "ARM LDP/STP copy", 0, aligned_block_copy_ldpstp_x_aarch64 },
    { "ARM STP fill", 0, aligned_block_fill_stp_x_aarch64 },
    { "ARM STNP fill", 0, aligned_block_fill_stnp_x_aarch64 },
    { NULL, 0, NULL }
};

static bench_info aarch64_neon_fb[] =
{
    { "NEON LDP/STP copy (from framebuffer)", 0, aligned_block_copy_ldpstp_q_aarch64 },
    { "NEON LDP/STP 2-pass copy (from framebuffer)", 1, aligned_block_copy_ldpstp_q_aarch64 },
    { "NEON LD1/ST1 copy (from framebuffer)", 0, aligned_block_copy_ld1st1_aarch64 },
    { "NEON LD1/ST1 2-pass copy (from framebuffer)", 1, aligned_block_copy_ld1st1_aarch64 },
    { "ARM LDP/STP copy (from framebuffer)", 0, aligned_block_copy_ldpstp_x_aarch64 },
    { "ARM LDP/STP 2-pass copy (from framebuffer)", 1, aligned_block_copy_ldpstp_x_aarch64 },
    { NULL, 0, NULL }
};

bench_info *get_asm_benchmarks(void)
{
    return aarch64_neon;
}

bench_info *get_asm_framebuffer_benchmarks(void)
{
    return aarch64_neon_fb;
}

#elif defined(__mips__) && defined(_ABIO32)

#include "mips-32.h"

static bench_info mips_32[] =
{
    { "MIPS32 copy prefetched (32 bytes step)", 0, aligned_block_copy_pf32_mips32 },
    { "MIPS32 2-pass copy prefetched (32 bytes step)", 1, aligned_block_copy_pf32_mips32 },
    { "MIPS32 fill prefetched (32 bytes step)", 0, aligned_block_fill_pf32_mips32 },
    { NULL, 0, NULL }
};

bench_info *get_asm_benchmarks(void)
{
    /* Enable only for MIPS32 processors which have 32 bytes cache line */
    if (check_cpu_feature("MIPS 24K") ||
        check_cpu_feature("MIPS 24Kc") ||
        check_cpu_feature("MIPS 24Kf") ||
        check_cpu_feature("MIPS 74K") ||
        check_cpu_feature("MIPS 74Kc") ||
        check_cpu_feature("MIPS 74Kf") ||
        check_cpu_feature("MIPS 1004K") ||
        check_cpu_feature("MIPS 1004Kc") ||
        check_cpu_feature("MIPS 1004Kf") ||
        check_cpu_feature("MIPS 1074K") ||
        check_cpu_feature("MIPS 1074Kc") ||
        check_cpu_feature("MIPS 1074Kf"))
    {
        return mips_32;
    }
    else
    {
        return empty;
    }
}

bench_info *get_asm_framebuffer_benchmarks(void)
{
    return empty;
}

#else

bench_info *get_asm_benchmarks(void)
{
    return empty;
}

bench_info *get_asm_framebuffer_benchmarks(void)
{
    return empty;
}

#endif
back to top