S - ko'p sonli elementlarni ketma-ketlikning oxiriga tez surishni amalga oshirish

Bir qator ushlab turish uchun oddiy tuzum bor:

struct array_of_a_type {
        size_t allocated_size;
        size_t elements; /* 1-index based */
        a_type *array;
};

Men oddiy funktsiyani yozmoqchiman, shunga o'xshash narsa:

bool simple_function(struct array_of_a_type *my_array, int a, int b, int c, int d)
{
    a_type new_chunk[] = {
        a,   b,   a+b, d,   c,
        c,   c,   c+d, b+d, a,
        a+c, b+c, c+d, c+d, c,
    };
    size_t size = sizeof(new_chunk)/sizeof(a_type);
    return push_to_array(my_array, new_chunk, size);
}

My_array - statik, global o'zgaruvchan. Quyida push_to_array ilovasi mavjud.

static bool push_to_array(struct array_of_a_type *a, a_type *new_chunk, size_t size)
{
    const size_t new_size = a->elements + size;
    const size_t old_size = a->elements;
    if (new_size > a->allocated_size) {
        /* The allocated_size is most of the time big enough.
           I’ve stripped this part of code to minimum. */
        a_type *tmp = realloc(a->array, new_size * sizeof(a_type));
        if (!tmp) {
            return true;
        } else {
            a->array = tmp;
            a->allocated_size = new_size;
        }
    }
    a->elements = new_size;
    memcpy(a->array + old_size, new_chunk, size * sizeof(a_type));
    return false;
}

My question:
How can I rewrite ‘simple_function’ to make more compilers generate code that will write directly to the destination? I would like the code to stay quite short and flexible.

Mening kodim ishlaydi. Afsuski, gcc (va eski clang) to'plamdagi vaqtinchalik ma'lumotni yaratadi va keyin uni maqsadga nusxalashtiradi. Yaratilgan x86_64 assemblerining bir qismi bo'lsa, quyida.

movq    8(%rsp), %rdx
movq    %rdx, 8(%rax)
movq    16(%rsp), %rdx
movq    %rdx, 16(%rax)
movq    24(%rsp), %rdx
movq    %rdx, 24(%rax)
movq    32(%rsp), %rdx
movq    %rdx, 32(%rax)

AMD uchun assembler bunga ega:

rep movsq

Yangi gulxan yaxshi ishlaydi. Men -O3 bilan tuzdim.

Bir elementni vaqtni qo'shgan kodni sinab ko'rdim. Afsuski, reallokni chaqirish uchun juda ko'p shartli otlar bor edi.

16
Sizning kodingiz ishlamaydi; realloc va boshqalar.
qo'shib qo'ydi muallif Carl Norum, manba
Sarlavha yangi elementlarni qator oxirigacha itarayotganligini bildiradi, lekin siz butun qatorni qayta yozyapsiz: memcpy (a-> array, ) eski va yangi tarkib uchun xotirani ajratishda: code> const size_t yangi_size = a-> elements + size;
qo'shib qo'ydi muallif 2501, manba
Men ro'yxatni tuzdim.
qo'shib qo'ydi muallif Michas, manba

7 javoblar

Samaradorlik uchun, ketma-ketlikni oshirish uchun mantiqni ajratish va qo'shimcha nusxani (to'plamdan ketma-ketlikka) oldini olish uchun qadriyatlarni (foydalanilmagan) uyalarga belgilash kerak.

Kodni oldindan belgilash uchun sizga yordamchi makronlarni yaratishingiz mumkin. Men "surish" bilan siz "qatorga qo'shish" degan ma'noni anglatadi. Agar siz "prepend" degan ma'noni anglatgan bo'lsangiz, qo'shimcha memmove() kerak.

Keling, sizda bor deb taxmin qilaylik

#include 
#include 

typedef int  array_data_type;

typedef struct {
    size_t           size;
    size_t           used;
    array_data_type *item;
} array_type;

#define ARRAY_INITIALIZER { 0, 0, NULL }

void array_free(array_type *const array)
{
    free(array->item);
    array->size = 0;
    array->used = 0;
    array->item = NULL;
}

void array_init(array_type *const array)
{
    array->size = 0;
    array->used = 0;
    array->item = NULL;
}

void array_init_size(array_type *const array, const size_t size)
{
    if (!size) {
        array->size = 0;
        array->used = 0;
        array->item = NULL;
        return;
    }

    array->item = malloc(size * sizeof array->item[0]);
    if (!array->item) {
        fprintf(stderr, "array_init_size(%p, %zu): Out of memory.\n", (void *)array, size);
        exit(EXIT_FAILURE);
    }
    array->size = size;
    array->used  = 0;
}

void array_grow_to(array_type *const array, size_t size)
{
    array_data_type *temp;

    if (size < 4)
        size = 4;
    else
    if (size < 16777216) {
        size |= size >> 1;
        size |= size >> 2;
        size |= size >> 4;
        size |= size >> 8;
        size |= size >> 16;
        size++;
    } else
        size = (size | 8388607) + 8388609;

    temp = realloc(array->item, size * sizeof array->item[0]);
    if (!temp) {
        fprintf(stderr, "array_grow_to(%p, %zu): Out of memory.\n", (void *)array, size);
        exit(EXIT_FAILURE);
    }

    array->item = temp;
    array->size = size;
}

static inline array_data_type *array_grow_by(array_type *const array, size_t const count)
{
    array_data_type *retval;

    if (array->used + count > array->size)
        array_grow_to(array, array->used + count);

    retval = array->item + array->used;
    array->used += count;
    return retval;
}

Men qatorda elementlar soni uchun ishlatilgan dan foydalanishni yaxshi ko'raman va qator uchun ajratilgan xotiraga ega bo'lgan elementlar soni uchun hajmi . Agar siz boshqa nomlarga ishlatilgan bo'lsangiz, qidiruvni amalga oshiring va o'zgartiring.

array_grow_to() adjusts the new size to at least 4, or the next power of two if less than 16,777,216, or to a larger multiple of 8,388,608. This limits the amount of allocated but unused memory for very large lists.

array_grow_by() ensures the array has room for count new elements, and returns the pointer to the first new unused element.

Agar siz quyidagi C99 preprocessor makroslarini aniqlasangiz,

#define MACRO_CONCATENATE(part1, ...)   part1 ## __VA_ARGS__

#define ARRAY_SET_N(array, count, ...)  MACRO_CONCATENATE(ARRAY_SET_, count)(array, count, __VA_ARGS__)
#define ARRAY_SET_0(...)
#define ARRAY_SET_1(a, n, v)        a[n-1] = v
#define ARRAY_SET_2(a, n, v, ...)   a[n-2] = v; ARRAY_SET_1(a, n, __VA_ARGS__)
#define ARRAY_SET_3(a, n, v, ...)   a[n-3] = v; ARRAY_SET_2(a, n, __VA_ARGS__)
#define ARRAY_SET_4(a, n, v, ...)   a[n-4] = v; ARRAY_SET_3(a, n, __VA_ARGS__)
#define ARRAY_SET_5(a, n, v, ...)   a[n-5] = v; ARRAY_SET_4(a, n, __VA_ARGS__)
#define ARRAY_SET_6(a, n, v, ...)   a[n-6] = v; ARRAY_SET_5(a, n, __VA_ARGS__)
#define ARRAY_SET_7(a, n, v, ...)   a[n-7] = v; ARRAY_SET_6(a, n, __VA_ARGS__)
#define ARRAY_SET_8(a, n, v, ...)   a[n-8] = v; ARRAY_SET_7(a, n, __VA_ARGS__)
#define ARRAY_SET_9(a, n, v, ...)   a[n-9] = v; ARRAY_SET_8(a, n, __VA_ARGS__)
#define ARRAY_SET_10(a, n, v, ...)  a[n-10] = v; ARRAY_SET_9(a, n, __VA_ARGS__)
#define ARRAY_SET_11(a, n, v, ...)  a[n-11] = v; ARRAY_SET_10(a, n, __VA_ARGS__)
#define ARRAY_SET_12(a, n, v, ...)  a[n-12] = v; ARRAY_SET_11(a, n, __VA_ARGS__)
#define ARRAY_SET_13(a, n, v, ...)  a[n-13] = v; ARRAY_SET_12(a, n, __VA_ARGS__)
#define ARRAY_SET_14(a, n, v, ...)  a[n-14] = v; ARRAY_SET_13(a, n, __VA_ARGS__)
#define ARRAY_SET_15(a, n, v, ...)  a[n-15] = v; ARRAY_SET_14(a, n, __VA_ARGS__)
#define ARRAY_SET_16(a, n, v, ...)  a[n-16] = v; ARRAY_SET_15(a, n, __VA_ARGS__)
#define ARRAY_SET_17(a, n, v, ...)  a[n-17] = v; ARRAY_SET_16(a, n, __VA_ARGS__)
#define ARRAY_SET_18(a, n, v, ...)  a[n-18] = v; ARRAY_SET_17(a, n, __VA_ARGS__)
#define ARRAY_SET_19(a, n, v, ...)  a[n-19] = v; ARRAY_SET_18(a, n, __VA_ARGS__)
#define ARRAY_SET_20(a, n, v, ...)  a[n-20] = v; ARRAY_SET_19(a, n, __VA_ARGS__)
#define ARRAY_SET_21(a, n, v, ...)  a[n-21] = v; ARRAY_SET_20(a, n, __VA_ARGS__)
#define ARRAY_SET_22(a, n, v, ...)  a[n-22] = v; ARRAY_SET_21(a, n, __VA_ARGS__)
#define ARRAY_SET_23(a, n, v, ...)  a[n-23] = v; ARRAY_SET_22(a, n, __VA_ARGS__)
#define ARRAY_SET_24(a, n, v, ...)  a[n-24] = v; ARRAY_SET_23(a, n, __VA_ARGS__)
#define ARRAY_SET_25(a, n, v, ...)  a[n-25] = v; ARRAY_SET_24(a, n, __VA_ARGS__)
#define ARRAY_SET_26(a, n, v, ...)  a[n-26] = v; ARRAY_SET_25(a, n, __VA_ARGS__)
#define ARRAY_SET_27(a, n, v, ...)  a[n-27] = v; ARRAY_SET_26(a, n, __VA_ARGS__)
#define ARRAY_SET_28(a, n, v, ...)  a[n-28] = v; ARRAY_SET_27(a, n, __VA_ARGS__)
#define ARRAY_SET_29(a, n, v, ...)  a[n-29] = v; ARRAY_SET_28(a, n, __VA_ARGS__)
#define ARRAY_SET_30(a, n, v, ...)  a[n-30] = v; ARRAY_SET_29(a, n, __VA_ARGS__)
#define ARRAY_SET_31(a, n, v, ...)  a[n-31] = v; ARRAY_SET_30(a, n, __VA_ARGS__)
#define ARRAY_SET_32(a, n, v, ...)  a[n-32] = v; ARRAY_SET_31(a, n, __VA_ARGS__)
#define ARRAY_SET_33(a, n, v, ...)  a[n-33] = v; ARRAY_SET_32(a, n, __VA_ARGS__)
#define ARRAY_SET_34(a, n, v, ...)  a[n-34] = v; ARRAY_SET_33(a, n, __VA_ARGS__)
#define ARRAY_SET_35(a, n, v, ...)  a[n-35] = v; ARRAY_SET_34(a, n, __VA_ARGS__)
#define ARRAY_SET_36(a, n, v, ...)  a[n-36] = v; ARRAY_SET_35(a, n, __VA_ARGS__)
#define ARRAY_SET_37(a, n, v, ...)  a[n-37] = v; ARRAY_SET_36(a, n, __VA_ARGS__)
#define ARRAY_SET_38(a, n, v, ...)  a[n-38] = v; ARRAY_SET_37(a, n, __VA_ARGS__)
#define ARRAY_SET_39(a, n, v, ...)  a[n-39] = v; ARRAY_SET_38(a, n, __VA_ARGS__)
#define ARRAY_SET_40(a, n, v, ...)  a[n-40] = v; ARRAY_SET_39(a, n, __VA_ARGS__)
#define ARRAY_SET_41(a, n, v, ...)  a[n-41] = v; ARRAY_SET_40(a, n, __VA_ARGS__)
#define ARRAY_SET_42(a, n, v, ...)  a[n-42] = v; ARRAY_SET_41(a, n, __VA_ARGS__)
#define ARRAY_SET_43(a, n, v, ...)  a[n-43] = v; ARRAY_SET_42(a, n, __VA_ARGS__)
#define ARRAY_SET_44(a, n, v, ...)  a[n-44] = v; ARRAY_SET_43(a, n, __VA_ARGS__)
#define ARRAY_SET_45(a, n, v, ...)  a[n-45] = v; ARRAY_SET_44(a, n, __VA_ARGS__)
#define ARRAY_SET_46(a, n, v, ...)  a[n-46] = v; ARRAY_SET_45(a, n, __VA_ARGS__)
#define ARRAY_SET_47(a, n, v, ...)  a[n-47] = v; ARRAY_SET_46(a, n, __VA_ARGS__)
#define ARRAY_SET_48(a, n, v, ...)  a[n-48] = v; ARRAY_SET_47(a, n, __VA_ARGS__)
#define ARRAY_SET_49(a, n, v, ...)  a[n-49] = v; ARRAY_SET_48(a, n, __VA_ARGS__)
#define ARRAY_SET_50(a, n, v, ...)  a[n-50] = v; ARRAY_SET_49(a, n, __VA_ARGS__)
#define ARRAY_SET_51(a, n, v, ...)  a[n-51] = v; ARRAY_SET_50(a, n, __VA_ARGS__)
#define ARRAY_SET_52(a, n, v, ...)  a[n-52] = v; ARRAY_SET_51(a, n, __VA_ARGS__)
#define ARRAY_SET_53(a, n, v, ...)  a[n-53] = v; ARRAY_SET_52(a, n, __VA_ARGS__)
#define ARRAY_SET_54(a, n, v, ...)  a[n-54] = v; ARRAY_SET_53(a, n, __VA_ARGS__)
#define ARRAY_SET_55(a, n, v, ...)  a[n-55] = v; ARRAY_SET_54(a, n, __VA_ARGS__)
#define ARRAY_SET_56(a, n, v, ...)  a[n-56] = v; ARRAY_SET_55(a, n, __VA_ARGS__)
#define ARRAY_SET_57(a, n, v, ...)  a[n-57] = v; ARRAY_SET_56(a, n, __VA_ARGS__)
#define ARRAY_SET_58(a, n, v, ...)  a[n-58] = v; ARRAY_SET_57(a, n, __VA_ARGS__)
#define ARRAY_SET_59(a, n, v, ...)  a[n-59] = v; ARRAY_SET_58(a, n, __VA_ARGS__)
#define ARRAY_SET_60(a, n, v, ...)  a[n-60] = v; ARRAY_SET_59(a, n, __VA_ARGS__)
#define ARRAY_SET_61(a, n, v, ...)  a[n-61] = v; ARRAY_SET_60(a, n, __VA_ARGS__)
#define ARRAY_SET_62(a, n, v, ...)  a[n-62] = v; ARRAY_SET_61(a, n, __VA_ARGS__)
#define ARRAY_SET_63(a, n, v, ...)  a[n-63] = v; ARRAY_SET_62(a, n, __VA_ARGS__)
#define ARRAY_SET_64(a, n, v, ...)  a[n-64] = v; ARRAY_SET_63(a, n, __VA_ARGS__)

#define ARRAY_APPEND_N(array, count, ...)                           \
    do {                                                            \
        array_data_type *const _base = array_grow_by(array, count); \
        ARRAY_SET_N(_base, count, __VA_ARGS__);                     \
    } while(0)

Keyin oddiy funktsiyani yozing

void simple_function(array_type *array,
                     const array_data_type a, const array_data_type b,
                     const array_data_type c, const array_data_type d)
{
    ARRAY_APPEND_N(array, 15, a,   b,   a+b, d,   c,
                              c,   c,   c+d, b+d, a,
                              a+c, b+c, c+d, c+d, c);
}

va uni (chertishdan tashqari)

void simple_function(array_type *array,
                     const array_data_type a, const array_data_type b,
                     const array_data_type c, const array_data_type d)
{
    do {
        array_data_type *const _base = array_grow_by(array, 15);
        _base[15 - 15] = a;
        _base[15 - 14] = b;
        _base[15 - 13] = a+b;
        _base[15 - 12] = d;
        _base[15 - 11] = c;
        _base[15 - 10] = c;
        _base[15 -  9] = c;
        _base[15 -  8] = c+d;
        _base[15 -  7] = b+d;
        _base[15 -  6] = a;
        _base[15 -  5] = a+c;
        _base[15 -  4] = b+c;
        _base[15 -  3] = c+d;
        _base[15 -  2] = c+d;
        _base[15 -  1] = c;
    } while(0);
}

Odatda, Intel/AMD64 me'morchiligi (va tegishli manzil rejimni qo'llab-quvvatlaydigan boshqa mimariler) ustida mukammal mashina kodi tashkil etadi. Ba'zi bir boshqa arxitekturada _base ni doimiy qilmaslik va uning o'rniga ( * (_ base ++) = v; ) moslashtirishi kerak.

Ibratli so'rovlar sonini hisoblash uchun PP_NARG() so'lini qo'llayotgan bo'lsangiz, so'lni qo'shishingiz mumkin

#define ARRAY_APPEND(array, ...) ARRAY_APPEND_N(array, PP_NARG(__VA_ARGS__), __VA_ARGS__)

bu holatda sizning vazifangiz soddalashtiriladi

void simple_function(array_type *array,
                     const array_data_type a, const array_data_type b,
                     const array_data_type c, const array_data_type d)
{
    ARRAY_APPEND(array, a,   b,   a+b, d,   c,
                        c,   c,   c+d, b+d, a,
                        a+c, b+c, c+d, c+d, c);
}

Dastlabki protsessor so'l argumentlari soni ba'zi bir kompilyatorlarda 64 ta cheklangan bo'lib, maksimal sonni cheklash uchun bitta so'l 62 ga qo'shilishi mumkin. Siz foydalanadigan kompilyatorga bog'liq holda siz ko'plab argumentlarni qo'llab-quvvatlash uchun makrolarni kengaytira olasiz, lekin boshqa kompilyatorlar bunga bardosh berishi mumkin.

9
qo'shib qo'ydi
Bu aslida x86-64 uchun juda yaxshi kodni tuzadi ( godbolt.org/g/OTWwjF ) . simple_function - bu tezkor yo'lda qo'shimcha tugmachalar/poplar (faqatgina array-> used + 15> array-> size branch uchun inlined kodida kerakli registrlar uchun) , lekin boshqa usulda faqat reallok uchun tez tekshiruv va keyinchalik to'liq matematikalar bilan aralashtirilgan ba'zi bir do'konlarni ko'rish mumkin. 8M incrementsgacha eksponentsional o'sish uchun sizning heuristik xisoblanadi va samarali-yetarli kodni tuzadi. (gcc afsuski, 2-darajali kuchga ega bo'lish uchun nolga teng miqdorni ishlatishning yo'lini topa olmaydi, lekin bu kod tezkor yo'lda emas.)
qo'shib qo'ydi muallif Peter Cordes, manba
@michas: rost emas. Yuqoridagi C kodi 32 va 64 bitli fayllarni o'z ichiga olgan turli xil arxitekturalar bo'yicha to'planishi mumkin. Assembler portativ emas. Hatto bir xil apparat me'morchiligi bo'yicha ABI (ayniqsa, chaqiruv parametrlarini saqlash) farq qiladi. Ko'p turli konventsiyalarni ko'rish uchun x86 chaqiriq qoidalarini ni tekshiring, x86-ning barcha x86 tizimlarida portativ bo'lmaganini o'rnatish.
qo'shib qo'ydi muallif Nominal Animal, manba
@Michas: bu haqda "may" bo'lishi mumkin; C99 protsessorlari bilan ishlaydi, men nazorat qildim. "Men sizning savolingizga faqat yoqimli va pushti deb hisoblaydigan javoblarni ko'rib chiqaman" kabi qo'shimcha talablarni qo'shmaysiz?
qo'shib qo'ydi muallif Nominal Animal, manba
Bu ishlashi mumkin, ammo oqlangan emas.
qo'shib qo'ydi muallif Michas, manba
@NominalAnimal Mening savolimni tuzdim. Kodni topishni istayman, bu xom assemblerdan ko'ra tushunish osonroqdir. Sizning javobingiz uchun minnatdorman. Men batafsilroq gapirishim kerak.
qo'shib qo'ydi muallif Michas, manba
@Tim Savol: S dan yaxshi assembler ishlab chiqarish haqida. Xom assembler kodi osonroq ko'rinadigan bo'lsa, u yuqori tildan foydalanishga yaroqsiz.
qo'shib qo'ydi muallif Michas, manba
@Michas ... bu javobda "xom assembler" yo'q.
qo'shib qo'ydi muallif Tim, manba

Ba'zi kodni qayta tiklash kerak.

Avval push_to_array funktsiyasiga o'xshash yordamchi funktsiyaga muhtojman, lekin bu faqat elementlarga yangi xotira ajratadi:

static inline bool increase_size(struct array_of_a_type *a, size_t size)
{
    const size_t new_size = a->elements + size;
    if (new_size > a->allocated_size) {
        a_type *tmp = realloc(a->array, new_size * sizeof(a_type));
        if (!tmp) {
            return true;
        } else {
            a->array = tmp;
            a->allocated_size = new_size;
        }
    }
    a->elements = new_size;
    return false;
}

Tasodifan, push_to_array funktsiyasini kodni takrorlashni oldini olish uchun o'zgartirish kerak:

static bool push_to_array(struct array_of_a_type *a, a_type *new_chunk, size_t size)
{
    bool failed = increase_size( a , size );
    if( failed )
    {
        return failed;
    }
    memcpy(a->array + ( a->elements - size ), new_chunk, size * sizeof(a_type));
    return false;
}

Simple_function endi vaqtinchalik qatorni ishlatmasdan yozishni osonlashtiradi:

bool simple_function(struct array_of_a_type *my_array, int a, int b, int c, int d)
{
    bool failed = increase_size( my_array , 15 );
    if( failed )
    {
        return failed;
    }

    size_t i = my_array->elements - 15;
    my_array->array[i] = a;
    my_array->array[i+1] = b;
    my_array->array[i+2] = a+b;
    my_array->array[i+3] = d;
    //... write the rest of the assignments
    my_array->array[i+14] = c;

    return false;
}
6
qo'shib qo'ydi
Bu erda men xuddi shunday javobni taklif qilaman. Juda aniq echim. Vaqtinchalik vaqt yaratish o'rniga qurish. @Michas bu metaprogramming emas, javoblarning egasi aytganidek, kodingizni qayta ishlash.
qo'shib qo'ydi muallif Amadeus, manba
@Michas Bu to'g'ri emas. Menda javob yo'q va metaprogrammni talab qilmaydi. Bu nimani anglatadi, oddiy oddiy dasturlash yoki undan ko'ra aniqroq: code refactoring va metaprogramming bilan hech qanday aloqasi yo'q.
qo'shib qo'ydi muallif 2501, manba
Javobingiz uchun tashakkur. Afsuski, ushbu kod metafrogrammani talab qiladi. Buni oldini olishni xohlayman.
qo'shib qo'ydi muallif Michas, manba
Men realloc funktsiyasini chiroyli to'ldirish uchun etarli darajada aqlli deb bilaman. Mening qatorim kattaligi ko'pincha juda ko'pdir. Savolimga ba'zi izohlar qo'shdim.
qo'shib qo'ydi muallif Michas, manba

Sizning kodingiz simple_function ning a_type majmuasini sizning xohishingizcha? Buning sababi, siz uni [] bilan ketma-ketlikda yaratadigan bir qator qilib yaratgansiz. Siz shunday qatorni yaratishingiz kerak:

a_type *ap = malloc( * sizeof(a_type));
atype[0] = a;
...

u holda return ap mumkin.

Shuningdek, siz bir vaqtning o'zida bir qatorga a'zolarni bosishni xohlaysiz, shunda siz statik qatorni saqlashingiz mumkin va keyin buni bajaring:

int i;
for (i = 0; i < ; i++)
    push_to_array(&my_array, new[i]);

va push_to_array funktsiyangizni biroz o'zgartirishi mumkin.

Bu erda birorta suyakka ishlov berishni topish mumkin, shuni ta'kidlash kerakki, o'stiradigan funktsiya qayta taqsimotni amalga oshiradi: https://github.com/minshallj/my_clib/blob/master/stack.c#L24-L31 Buni tartibingiz "sinf" ga moslashtirishi kerak.

Bundan tashqari, my_array dasturida boshqa joylarda yashaydigan globalmi? Hech qanday joyda e'lon qilinmasligini bilmayman.

4
qo'shib qo'ydi
Yig'ma xotirasi vaqtinchalik ma'lumot uchun to'g'ri joy. Muammo tuzilgan kod to'g'ridan-to'g'ri maqsadga yozilmaydi.
qo'shib qo'ydi muallif Michas, manba
Men aqldan ozgaruvchilardan emasman. Bir vaqtning o'zida bir a'zodan yomonroq muammo bor. My_array global, uni ishga tushirish funktsiyasi, uni tozalash funktsiyasi, undan foydalanish uchun bir nechta vazifalar bor ... Men juda ko'p kodni kiritishga harakat qilaman.
qo'shib qo'ydi muallif Michas, manba
Yaxshilab, yığmaning o'rniga yig'indisiga a_type hosil qiluvchi birinchi kod segmentini rioya qiling. Ushbu qatorni ishga tushirish usulini stackda ko'rishingiz mumkin.
qo'shib qo'ydi muallif Jacob Minshall, manba

Siz aslida tmp qatoridan qochishingiz kerak. Bu shoshilinch yordamni chaqiruvchiga kiritilishi mumkin. O'zgaruvchan uzunlikdagi ro'yxatni xotiradan boshqa funktsiya chaqiruvidan o'tkazishning yo'li yo'q. Ushbu xotirani o'zingiz xohlagan joyda yakunlashingiz mumkin, agar qo'ng'iroq qiluvchining ishonchli ravishda ularni joylashtirish uchun zarur bo'lgan barcha mantiqlari mavjud bo'lsa, ya'ni, inlined.

Aslida, amd64 da clang-3.5 bilan, temp qatori butunlay optimallashtirildi va simple_function qatorni oxiridagi so'nggi manzilga bevosita yozadi. Shuning uchun memcpy ga qo'ng'iroq hech qachon bo'lmaydi. Bu gcc 4.9.2 uchun emas.

Bu erda yaxshi ishlamaydi, deb o'ylayman, lekin siz ikkita bosqichli funktsiyaga ega bo'lishingiz mumkin, shunda ham umumiy funktsiyani tekshirib ko'rishingiz mumkin (masalan, realloc realloc), unda to'liq funktsiyani chaqirishingiz mumkin.

bool push_many (struct array_of_a_type * a, size_t size, ...) kabi turli xil funktsiyalarni moslashtiradigan narsalardan qanday xulosa chiqardingiz. Nvm, men buni sinab ko'rdim va na gcc, na clang variadik funktsiyani keltira olmadi. Gcc, push_many.constprop.0: funktsiyasining maxsus versiyasini ishlab chiqdi, ammo u hali ham qo'ng'iroq qiluvchining joylashtiradigan qismini blokirovkalashga qaraganda ancha sekinroq ko'rinar edi.

static bool push_many(struct array_of_a_type *a, size_t size, ...)
{
    va_list ap;
    va_start(ap, size);

    const size_t new_size = a->elements + size;
    if (new_size > a->allocated_size) {
        a_type *tmp = realloc(a->array, new_size * sizeof(a_type));
        if (!tmp) {
            return true;
        } else {
            a->array = tmp;
        a->allocated_size = new_size;
        }
    }

    a_type *p = a->array + a->elements; //points to spot after last used
    va_start(ap, size);
    for (int i = a->elements; i < new_size; i++) {
        p[i] = va_arg(ap, a_type); /* Increments ap to the next argument. */
    }
    va_end(ap);
//    memcpy(a->array, new_chunk, size * sizeof(a_type));
    a->elements = new_size;
    return false;
}

Replikatsiya loopi 14 ta ko'rsatmada, jumladan, 3 cmovcc shartli harakatlarga asoslangan. Har bir iteratsiya bir int nusxasini oladi. ( typedef int a_type; ) Shunday qilib, amd64 uchun gcc 4.9.2 afsuski, variadik funktsiyalarni inlining vaqtida foydasiz. clang-3.5 ham juda yomon kod ishlab chiqaradi.

Yoki yana bir yondashuv chaqiruv funktsiyasiga kiritilgan bosimni olish uchun makrolardan foydalanishdir. GCC yoki C99 variadic makroslari ishlamaydi; siz so'ldagi args ustida yineleyemezsiniz, faqat printf kabi variadik bir vazifaga aktarabilirsiniz. Shunday qilib, siz har bir makro so'rovi uchun reallokni tekshirishga ehtiyoj sezasiz va GCC uni optimallashtirishga to'g'ri keladi. Men gccga kosmik kassetani va son-sanoqsiz operatsiyalarni optimallashtirishni qiyinlashtirishi mumkinligiga shubham bor, chunki yozma ravishda bir martalik vaqt davomida turli xil qo'ng'iroqlarga sabab bo'lishi mumkin. har bir narsani bir-biriga qo'shib olishdan ko'ra farq qiladi. (Va, albatta, bir derleyici uchun ko'rish uchun, albatta, qattiq optimallash, deb o'ylayman.)

Katta miqdorda argsni qabul qiladigan PUSH4 , PUSH8 va PUSH16 makrosini ishlatishingiz mumkin.

Yoki siz kodingizni chigallashtirishingiz mumkin va ALLOC_SPACE_FOR_N_MORE so'l/funktsiyasiga ega bo'lishingiz mumkin. Keyin oddiy NOCHECK_PUSH makrosini etarlicha bo'sh joy deb hisoblagan va hisoblagichni oshirgansiz. (Va, umid qilamanki, gcc faqat bir marta bosish uchun oxirida bitta qo'shimchani amalga oshiradi.)

3
qo'shib qo'ydi
@NominalAnimal: ha, yaxshi nuqta. Men shunga o'xshash usullarni ham ko'rdim. Katta kodli ro'yxatlarni ham qattiq kodlash bilan (masalan, makrolarla to'liq matematik qilish uchun). Arg ro'yxatining uzunligi bo'yicha sobit chegaraga ega bo'lasiz, ammo bu odatda muhim emas.
qo'shib qo'ydi muallif Peter Cordes, manba
"variadic makroslarga qarshi ishlamasligimga qaramasdan, siz so'ldagi args ustida yineleyemeyiz" . Texnik jihatdan to'g'ri (ayniqsa, bitta makroda tufayli) bo'lsa-da, siz cheklangan, ammo o'zgarmaydigan sonli argumentlar (yagona makroga), texnikalar orqali ishlov beradigan makrolar majmuini yaratishingiz mumkin masalan, PP_NARGS() so'lini ishlatish uchun foydalanilganlar kabi yoki bu savolga so'nggi javobimda qo'llanilganlar kabi. Ularni amalga oshirishda "oqlangan emas" bo'lsa-da, ular foydali va ajablanarli darajada kuchli.
qo'shib qo'ydi muallif Nominal Animal, manba
@PeterCordes: Albatta mutlaqo qabul qilindim. Foydalanish hollari biroz g'alati ko'rinadi; O'ylaymanki, bir vaqtning o'zida bir-biriga o'xshash bir narsa (biroq, PP_NARGS() -like funktsiyasi bo'lmagan holda) turli xil qisqa bit satrlarini ikkilamchi oqimga joylashtirish uchun ishlatgan deb o'ylayman. Kodni ko'p hollarda soddalashtirib, (ayniqsa, bir vaqtning o'zida MPEG kabi ba'zi bir specga moslashtirilganda) va osonlik bilan ishlashni osonlashtiradi, va alohida makrolar o'z operatsiyalarini tekshirish uchun alohida dasturda (birlik testida!) Yaxshilab testdan o'tkazilishi mumkin .
qo'shib qo'ydi muallif Nominal Animal, manba
Yangi chiziqlar yaxshi ishlaydi, qadimgi narsa yo'q. Men C derivativlarida bunday o'sishni o'zim kutmoqdaman. Xo'sh, muammoli funksiya statik va kompilyatorlar tomonidan o'rnatiladi.
qo'shib qo'ydi muallif Michas, manba

malloc() a fixed sized array instead of using realloc(). Every time you realloc() it's possible that a copy will happen if realloc() can't grow the existing memory block.

Mumkin bo'lgan yechim malloc() - qattiq o'lchamli qator bo'lib, u to'la bo'lsa, u qatorning kattaligi. Keyinchalik ma'lumotlar yangi ikki qavatli qatorga nusxa ko'chiring. Bu mumkin nusxalar sonini kamaytiradi.

3
qo'shib qo'ydi
realloc sizning ikkinchi qismingiz kabi bajaradi.
qo'shib qo'ydi muallif HuStmpHrrr, manba
Shunday qilib, kichik boshlansin, lekin har safar o'sishni talab qilsangiz, keyingi kuchiga qarab turing. Shunday qilib, eng ko'p 50% sarflaysiz va realloc vaqtini amortizatsiya qilish ma'lumotingizni yangi realloc kerak bo'lmagan ko'plab qo'ng'iroqlarni bloklash.
qo'shib qo'ydi muallif Peter Cordes, manba
Ko'pgina platformalarda, realloc vaqtning ko'p qismini ko'chirib olishning hojati yo'q, xususan, katalog uchun ajratilgan virtual xotira o'z sahifasiga ega bo'lishi uchun etarli. realloc faqatgina qator bo'shligidan tashqari manzillar bo'sh bo'lsa, nusxa olish kerak.
qo'shib qo'ydi muallif Peter Cordes, manba
Derleme vaqtida qatorning oxirgi o'lchami noma'lum. Jildning o'zi ham ko'pincha juda oz bo'lishi mumkin. Bu qadar xotira ajratish uchun pul sarflash kerak.
qo'shib qo'ydi muallif Michas, manba

Sizning push funksiyangizda xato bo'ladi. Har doim qatorning old qismini yozadi.

new_chunk oralig'idan foydalanib, derleyici ishlashi qiyinlashtiradi. Katalogga qo'shimcha joy ajratish va to'g'ridan-to'g'ri qatorga yozish uchun kodni yaxshilash yaxshi [va tezroq].

Agar realloc ni amalga oshirishda "spekulyativ o'sish uzunligi" qo'shilsa, kod tezroq ketadi. Bu realloc qo'ng'iroqlari soniga qarab kesiladi. Ya'ni, realloc deyilganida, allocated_size va/yoki new_size o'sish faktoriga (masalan, 15) ko'paytiriladi, keyingi push_to_array oldindan kutib turish uchun hozirgi ga ajratilgan. Shunday qilib, ikkinchi qo'ng'iroq yana realloc dan qochishi mumkin, chunki u hali ham bo'sh joyga ega.

Bularning barchasini namoyish qiluvchi test dasturini yaratdim. Men to'rtta versiyani ishlab chiqdim, ularning har biri ortib bormoqda.

"Eng yaxshi" versiyasi 2,7 barobar tezroq


Xato:

push_to_array funktsiyasida bu original kod:

   //NOTE/BUG: this is broken -- it always writes the low part of the array
    a->elements = new_size;
    memcpy(a->array,new_chunk,size * sizeof(a_type));

Kodning nima bo'lishi kerak:

   //NOTE/FIX: this writes to the correct part of the array
    memcpy(a->array + a->elements,new_chunk,size * sizeof(a_type));
    a->elements = new_size;

Refactoring:

Men to'rtta qo'shimcha versiya bilan chiqdim:

  1. Faqat [minimal] bugni tuzatadi
  2. Shu singari, new_chunk dan foydalanadi, lekin o'sish parametrlariga ruxsat beradi
  3. o'sish parametrlarini ishlatadi va to'g'ridan-to'g'ri qatorga yozadi (ya'ni new_chunk )
  4. 3 ga o'xshash, lekin barcha kodni kiritadi

Kod:

Ayniqsa, tezlik uchun _fix_push va _fix_array_space funktsiyalari o'rtasidagi farqlarni qayd qiling.

Bundan tashqari new_chunk ni qurishda bajargan ishingizni taqqoslaydigan SIMPLE_INIT so'lini ko'ring.

// pushary.c -- test program

// pushary.h -- push to array

#ifndef _pushary_h_
#define _pushary_h_

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#define bool    int
#define true    1
#define false   0

#ifndef MCHECK
#define MCHECK  0
#endif

#ifndef ABIG
#define ABIG    0
#endif

#ifndef NOMACRO
#define NOMACRO 0
#endif

#ifndef NODATA1
#define NODATA1 0
#endif

#ifndef NODATA2
#define NODATA2 0
#endif

#ifndef NODATA3
#define NODATA3 0
#endif

#ifndef NODATA4
#define NODATA4 0
#endif

#define sysfault(_fmt...) \
    do { \
        printf(_fmt); \
        exit(1); \
    } while (0)

#if MCHECK
#include 
#define MCHECKALL       mcheck_check_all()
#else
#define MCHECKALL       /**/
#endif

#if ABIG
typedef long long a_type;
#else
typedef int a_type;
#endif

// macro for simple_function
// NOTE: different functions could have different macros of this form
#define SIMPLE_INIT(_cmd,_a,_b,_c,_d) \
    _cmd(_a) _cmd(_b) _cmd(_a + _b) _cmd(_d) _cmd(_c) \
    _cmd(_c) _cmd(_c) _cmd(_c + _d) _cmd(_b + _d) _cmd(_a) \
    _cmd(_a + _c) _cmd(_b + _c) _cmd(_c + _d) _cmd(_c + _d) _cmd(_c)

#define _SIZE(_val) \
    + 1

#define _SET(_val) \
    ptr[idx++] = _val;

struct array_of_a_type {
    const char *sym;
    size_t allocated_size;
    size_t grow_size;                  //amount to grow on realloc
    size_t elements;                   //1-index based
    a_type *array;
    double elap;                       //elapsed time
    double rate;                       //rate
};
typedef struct array_of_a_type a_list;

typedef bool (*simple_p)(a_list *ary,int a,int b,int c,int d);

#if 0
#define INLINE  static inline
#else
#define INLINE  __attribute__((__always_inline__)) static inline
#endif

// test control
typedef struct tstctl tstctl_t;
struct tstctl {
    tstctl_t *tst_next;                //tstorder linkage
    const char *tst_tag;               //test name
    simple_p tst_proc;                 //simple function
    double tst_bestrat;                //best ratio
    int tst_bestgrow;                  //best growlen
    double tst_elap;                   //current/best elapsed time
    double tst_rate;                   //current rate
    int tst_trybest;                   //best trial
    a_list tst_lst;                    //array/list
};

// _fix_push -- original push (with bug fix)
INLINE bool
_fix_push(a_list *a,a_type *new_chunk,size_t size)
{
    const size_t new_size = a->elements + size;

    if (new_size > a->allocated_size) {
        a_type *tmp = realloc(a->array,new_size * sizeof(a_type));

        if (!tmp) {
            sysfault("_fix_push: realloc error -- %s\n",strerror(errno));
            return true;
        }
        else {
            a->array = tmp;
            a->allocated_size = new_size;
        }
    }

   //NOTE/FIX: this writes to the correct part of the array
    memcpy(a->array + a->elements,new_chunk,size * sizeof(a_type));
    a->elements = new_size;

    return false;
}

// _fix_array_space -- allocate space in array
// RETURNS: pointer to place to store (or NULL)
INLINE a_type *
_fix_array_space(a_list *a,size_t count)
{
    size_t new_size = a->elements + count;
    size_t newmax;
    a_type *tmp;

    newmax = a->allocated_size;

    if (new_size > newmax) {
       //prevent us from doing realloc on every push
       //NOTE: grow_size is arbitrary -- pick any optimal value
        newmax += new_size;
        newmax += a->grow_size;

        tmp = realloc(a->array,newmax * sizeof(a_type));
        if (tmp == NULL) {
            sysfault("_fix_array_space: realloc error -- %s\n",strerror(errno));
            return tmp;
        }

        a->array = tmp;
        a->allocated_size = newmax;
    }

    tmp = a->array + a->elements;
    a->elements = new_size;

    return tmp;
}

// /home/cae/OBJ/ovrgen/pushary/pushary.proto -- prototypes

// FILE: /home/cae/preserve/ovrbnc/pushary/com.c
// com.c -- common routines

   //fix_array_space -- allocate space in array
   //RETURNS: pointer to place to store (or NULL)
    a_type *
    fix_array_space(a_list *a,size_t count);

   //fix_push -- original push (with bug fix)
    bool
    fix_push(a_list *a,a_type *new_chunk,size_t size);

// FILE: /home/cae/preserve/ovrbnc/pushary/fix1.c
// fix1.c -- push to array
//
// fixes bug in orig

    bool
    fix1_simple(a_list *my_array,int a,int b,int c,int d);

// FILE: /home/cae/preserve/ovrbnc/pushary/fix2.c
// fix2.c -- push to array
//
// uses new_chunk array
// uses push function
// uses grow length

    bool
    fix2_simple(a_list *my_array,int a,int b,int c,int d);

    bool
    fix2_push(a_list *a,a_type *new_chunk,size_t size);

// FILE: /home/cae/preserve/ovrbnc/pushary/fix3.c
// fix3.c -- push to array
//
// uses grow length
// uses non-inline space function

    bool
    fix3_simple(a_list *my_array,int a,int b,int c,int d);

// FILE: /home/cae/preserve/ovrbnc/pushary/fix4.c
// fix4.c -- push to array
//
// uses grow length
// uses inline space function

    bool
    fix4_simple(a_list *my_array,int a,int b,int c,int d);

// FILE: /home/cae/preserve/ovrbnc/pushary/orig.c
// orig.c -- push to array

    bool
    orig_simple(a_list *my_array,int a,int b,int c,int d);

    bool
    orig_push(a_list *a,a_type *new_chunk,size_t size);

// FILE: /home/cae/preserve/ovrbnc/pushary/pushary.c
// pushary.c -- test program

   //main -- main program
    int
    main(int argc,char **argv);

   //usage -- show usage
    void
    usage(void);

   //gendata -- generate data
    void
    gendata(void);

   //defall -- define all tests
    void
    defall(void);

   //defone -- define all tests
    tstctl_t *
    defone(simple_p proc,const char *tag);

   //testall -- test all
    void
    testall(void);

   //testone -- test a function
    void
    testone(tstctl_t *tst);

   //_testone -- test a function
    void
    _testone(tstctl_t *tst,int trycnt,double *elap);

   //ratshow -- show ratio
    void
    ratshow(tstctl_t *tlhs,tstctl_t *trhs,int bestflg);

   //arycmp -- compare arrays
    void
    arycmp(tstctl_t *tlhs,tstctl_t *trhs);

   //arykill -- release an array
    void
    arykill(tstctl_t *tst);

   //tvsecf -- get hi-res time
    double
    tvsecf(void);

#endif

// orig.c -- push to array

bool
orig_simple(a_list *my_array,int a,int b,int c,int d)
{
    a_type new_chunk[] = {
        a, b, a + b, d, c,
        c, c, c + d, b + d, a,
        a + c, b + c, c + d, c + d, c,
    };
    size_t size = sizeof(new_chunk)/sizeof(a_type);

    return orig_push(my_array,new_chunk,size);
}

bool
orig_push(a_list *a,a_type *new_chunk,size_t size)
{
    const size_t new_size = a->elements + size;

    if (new_size > a->allocated_size) {
        a_type *tmp = realloc(a->array,new_size * sizeof(a_type));

        if (!tmp) {
            return true;
        }
        else {
            a->array = tmp;
            a->allocated_size = new_size;
        }
    }

   //NOTE/BUG: this is broken -- it always writes the low part of the array
    a->elements = new_size;
    memcpy(a->array,new_chunk,size * sizeof(a_type));

    return false;
}
// fix1.c -- push to array
//
// fixes bug in orig

bool
fix1_simple(a_list *my_array,int a,int b,int c,int d)
{
    a_type new_chunk[] = {
        a, b, a + b, d, c,
        c, c, c + d, b + d, a,
        a + c, b + c, c + d, c + d, c,
    };
    size_t size = sizeof(new_chunk)/sizeof(a_type);

#if NODATA1 == 0
    return fix_push(my_array,new_chunk,size);
#endif
}
// fix2.c -- push to array
//
// uses new_chunk array
// uses push function
// uses grow length

bool
fix2_simple(a_list *my_array,int a,int b,int c,int d)
{
    a_type new_chunk[] = {
        a, b, a + b, d, c,
        c, c, c + d, b + d, a,
        a + c, b + c, c + d, c + d, c,
    };
    size_t size = sizeof(new_chunk)/sizeof(a_type);

    return fix2_push(my_array,new_chunk,size);
}

bool
fix2_push(a_list *a,a_type *new_chunk,size_t size)
{
    a_type *tmp;

    tmp = fix_array_space(a,size);
    if (tmp == NULL)
        return true;

   //NOTE/FIX: this writes to the correct part of the array
#if NODATA2 == 0
    memcpy(tmp,new_chunk,size * sizeof(a_type));
#endif

    return false;
}
// fix3.c -- push to array
//
// uses grow length
// uses non-inline space function

bool
fix3_simple(a_list *my_array,int a,int b,int c,int d)
{
#if NOMACRO
    size_t count = 15;
#else
    size_t count = SIMPLE_INIT(_SIZE,1,2,3,4);
#endif
    a_type *ptr;

   //use non-inline function
    ptr = fix_array_space(my_array,count);
    if (ptr == NULL)
        return true;

   //NOTE: these optimize to _exactly_ the same code
#if NODATA3 == 0
#if NOMACRO
    ptr[0] = a;
    ptr[1] = b;
    ptr[2] = a + b;
    ptr[3] = d;
    ptr[4] = c;
    ptr[5] = c;
    ptr[6] = c;
    ptr[7] = c + d;
    ptr[8] = b + d;
    ptr[9] = a;
    ptr[10] = a + c;
    ptr[11] = b + c;
    ptr[12] = c + d;
    ptr[13] = c + d;
    ptr[14] = c;
#else
    int idx = 0;
    SIMPLE_INIT(_SET,a,b,c,d)
#endif
#endif

    return false;
}
// fix4.c -- push to array
//
// uses grow length
// uses inline space function

bool
fix4_simple(a_list *my_array,int a,int b,int c,int d)
{
#if NOMACRO
    size_t count = 15;
#else
    size_t count = SIMPLE_INIT(_SIZE,1,2,3,4);
#endif
    a_type *ptr;

   //use inline function
    ptr = _fix_array_space(my_array,count);
    if (ptr == NULL)
        return true;

   //NOTE: these optimize to _exactly_ the same code
#if NODATA4 == 0
#if NOMACRO
    ptr[0] = a;
    ptr[1] = b;
    ptr[2] = a + b;
    ptr[3] = d;
    ptr[4] = c;
    ptr[5] = c;
    ptr[6] = c;
    ptr[7] = c + d;
    ptr[8] = b + d;
    ptr[9] = a;
    ptr[10] = a + c;
    ptr[11] = b + c;
    ptr[12] = c + d;
    ptr[13] = c + d;
    ptr[14] = c;
#else
    int idx = 0;
    SIMPLE_INIT(_SET,a,b,c,d)
#endif
#endif

    return false;
}
// com.c -- common routines

// fix_array_space -- allocate space in array
// RETURNS: pointer to place to store (or NULL)
a_type *
fix_array_space(a_list *a,size_t count)
{

    return _fix_array_space(a,count);
}

// fix_push -- original push (with bug fix)
bool
fix_push(a_list *a,a_type *new_chunk,size_t size)
{

    return _fix_push(a,new_chunk,size);
}

int opt_f;
int opt_m;
int opt_o;
int opt_D;
int opt_M;
int opt_G;
int opt_s;

#define MDFT 1000000

int growlen;
int growmin;
int growmax;
int growbest;

double ratbest;

int datamax;
a_type *testdata;

tstctl_t *orig1;
tstctl_t *fix1;
tstctl_t *fix2;
tstctl_t *fix3;
tstctl_t *fix4;
tstctl_t *orig2;
tstctl_t *orig3;

tstctl_t *tstref;
tstctl_t *tstorder;

// main -- main program
int
main(int argc,char **argv)
{
    char *cp;
    pid_t pid;

    --argc;
    ++argv;

    opt_G = -25;
    opt_f = 1;
    opt_M = MDFT;

    for (;  argc > 0;  --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'f':
            opt_f = ! opt_f;
            break;

        case 'o':
            cp += 2;
            opt_o = (*cp != 0) ? atoi(cp) : 2;
            break;

        case 'D':
            opt_D = ! opt_D;
            break;

        case 'm':
#if MCHECK == 0
            usage();
#endif
            opt_m = ! opt_m;
            break;

        case 'M':
            cp += 2;
            opt_M = atoi(cp);
            break;

        case 'G':
            cp += 2;
            opt_G = (*cp != 0) ? atoi(cp) : 25;
            break;

        case 's':
            cp += 2;
            opt_s = (*cp != 0) ? atoi(cp) : 3;
            break;

        default:
            usage();
            break;
        }
    }

    if (! opt_M)
        opt_M = MDFT;
    printf("M=%d\n",opt_M);
    datamax = opt_M * 4;

    printf("D=%d\n",opt_D);
    gendata();

    if (opt_G < 0) {
        growmin = 0;
        growmax = -opt_G;
    }
    else {
        growmin = opt_G;
        growmax = opt_G;
    }

    growlen = growmin;

    printf("f=%d\n",opt_f);

    if (opt_s <= 0)
        opt_s = 1;
    printf("s=%d\n",opt_s);

    defall();

    for (growlen = growmin;  growlen <= growmax;  ++growlen) {
        if (! opt_f) {
            testall();
            continue;
        }

        fflush(stdout);
        fflush(stderr);

        pid = fork();

        if (pid < 0) {
            perror("fork");
            exit(1);
        }

        if (pid == 0) {
            testall();
            exit(0);
        }

        waitpid(pid,NULL,0);
    }

    return 0;
}

// usage -- show usage
void
usage(void)
{

    printf("  -f -- invert fork mode (DEFAULT: %s)\n",opt_f ? "on" : "off");
    printf("  -D -- use real random test data (DEFAULT: off)\n");
    printf("  -G[grow_length] -- array speculative grow length (DEFAULT: %d)\n",opt_G);
    printf("    <0 -- benchmark mode range\n");
    printf("    >=0 -- single grow length with data compare\n");
    printf("  -M[push_calls] -- number of times to push to array (DEFAULT: %d)\n",
        MDFT);
    printf("  -s -- (DEFAULT: 1)\n");
    printf("  -o -- (DEFAULT: 0)\n");
    printf("    0 -- use fix1\n");
    printf("    1 -- use orig (1st invocation)\n");
    printf("    2 -- use orig (2nd invocation)\n");
    printf("  -m -- force/test mcheck failure%s\n",
        MCHECK ? "" : " (requires rebuild with -DMCHECK=1 and -lmcheck)");

    exit(1);
}

// gendata -- generate data
void
gendata(void)
{
    int *ptr;
    int idx;

    if (opt_D || opt_m) {
        MCHECKALL;
        testdata = malloc(sizeof(a_type) * datamax);

       //force an mcheck exception
        if (opt_m) {
            ptr = testdata;
            ptr -= 10;
            for (idx = 0;  idx < 20;  ++idx)
                ptr[idx] = rand();
        }
        else {
            for (idx = 0;  idx < datamax;  ++idx)
                testdata[idx] = rand();
        }

        MCHECKALL;
    }
}

// defall -- define all tests
void
defall(void)
{

    orig1 = defone(orig_simple,"org1");
    fix1 = defone(fix1_simple,"fix1");
    fix2 = defone(fix2_simple,"fix2");
    fix3 = defone(fix3_simple,"fix3");
    fix4 = defone(fix4_simple,"fix4");
    orig2 = defone(orig_simple,"org2");
    orig3 = defone(orig_simple,"org3");

    switch (opt_o) {
    case 1:
        tstref = orig1;
        break;
    case 2:
        tstref = orig2;
        break;
    default:
        opt_o = 0;
        tstref = fix1;
    }

    printf("reference test is %s\n",tstref->tst_tag);
}

// defone -- define all tests
tstctl_t *
defone(simple_p proc,const char *tag)
{
    tstctl_t *tst;

    tst = calloc(1,sizeof(tstctl_t));
    tst->tst_tag = tag;
    tst->tst_proc = proc;

    tst->tst_bestrat = 0;

    return tst;
}

// testall -- test all
void
testall(void)
{
    tstctl_t *base;
    tstctl_t *trhs;
    tstctl_t *tlhs;

    printf("\n");
    printf("G=%d\n",growlen);

    tstorder = NULL;

   //perform tests
    testone(orig1);
    testone(fix1);
    testone(orig2);
    testone(fix2);
    testone(orig3);
    testone(fix3);
    testone(fix4);

   //show benchmarks
    for (trhs = tstorder;  trhs != NULL;  trhs = trhs->tst_next)
        ratshow(tstref,trhs,1);

#if 0
    do {
        if (opt_o)
            break;

        if (base == fix1)
            break;
        base = fix1;

        ratshow(base,fix2,0);
        ratshow(base,fix3,0);
        ratshow(base,fix4,0);
    } while (0);
#endif

   //compare data
    if (opt_G >= 0) {
        base = fix1;
        for (trhs = tstorder;  trhs != NULL;  trhs = trhs->tst_next)
            arycmp(base,trhs);
    }

   //release all array memory
    for (tlhs = tstorder;  tlhs != NULL;  tlhs = trhs) {
        trhs = tlhs->tst_next;
        arykill(tlhs);
    }
}

// testone -- test a function
void
testone(tstctl_t *tst)
{
    a_list *ary;
    int trycnt;
    double elapv[opt_s];
    tstctl_t *cur;
    tstctl_t *prev;

    tst->tst_elap = 1e20;

    ary = &tst->tst_lst;
    memset(ary,0,sizeof(a_list));

    ary->sym = tst->tst_tag;
    ary->grow_size = growlen;

    for (trycnt = 0;  trycnt < opt_s;  ++trycnt)
        _testone(tst,trycnt,&elapv[trycnt]);

    prev = NULL;
    for (cur = tstorder;  cur != NULL;  cur = cur->tst_next)
        prev = cur;
    if (prev != NULL)
        prev->tst_next = tst;
    else
        tstorder = tst;
}

// _testone -- test a function
void
_testone(tstctl_t *tst,int trycnt,double *elap)
{
    simple_p func;
    double tvbeg;
    double tvdif;
    a_list *ary;

    ary = &tst->tst_lst;
    arykill(tst);

    func = tst->tst_proc;

    MCHECKALL;

    tvbeg = tvsecf();

   //use real test data -- good for comparisons
    if (opt_D) {
        a_type *ptr = testdata;
        a_type *ptre = ptr + datamax;
        for (;  ptr < ptre;  ptr += 4)
            func(ary,ptr[0],ptr[1],ptr[2],ptr[3]);
    }

   //use the same test data -- faster and gives truer benchmark for function
   //being tested
    else {
        for (int loopcnt = datamax;  loopcnt > 0;  loopcnt -= 4)
            func(ary,1,2,3,4);
    }

    tvdif = tvsecf();
    tvdif -= tvbeg;

    MCHECKALL;

    ary->elap = tvdif;
    ary->rate = ary->elements;
    ary->rate /= tvdif;

    if (ary->elap < tst->tst_elap) {
        tst->tst_elap = ary->elap;
        tst->tst_rate = ary->rate;
        tst->tst_trybest = trycnt;
    }

    *elap = tvdif;
}

// ratshow -- show ratio
void
ratshow(tstctl_t *tlhs,tstctl_t *trhs,int bestflg)
{
    double ratio;
    double rhsrate;
    double lhsrate;
    int faster;

    printf("%s %.9f",trhs->tst_tag,trhs->tst_elap);

    lhsrate = tlhs->tst_rate;
    rhsrate = trhs->tst_rate;

    faster = (rhsrate > lhsrate);

    if (faster)
        ratio = rhsrate/lhsrate;
    else
        ratio = lhsrate/rhsrate;

    if (tlhs != trhs)
        printf(" is %.3fx %s",
            ratio,faster ? "faster" : "slower");

    do {
        if (! bestflg)
            break;

        if (! faster)
            ratio = -ratio;

        if (ratio <= trhs->tst_bestrat)
            break;

        trhs->tst_bestrat = ratio;
        trhs->tst_bestgrow = growlen;

        //printf(" BETTER (G=%d)",growlen);
    } while (0);

    printf("\n");
}

// arycmp -- compare arrays
void
arycmp(tstctl_t *tlhs,tstctl_t *trhs)
{
    a_list *alhs = &tlhs->tst_lst;
    a_list *arhs = &trhs->tst_lst;
    a_type lhs;
    a_type rhs;
    int matchflg;

    do {
        if (alhs->array == NULL)
            break;
        if (arhs->array == NULL)
            break;

        if (alhs->elements != arhs->elements) {
            printf("arycmp: count mismatch -- %s=%lu %s=%lu\n",
                alhs->sym,alhs->elements,arhs->sym,arhs->elements);
            break;
        }

        matchflg = 1;
        for (size_t idx = 0;  idx < alhs->elements;  ++idx) {
            lhs = alhs->array[idx];
            rhs = arhs->array[idx];
            if (lhs != rhs) {
                printf("arycmp: data mismatch -- idx=%lu %s=%d %s=%d\n",
                    idx,alhs->sym,lhs,arhs->sym,rhs);
                matchflg = 0;
                break;
            }
        }

        if (matchflg)
            printf("%s: MATCH\n",arhs->sym);
    } while (0);
}

// arykill -- release an array
void
arykill(tstctl_t *tst)
{
    a_list *ary;

    ary = &tst->tst_lst;

    if (ary->array != NULL) {
        MCHECKALL;
        free(ary->array);
        MCHECKALL;
    }

    ary->array = NULL;

    ary->allocated_size = 0;
    ary->elements = 0;
}

// tvsecf -- get hi-res time
double
tvsecf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_REALTIME,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    return sec;
}

Sinovlar:

Har xil usullar uchun benchmark chiqishi. Bu javobga moslashish juda katta, shuning uchun men uni ikkinchi qismga qo'yaman

1
qo'shib qo'ydi
Siz olishingiz mumkin bo'lgan optimallashtirish simple va push ni bir xil .c inline surishtiruv. Qulaylik uchun bitta faylni joylashtirdim, lekin izohlarga asoslangan alohida fayllarni eslang. fix1 uchun bir nechta faylga bo'lingansangiz, uning push kodi com.c - bu ko'rib chiqilgan Inlining o'chirish/boshqarish uchun. Yoki -fno-inline-functions ni va [ gcc ] uchun -fno-inline-small-functions ni ishlating. Siz bir xil kodni olmaysiz. Shunday qilib, push kerak inline bo'lishi kerak. BTW, keyinchalik testdan o'tkazilgandan so'ng realloc shuffling bilan bog'liq "10x anomaliya" dir, shuning uchun u nima.
qo'shib qo'ydi muallif Craig Estey, manba
SIMPLE_INIT va SIMPLE_INIT va boshqalar. NOMACRO [3] kodi>]. (2) surish yoki to'g'ridan-to'g'ri yozish. agar surishtiriladigan nizom allaqachon shakllantirilgan bo'lsa yaxshi bo'lar edi. Siz kosmik/to'g'ridan-to'g'ri yaxshiroq hisoblangan maxsus hisob-kitoblarni qilyapsiz. (3) growlen dan foydalanish. Buni boshqalarga ortogonal sifatida qo'shish mumkin. emas, balki ishonchli ga new_chunk uchun biron-bir optimizerni kutmoqdaman. Agar siz ushbu qatorni tuzishni juda ko'p ishlatmoqchi bo'lsangiz, uni tweaking bunga loyiqdir. Men uchun bir necha bor ko'p yillar mobaynida tweaked.
qo'shib qo'ydi muallif Craig Estey, manba
Xatolarni tuzatgan bo'lishi mumkin, deb o'ylayman, biroq u hali lavozimda edi. Men muhim bir mezonning soddaligi ekanligini o'yladim. Shuning uchun, men so'lni qildim. growlen - bu juda oddiy tezlik. Ushbu holda, derleyici realloc qo'ng'iroqlarini kamaytirishni bilolmaydi. Buning hammasini 7 kun oldin kodladim, ammo "anomaliya" ga osib qo'ydim. "Kosmik" funksiyasi juda std. Men bu kodni ko'p marta bajarganman, realloc qatori impls. Siz "surish" bilan qolishingiz mumkin, lekin siz 2.7x sekinroq, hamma narsa tengdir
qo'shib qo'ydi muallif Craig Estey, manba
Dasturingiz uchun rahmat. Ro'yxatda xatolik yuz berdi. Men buni aniqladim. Men kompilyatorning qattiq ishlashini xohlayman. Yuqori darajada tilni ishlatish uchun sabab bor. Biroq, SIMPLE_INIT juda yaxshi ko'rinadi. Reallok bilan ishlaydigan holat bu juda qiyin. Buni optimallashtirish uchun juda boshqacha savol tug'iladi.
qo'shib qo'ydi muallif Michas, manba
Clang yaxshi ish. Fix2 va fix4 bir xil ko'rinadi.
qo'shib qo'ydi muallif Michas, manba

NOTE: This answer is a continuation of my original answer because of space limitations (i.e. if upvoting, pls use original above: https://stackoverflow.com/a/39562827/5382650)


Sinovlar:

Asl koddagi xato sababli, bu kesh ko'rsatkichini keskinlashtirdi, chunki u sun'iy ravishda kesh ishini yaxshilash uchun kerak bo'ladi, chunki u emas, balki butun majmuani o`tkazishi kerak. Boshqacha aytganda, bu xato aniqlangandan ko'ra yaxshiroq ishlashga o'xshaydi.

Shunday qilib, fix1 dan foydalanish dastlabki ish faoliyatini yaxshilash uchun yaxshi ko'rsatkich bo'ladi va quyida keltirilgan ma'lumotlardan foydalaniladi. Asl [ holda tuzatish] 0.02 hosil qiladi, lekin fix1 0.06 ni beradi, va shuni aniqroq deb hisoblayman.

O'sish ko'rsatkichi "tungi parametridir" va "eng yaxshi" qiymati bilan fix4 versiyasi 2.7x yaxshilanadi. Men ishonamanki, eng ishonchli natijadir.

Biroq, ga qaramasdan, muhim birligi sinovlari, uzoq sinovlar, mcheck (3) ilovasini va boshqalarni qo'llashni tushuntira olmaydigan ma'lumotlardagi anomaliya mavjud. testning bir qismi sifatida algoritm [ bilan xato]. Agar dastlabki test birinchi sinov bo'lsa yoki fix1 dan keyin ishlatilsa, u "shkala" natijalarini beradi.

Ammo, asl nusxasi fix2 , fix3 yoki fix4 dan keyin ishlasa, ba'zan O'ziga nisbatan 10x yomonroq ishlashini ishlab chiqaradi. O'sish qiymati original tomonidan ishlatilmaydigan emas. Ammo, dastlabki xatti-harakat, avvalgi algoritmda ishlatiladigan spekulyativ o'sish omiliga bog'liq.

Ba'zan, "noqonuniy" uyasi orijinali, soxtalashtirilgan/sun'iy ravishda past qiymatga ega (taxminan 0,02). "Haywire" ketganda, u 10x sekin harakat qiladi (taxminan 0,2).

Bu erda "omad loyixasi" va "omad tilaymiz" kabi ko'rinadi. Agar -G parametriga boshqa qiymat berilgan bo'lsa (masalan, 0- 300 o'rtasidagi barcha o'sish ko'rsatkichlarini tekshiradigan -G-300 ), bir nechta ish mavjud.

Ishonchim komilki, noto'g'ri natijalar tegishli emas, lekin men ularni har qanday holatda saqlab qoldim. Bu faqat shovqin bo'lishi mumkin, ya'ni qadriyatlar yaxshi va xotira reallocator ichidagi ba'zi ichki elementlar tufayli o'zgarib turadi. Buning uchun ko'proq ichki bo'sh blokni ajratish/birlashma va boshqalar.

Agar mcheck ni amalga oshirish rejimi mavjud bo'lsa, ushbu tartib har bir narsa uchun toza billingni taqdim etganligi sababli realloc maydonining o'chirilishi sababli AFAICT, emas. sog'liqni saqlash.

M=1000000
D=0
f=1
s=1
reference test is fix1

G=0
org1 0.028413773 is 2.462x faster
fix1 0.069955111
org2 0.035362244 is 1.978x faster
fix2 0.032926321 is 2.125x faster
org3 0.268535376 is 3.839x slower
fix3 0.026652813 is 2.625x faster
fix4 0.025245905 is 2.771x faster

G=1
org1 0.027498960 is 2.517x faster
fix1 0.069201946
org2 0.033916712 is 2.040x faster
fix2 0.031118631 is 2.224x faster
org3 0.264514446 is 3.822x slower
fix3 0.026646614 is 2.597x faster
fix4 0.025324345 is 2.733x faster

G=2
org1 0.026978731 is 2.496x faster
fix1 0.067343950
org2 0.034334421 is 1.961x faster
fix2 0.031268835 is 2.154x faster
org3 0.266630888 is 3.959x slower
fix3 0.026658535 is 2.526x faster
fix4 0.025254488 is 2.667x faster

G=3
org1 0.027746677 is 2.495x faster
fix1 0.069227457
org2 0.033862829 is 2.044x faster
fix2 0.031069279 is 2.228x faster
org3 0.287544250 is 4.154x slower
fix3 0.026713371 is 2.591x faster
fix4 0.025189638 is 2.748x faster

G=4
org1 0.027034283 is 2.527x faster
fix1 0.068307161
org2 0.033991575 is 2.010x faster
fix2 0.031272411 is 2.184x faster
org3 0.311707735 is 4.563x slower
fix3 0.026990414 is 2.531x faster
fix4 0.025078297 is 2.724x faster

G=5
org1 0.027446985 is 2.429x faster
fix1 0.066675663
org2 0.033823967 is 1.971x faster
fix2 0.031498909 is 2.117x faster
org3 0.331423283 is 4.971x slower
fix3 0.026667356 is 2.500x faster
fix4 0.026413918 is 2.524x faster

G=6
org1 0.027255535 is 2.428x faster
fix1 0.066179037
org2 0.033841848 is 1.956x faster
fix2 0.031159401 is 2.124x faster
org3 0.335711241 is 5.073x slower
fix3 0.026690722 is 2.479x faster
fix4 0.025039911 is 2.643x faster

G=7
org1 0.027280807 is 2.440x faster
fix1 0.066556692
org2 0.034326553 is 1.939x faster
fix2 0.031259060 is 2.129x faster
org3 0.331621408 is 4.983x slower
fix3 0.026686430 is 2.494x faster
fix4 0.025387526 is 2.622x faster

G=8
org1 0.027087212 is 2.453x faster
fix1 0.066447973
org2 0.033598185 is 1.978x faster
fix2 0.031176090 is 2.131x faster
org3 0.034165382 is 1.945x faster
fix3 0.026757479 is 2.483x faster
fix4 0.025131702 is 2.644x faster

G=9
org1 0.027328253 is 2.451x faster
fix1 0.066978931
org2 0.034043789 is 1.967x faster
fix2 0.031486034 is 2.127x faster
org3 0.033723354 is 1.986x faster
fix3 0.027368069 is 2.447x faster
fix4 0.025647879 is 2.611x faster

G=10
org1 0.027052402 is 2.458x faster
fix1 0.066498756
org2 0.033848524 is 1.965x faster
fix2 0.031741381 is 2.095x faster
org3 0.033836603 is 1.965x faster
fix3 0.027002096 is 2.463x faster
fix4 0.025351524 is 2.623x faster

G=11
org1 0.027157784 is 2.471x faster
fix1 0.067117691
org2 0.033848047 is 1.983x faster
fix2 0.031594038 is 2.124x faster
org3 0.034133911 is 1.966x faster
fix3 0.027194977 is 2.468x faster
fix4 0.025204659 is 2.663x faster

G=12
org1 0.027328730 is 2.432x faster
fix1 0.066454649
org2 0.033915043 is 1.959x faster
fix2 0.031331778 is 2.121x faster
org3 0.033701420 is 1.972x faster
fix3 0.026796579 is 2.480x faster
fix4 0.025482893 is 2.608x faster

G=13
org1 0.027091503 is 2.520x faster
fix1 0.068269968
org2 0.033600807 is 2.032x faster
fix2 0.031302691 is 2.181x faster
org3 0.034220219 is 1.995x faster
fix3 0.026732683 is 2.554x faster
fix4 0.025168657 is 2.712x faster

G=14
org1 0.027466774 is 2.403x faster
fix1 0.065990925
org2 0.034015417 is 1.940x faster
fix2 0.031306028 is 2.108x faster
org3 0.033681631 is 1.959x faster
fix3 0.026975870 is 2.446x faster
fix4 0.025142908 is 2.625x faster

G=15
org1 0.030098915 is 2.202x faster
fix1 0.066287756
org2 0.033817768 is 1.960x faster
fix2 0.031510592 is 2.104x faster
org3 0.264448166 is 3.989x slower
fix3 0.026585102 is 2.493x faster
fix4 0.025573254 is 2.592x faster

G=16
org1 0.029087305 is 2.289x faster
fix1 0.066566944
org2 0.034010649 is 1.957x faster
fix2 0.032317400 is 2.060x faster
org3 0.269736767 is 4.052x slower
fix3 0.026986122 is 2.467x faster
fix4 0.025726795 is 2.587x faster

G=17
org1 0.027568817 is 2.418x faster
fix1 0.066652775
org2 0.033725500 is 1.976x faster
fix2 0.031077385 is 2.145x faster
org3 0.270752668 is 4.062x slower
fix3 0.028372288 is 2.349x faster
fix4 0.026800632 is 2.487x faster

G=18
org1 0.028200626 is 2.466x faster
fix1 0.069550514
org2 0.035360813 is 1.967x faster
fix2 0.033010244 is 2.107x faster
org3 0.308327198 is 4.433x slower
fix3 0.028569698 is 2.434x faster
fix4 0.028189659 is 2.467x faster

G=19
org1 0.028352022 is 2.457x faster
fix1 0.069663048
org2 0.035186291 is 1.980x faster
fix2 0.033131599 is 2.103x faster
org3 0.302445412 is 4.342x slower
fix3 0.028528690 is 2.442x faster
fix4 0.026380062 is 2.641x faster

G=20
org1 0.028351307 is 2.449x faster
fix1 0.069445372
org2 0.035343409 is 1.965x faster
fix2 0.032827139 is 2.115x faster
org3 0.333808899 is 4.807x slower
fix3 0.028279066 is 2.456x faster
fix4 0.026592016 is 2.612x faster

G=21
org1 0.028333902 is 2.457x faster
fix1 0.069613457
org2 0.035215616 is 1.977x faster
fix2 0.033250570 is 2.094x faster
org3 0.326132298 is 4.685x slower
fix3 0.026517391 is 2.625x faster
fix4 0.025246382 is 2.757x faster

G=22
org1 0.027449369 is 2.421x faster
fix1 0.066462278
org2 0.033666849 is 1.974x faster
fix2 0.031057119 is 2.140x faster
org3 0.332618952 is 5.005x slower
fix3 0.028064966 is 2.368x faster
fix4 0.026383400 is 2.519x faster

G=23
org1 0.028641462 is 2.444x faster
fix1 0.070001602
org2 0.035483837 is 1.973x faster
fix2 0.033087969 is 2.116x faster
org3 0.342431068 is 4.892x slower
fix3 0.028344154 is 2.470x faster
fix4 0.026709557 is 2.621x faster

G=24
org1 0.028158426 is 2.468x faster
fix1 0.069482327
org2 0.035173178 is 1.975x faster
fix2 0.033740997 is 2.059x faster
org3 0.346288681 is 4.984x slower
fix3 0.028279781 is 2.457x faster
fix4 0.027346849 is 2.541x faster

G=25
org1 0.028361082 is 2.469x faster
fix1 0.070035458
org2 0.035205841 is 1.989x faster
fix2 0.032957315 is 2.125x faster
org3 0.035385132 is 1.979x faster
fix3 0.028091431 is 2.493x faster
fix4 0.026364803 is 2.656x faster
0
qo'shib qo'ydi