Skip to content

Commit

Permalink
Merge pull request #181 from ashvardanian/main-dev
Browse files Browse the repository at this point in the history
MSVC Builds & Python Argument Parsing
  • Loading branch information
ashvardanian authored Oct 17, 2024
2 parents cc88ada + 4e33434 commit 0b2cf44
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 190 deletions.
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,16 @@ if(${STRINGZILLA_BUILD_SHARED})
"SZ_USE_ARM_NEON=1"
"SZ_USE_ARM_SVE=1")
endif()

if (MSVC)
# Explicitly link the runtime libraries that may be missing in case of static linking,
# so that the basic runtime functions are always available:
# msvcrt.lib: Microsoft Visual C Runtime, required for basic C runtime functions on Windows.
# vcruntime.lib: Microsoft Visual C++ Runtime library for basic runtime functions.
# ucrt.lib: Universal C Runtime, necessary for linking basic C functions like I/O.
target_link_libraries(${target} PRIVATE msvcrt.lib vcruntime.lib ucrt.lib)
endif()

endfunction()

define_shared(stringzilla_shared)
Expand All @@ -344,4 +354,6 @@ if(${STRINGZILLA_BUILD_SHARED})
"$<$<CXX_COMPILER_ID:MSVC>:/Oi-;/GS->")
target_link_options(stringzillite PRIVATE "$<$<CXX_COMPILER_ID:GNU,Clang>:-nostdlib>")
target_link_options(stringzillite PRIVATE "$<$<CXX_COMPILER_ID:MSVC>:/NODEFAULTLIB>")


endif()
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ __Who is this for?__
<span style="color:#ABABAB;">arm:</span> <b>9.4</b> MB/s
</td>
<td align="center">
<code>uniform_int_distribution</code><br/>
<code>std::uniform_int_distribution</code><br/>
<span style="color:#ABABAB;">x86:</span> <b>47.2</b> &centerdot;
<span style="color:#ABABAB;">arm:</span> <b>20.4</b> MB/s
</td>
Expand All @@ -193,7 +193,7 @@ __Who is this for?__
<tr>
<td align="center">⚪</td>
<td align="center">
<code>transform</code><br/>
<code>std::transform</code><br/>
<span style="color:#ABABAB;">x86:</span> <b>3.81</b> &centerdot;
<span style="color:#ABABAB;">arm:</span> <b>2.65</b> GB/s
</td>
Expand Down
13 changes: 6 additions & 7 deletions c/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,21 +232,20 @@ static void sz_dispatch_table_init(void) {
}

#if defined(_MSC_VER)
#pragma section(".CRT$XCU", read)
__declspec(allocate(".CRT$XCU")) void (*_sz_dispatch_table_init)() = sz_dispatch_table_init;

BOOL WINAPI DllMain(HINSTANCE hints, DWORD forward_reason, LPVOID lp) {
switch (forward_reason) {
case DLL_PROCESS_ATTACH: sz_dispatch_table_init(); return TRUE;
case DLL_PROCESS_ATTACH:
sz_dispatch_table_init(); // Ensure initialization
return TRUE;
case DLL_THREAD_ATTACH: return TRUE;
case DLL_THREAD_DETACH: return TRUE;
case DLL_PROCESS_DETACH: return TRUE;
}
}

#if SZ_AVOID_LIBC
BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) {
DllMain(hints, forward_reason, lp);
return TRUE;
}
#endif

#else
__attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) { sz_dispatch_table_init(); }
Expand Down
12 changes: 10 additions & 2 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s
// operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls.
// The results are then combined with 2x `_mm512_test_epi8_mask` and 3x blends.
//
// - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
// - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
// - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 6 cycles latency, ports: 1*FP12
// - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p05
// - On Genoa: 1 cycle latency, ports: 1*FP0123
// - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)":
// - On Ice Lake: 3 cycles latency, ports: 1*p5
// - On Genoa: 4 cycles latency, ports: 1*FP01
//
sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut));
lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));
Expand Down
Loading

0 comments on commit 0b2cf44

Please sign in to comment.