/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 * Copyright 2024 Google LLC
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
 *	James Guilford <james.guilford@intel.com>
 *	David Cote <david.m.cote@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/linkage.h>
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
# The value is a byte count: it is compared directly against len, and any
# remaining length below it is handled by the non-interleaved remainder code.
#define SMALL_SIZE 200
################################################################
## 1) ALIGN:
################################################################
	# Advance bufp to the next 8-byte boundary so that the qword loads in
	# the later steps are aligned. n_misaligned = (-bufp) & 7 is the number
	# of leading bytes to consume first (0 when bufp is already aligned).
	# Only the low 3 address bits matter, so the 32-bit view bufp_d is used.
	# NOTE(review): bufp_d, n_misaligned, n_misaligned_q, len and crc0 are
	# register aliases defined outside this excerpt.
	mov	bufp_d, n_misaligned
	neg	n_misaligned
	and	$7, n_misaligned	# calculate the misalignment amount of
					# the address
	je	.Laligned		# Skip if aligned

	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
	# the remaining data to an 8-byte boundary.
.Ldo_align:
	# Fetch 8 bytes in one load; only the low n_misaligned bytes of %rax
	# are fed to the CRC below.
	# NOTE(review): this reads a full qword at bufp, so it presumably relies
	# on the caller having ensured at least 8 readable bytes -- confirm.
	movq	(bufp), %rax
	add	n_misaligned_q, bufp	# bufp is now 8-byte aligned
	sub	n_misaligned_q, len	# the leading bytes are accounted for
.Lalign_loop:
	crc32b	%al, crc0		# compute crc32 of 1-byte
	shr	$8, %rax		# get next byte
	dec	n_misaligned
	jne	.Lalign_loop
.Laligned:
################################################################
## 2) PROCESS BLOCK:
################################################################
# Pick the number of qwords each of the three lanes will process and leave
# it in %eax for .Lcrc_3lanes. A full block is 128*24 bytes, i.e. three
# lanes of 128 qwords (8 bytes) each.
cmp $128*24, len
jae .Lfull_block
.Lpartial_block:
# Compute floor(len / 24) to get num qwords to process from each lane.
# The division is carried out as a multiplication by a 16-bit fixed-point
# reciprocal followed by a right shift. Both ways of reaching this label
# (the fall-through above and the jump from step 5) come after a failed
# "len >= 128*24" test, and the approximation is exact for all such len.
imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
shr $16, %eax
jmp .Lcrc_3lanes
.Lfull_block:
# Processing 128 qwords from each lane.
# Execution falls through from here into .Lcrc_3lanes.
mov $128, %eax
################################################################
## 3) CRC each of three lanes:
################################################################
	# On entry %eax holds the number of qwords to process from each lane.
	# Lane 0 starts at bufp, lane 1 at bufp + chunk_bytes and lane 2 at
	# bufp + 2*chunk_bytes. Lane 0 continues the running CRC in crc0, while
	# lanes 1 and 2 start from zero in crc1 and crc2.
.Lcrc_3lanes:
	xor	crc1,crc1
	xor	crc2,crc2
	mov	%eax, chunk_bytes
	shl	$3, chunk_bytes		# num bytes to process from each lane
	# Bias the counter so that a non-negative %eax means a whole group of
	# four qwords can still be processed while keeping one qword back.
	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
	# Too few qwords for even one unrolled pass: skip the 4x loop.
	# NOTE(review): .Lcrc_3lanes_4x_done, and the handling of the leftover
	# and last qwords, are defined outside this excerpt.
	jl	.Lcrc_3lanes_4x_done

	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
	# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	crc32q	8(bufp), crc0_q
	crc32q	8(bufp,chunk_bytes_q), crc1
	crc32q	8(bufp,chunk_bytes_q,2), crc2
	crc32q	16(bufp), crc0_q
	crc32q	16(bufp,chunk_bytes_q), crc1
	crc32q	16(bufp,chunk_bytes_q,2), crc2
	crc32q	24(bufp), crc0_q
	crc32q	24(bufp,chunk_bytes_q), crc1
	crc32q	24(bufp,chunk_bytes_q,2), crc2
	add	$32, bufp		# four qwords consumed from every lane
	sub	$4, %eax
	jge	.Lcrc_3lanes_4x_loop
################################################################
## 5) If more blocks remain, goto (2):
################################################################
# NOTE(review): no instruction visible in this excerpt reduces len after
# step 3; presumably that happens in code not shown here -- confirm.
# At least one full block (128*24 bytes) is left: process another full block.
cmp $128*24, len
jae .Lfull_block
# Otherwise, when at least SMALL_SIZE bytes are left, process a partial block.
cmp $SMALL_SIZE, len
jae .Lpartial_block
# Fewer than SMALL_SIZE bytes are left: fall through to the remainder code.
#######################################################################
## 6) Process any remainder without interleaving:
#######################################################################
# Fold the remaining len bytes at bufp into crc0 with a single dependent
# chain of crc32 instructions: first whole qwords, then at most one dword,
# one word and one byte, selected by bits 2, 1 and 0 of len.
# Only the 32-bit view len_dw is examined here.
.Lsmall:
# Nothing left to process: go straight to the return sequence.
test len_dw, len_dw
jz .Ldone
# %eax = number of whole qwords in the remainder (len / 8).
mov len_dw, %eax
shr $3, %eax
jz .Ldo_dword
.Ldo_qwords:
crc32q (bufp), crc0_q
add $8, bufp
dec %eax
jnz .Ldo_qwords
.Ldo_dword:
# Bit 2 of len set: one 4-byte piece remains.
test $4, len_dw
jz .Ldo_word
crc32l (bufp), crc0
add $4, bufp
.Ldo_word:
# Bit 1 of len set: one 2-byte piece remains.
test $2, len_dw
jz .Ldo_byte
crc32w (bufp), crc0
add $2, bufp
.Ldo_byte:
# Bit 0 of len set: one final byte remains.
test $1, len_dw
jz .Ldone
crc32b (bufp), crc0
.Ldone:
# Hand the accumulated CRC back in %eax.
mov crc0, %eax
RET
SYM_FUNC_END(crc32c_x86_3way)
/*
 * The information on this website was compiled carefully and to the best of
 * our knowledge. However, neither the completeness, nor the correctness, nor
 * the quality of the information provided is guaranteed.
 *
 * Note:
 * The colored syntax highlighting and the measurement are still experimental.
 */