< prev index next >
src/hotspot/cpu/s390/macroAssembler_s390.cpp
Print this page
*** 934,944 ****
assert((total_distance & 0x01L) == 0, "halfword alignment is mandatory");
assert(total_distance != 0, "sanity");
// Some extra safety net.
if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
! guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "too far away");
}
(this)->relocate(rspec, relocInfo::pcrel_addr_format);
z_lgrl(Rdst, RelAddr::pcrel_off32(total_distance));
}
--- 934,944 ----
assert((total_distance & 0x01L) == 0, "halfword alignment is mandatory");
assert(total_distance != 0, "sanity");
// Some extra safety net.
if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
! guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "load_long_pcrelative can't handle distance " INTPTR_FORMAT, total_distance);
}
(this)->relocate(rspec, relocInfo::pcrel_addr_format);
z_lgrl(Rdst, RelAddr::pcrel_off32(total_distance));
}
*** 954,964 ****
assert((total_distance & 0x01L) == 0, "halfword alignment is mandatory");
// Some extra safety net.
if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
! guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "too far away");
}
(this)->relocate(rspec, relocInfo::pcrel_addr_format);
z_lgrl(Rdst, RelAddr::pcrel_off32(total_distance));
}
--- 954,964 ----
assert((total_distance & 0x01L) == 0, "halfword alignment is mandatory");
// Some extra safety net.
if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
! guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "load_long_pcrelative can't handle distance " INTPTR_FORMAT, total_distance);
}
(this)->relocate(rspec, relocInfo::pcrel_addr_format);
z_lgrl(Rdst, RelAddr::pcrel_off32(total_distance));
}
*** 1023,1032 ****
--- 1023,1039 ----
} else {
ShouldNotReachHere();
}
}
+ // Emit a z_pfd (prefetch data) instruction with prefetch code 1 for the memory operand a
+ // (20-bit displacement, optional index register, base register).
+ // NOTE(review): code 1 is presumably the "prefetch for fetch (read) access" request - confirm against the PFD definition.
+ void MacroAssembler::prefetch_read(Address a) {
+ const int fetch_access_code = 1;
+ z_pfd(fetch_access_code, a.disp20(), a.indexOrR0(), a.base());
+ }
+ // Emit a z_pfd (prefetch data) instruction with prefetch code 2 for the memory operand a
+ // (20-bit displacement, optional index register, base register).
+ // NOTE(review): code 2 is presumably the "prefetch for store (update) access" request - confirm against the PFD definition.
+ void MacroAssembler::prefetch_update(Address a) {
+ const int store_access_code = 2;
+ z_pfd(store_access_code, a.disp20(), a.indexOrR0(), a.base());
+ }
+
// Clear a register, i.e. load const zero into reg.
// Return len (in bytes) of generated instruction(s).
// whole_reg: Clear 64 bits if true, 32 bits otherwise.
// set_cc: Use instruction that sets the condition code, if true.
int MacroAssembler::clear_reg(Register r, bool whole_reg, bool set_cc) {
*** 4894,4974 ****
// Special String Intrinsics. Implementation
//------------------------------------------------------
// Intrinsics for CompactStrings
! // Compress char[] to byte[]. odd_reg contains cnt. Kills dst. Early clobber: result (doubles as the read index ind1). Also overwrites Z_R0, Z_R1, tmp (mask), even_reg and, if provided, tmp2.
// The result is the number of characters copied before the first incompatible character was found.
! // If tmp2 is provided and the compression fails, the compression stops exactly at this point and the result is precise.
//
// Note: Does not behave exactly like package private StringUTF16 compress java implementation in case of failure:
! // - Different number of characters may have been written to dead array (if tmp2 not provided).
// - Returns a number <cnt instead of 0. (Result gets compared with cnt.)
! unsigned int MacroAssembler::string_compress(Register result, Register src, Register dst, Register odd_reg,
! Register even_reg, Register tmp, Register tmp2) {
! int block_start = offset(); // remember where the emitted block starts; used for the return value.
! Label Lloop1, Lloop2, Lslow, Ldone;
! const Register addr2 = dst, ind1 = result, mask = tmp; // aliases: write pointer, read index (in bytes), check mask.
! const bool precise = (tmp2 != noreg); // precise mode needs tmp2 as scratch for the pre-store check.
BLOCK_COMMENT("string_compress {");
! z_sll(odd_reg, 1); // Number of bytes to read. (Must be a positive simm32.)
! clear_reg(ind1); // Index to read.
! z_llilf(mask, 0xFF00FF00); // low word of the mask: high byte of each of two chars.
! z_ahi(odd_reg, -16); // Last possible index for fast loop.
! z_brl(Lslow); // fewer than 16 bytes (8 chars): fast loop not applicable.
!
! // ind1: index, even_reg: index increment, odd_reg: index limit
! z_iihf(mask, 0xFF00FF00); // complete the mask to 64 bits (4 chars).
! z_lhi(even_reg, 16); // fast loop advances the read index by 16 bytes.
!
! bind(Lloop1); // 8 Characters per iteration.
! z_lg(Z_R0, Address(src, ind1)); // chars 0..3
! z_lg(Z_R1, Address(src, ind1, 8)); // chars 4..7
! if (precise) { // check all 8 chars BEFORE anything is stored.
if (VM_Version::has_DistinctOpnds()) {
! z_ogrk(tmp2, Z_R0, Z_R1);
} else {
! z_lgr(tmp2, Z_R0);
! z_ogr(tmp2, Z_R1);
! }
! z_ngr(tmp2, mask); // any high byte set?
! z_brne(Lslow); // Failed fast case, retry slowly.
}
! z_stcmh(Z_R0, 5, 0, addr2); // mask 5 selects bytes 1 and 3 of the register half, i.e. the low byte of each of its two chars.
! z_stcm(Z_R0, 5, 2, addr2);
! if (!precise) { z_ogr(Z_R0, Z_R1); } // imprecise mode: combine for the check only after Z_R0 has been stored.
! z_stcmh(Z_R1, 5, 4, addr2);
! z_stcm(Z_R1, 5, 6, addr2);
if (!precise) {
! z_ngr(Z_R0, mask);
! z_brne(Ldone); // Failed (more than needed was written).
}
- z_aghi(addr2, 8); // 8 compressed bytes were written.
- z_brxle(ind1, even_reg, Lloop1); // ind1 += even_reg; continue while ind1 <= odd_reg.
! bind(Lslow);
! // Compute index limit and skip if negative.
! z_ahi(odd_reg, 16-2); // Last possible index for slow loop.
! z_lhi(even_reg, 2); // slow loop advances the read index by one char (2 bytes).
! z_cr(ind1, odd_reg);
! z_brh(Ldone); // index already beyond the limit: nothing left to do.
!
! bind(Lloop2); // 1 Character per iteration.
! z_llh(Z_R0, Address(src, ind1));
! z_tmll(Z_R0, 0xFF00); // test the high byte of the char.
! z_brnaz(Ldone); // Failed slow case: Return number of written characters.
! z_stc(Z_R0, Address(addr2));
! z_aghi(addr2, 1);
! z_brxle(ind1, even_reg, Lloop2);
! bind(Ldone); // result = ind1 = 2*cnt
! z_srl(ind1, 1); // convert the byte index into a character count.
BLOCK_COMMENT("} string_compress");
!
return offset() - block_start; // offset() delta across the emitted block (presumably the code size in bytes).
}
// Inflate byte[] to char[].
unsigned int MacroAssembler::string_inflate_trot(Register src, Register dst, Register cnt, Register tmp) {
--- 4901,5195 ----
// Special String Intrinsics. Implementation
//------------------------------------------------------
// Intrinsics for CompactStrings
! // Compress char[] to byte[].
! // Restores: src, dst
! // Uses: cnt
! // Kills: tmp, Z_R0, Z_R1.
! // Early clobber: result. It holds the incompatibility check mask from the very first
! // emitted instruction on, so it must not alias src, dst, cnt, tmp, Z_R0 or Z_R1.
! // Note:
! // cnt is signed int. Do not rely on high word!
! // counts # characters, not bytes.
// The result is the number of characters copied before the first incompatible character was found.
! // If precise is true, the processing stops exactly at this point. Otherwise, the result may be off
! // by a few bytes. The result always indicates the number of copied characters.
//
// Note: Does not behave exactly like package private StringUTF16 compress java implementation in case of failure:
! // - Different number of characters may have been written to dead array (if precise is false).
// - Returns a number <cnt instead of 0. (Result gets compared with cnt.)
! //
! // Structure of the emitted code: shortcuts for exactly 4 and exactly 8 characters, an optional
! // vector loop (32 chars/iteration), an unrolled loop (8 chars/iteration) and a scalar loop
! // (1 char/iteration). Z_R0 carries the "# chars already processed" count from one stage to the next.
! // The C++ return value is the offset() delta across the emitted block.
! unsigned int MacroAssembler::string_compress(Register result, Register src, Register dst, Register cnt,
! Register tmp, bool precise) {
! assert_different_registers(Z_R0, Z_R1, result, src, dst, cnt, tmp); // result is written first (mask), see above.
+ if (precise) {
+ BLOCK_COMMENT("encode_iso_array {");
+ } else {
BLOCK_COMMENT("string_compress {");
+ }
+ int block_start = offset();
! Register Rsrc = src;
! Register Rdst = dst;
! Register Rix = tmp;
! Register Rcnt = cnt;
! Register Rmask = result; // holds incompatibility check mask until result value is stored.
! Label ScalarShortcut, AllDone;
!
! z_iilf(Rmask, 0xFF00FF00);
! z_iihf(Rmask, 0xFF00FF00);
!
! {
! //---< shortcuts for short strings (very frequent) >---
! // Strings with 4 and 8 characters were found to occur very frequently.
! // Therefore, we handle them right away with minimal overhead.
! Label skipShortcut, skip4Shortcut, skip8Shortcut;
! Register Rout = Z_R0; // register holding the (unmodified) source chars for the store instructions.
! z_chi(Rcnt, 4);
! z_brne(skip4Shortcut); // 4 characters are very frequent
! z_lg(Z_R0, 0, Rsrc); // Treat exactly 4 characters specially.
if (VM_Version::has_DistinctOpnds()) {
! Rout = Z_R0;
! z_ngrk(Rix, Z_R0, Rmask);
} else {
! Rout = Rix; // Z_R0 is destroyed by the check, keep a copy in Rix.
! z_lgr(Rix, Z_R0);
! z_ngr(Z_R0, Rmask);
! }
! z_brnz(skipShortcut); // incompatible char found, take the general path.
! z_stcmh(Rout, 5, 0, Rdst); // mask 5: store the low byte of each of the two chars in the register half.
! z_stcm(Rout, 5, 2, Rdst);
! z_lgfr(result, Rcnt);
! z_bru(AllDone);
! bind(skip4Shortcut);
!
! z_chi(Rcnt, 8);
! z_brne(skip8Shortcut); // There's more to do...
! z_lmg(Z_R0, Z_R1, 0, Rsrc); // Treat exactly 8 characters specially.
! if (VM_Version::has_DistinctOpnds()) {
! Rout = Z_R0;
! z_ogrk(Rix, Z_R0, Z_R1);
! z_ngr(Rix, Rmask);
! } else {
! Rout = Rix; // Z_R0 is destroyed by the check, keep a copy in Rix.
! z_lgr(Rix, Z_R0);
! z_ogr(Z_R0, Z_R1);
! z_ngr(Z_R0, Rmask);
! }
! z_brnz(skipShortcut); // incompatible char found, take the general path.
! z_stcmh(Rout, 5, 0, Rdst);
! z_stcm(Rout, 5, 2, Rdst);
! z_stcmh(Z_R1, 5, 4, Rdst);
! z_stcm(Z_R1, 5, 6, Rdst);
! z_lgfr(result, Rcnt);
! z_bru(AllDone);
!
! bind(skip8Shortcut);
! clear_reg(Z_R0, true, false); // #characters already processed (none). Precond for scalar loop.
! // set_cc == false: the condition code of z_chi(Rcnt, 8) must survive.
! z_brl(ScalarShortcut); // Just a few characters
!
! bind(skipShortcut);
! }
! clear_reg(Z_R0); // make sure register is properly initialized.
!
! if (VM_Version::has_VectorFacility()) {
! const int min_vcnt = 32; // Minimum #characters required to use vector instructions.
! // Otherwise just do nothing in vector mode.
! // Must be multiple of 2*(vector register length in chars (8 HW = 128 bits)).
! const int log_min_vcnt = exact_log2(min_vcnt);
! Label VectorLoop, VectorDone, VectorBreak;
!
! VectorRegister Vtmp1 = Z_V16;
! VectorRegister Vtmp2 = Z_V17;
! VectorRegister Vmask = Z_V18;
! VectorRegister Vzero = Z_V19;
! VectorRegister Vsrc_first = Z_V20;
! VectorRegister Vsrc_last = Z_V23;
!
! assert((Vsrc_last->encoding() - Vsrc_first->encoding() + 1) == min_vcnt/8, "logic error");
! assert(VM_Version::has_DistinctOpnds(), "Assumption when has_VectorFacility()");
! z_srak(Rix, Rcnt, log_min_vcnt); // # vector loop iterations
! z_brz(VectorDone); // not enough data for vector loop
!
! z_vzero(Vzero); // all zeroes
! z_vgmh(Vmask, 0, 7); // generate 0xff00 mask for all 2-byte elements
! z_sllg(Z_R0, Rix, log_min_vcnt); // remember #chars that will be processed by vector loop
!
! bind(VectorLoop);
! z_vlm(Vsrc_first, Vsrc_last, 0, Rsrc);
! add2reg(Rsrc, min_vcnt*2); // Rsrc is advanced BEFORE the check. VectorBreak has to undo this.
!
! //---< check for incompatible character >---
! z_vo(Vtmp1, Z_V20, Z_V21);
! z_vo(Vtmp2, Z_V22, Z_V23);
! z_vo(Vtmp1, Vtmp1, Vtmp2);
! z_vn(Vtmp1, Vtmp1, Vmask);
! z_vceqhs(Vtmp1, Vtmp1, Vzero); // high half of all chars must be zero for successful compress.
! z_brnaz(VectorBreak); // break vector loop unless ALL elements compared equal (CC != 0).
! // A plain "not equal" branch only covers CC1/CC2 and would miss CC3
! // (no element equal), i.e. a block of incompatible characters only.
! // re-process data from current iteration in break handler.
!
! //---< pack & store characters >---
! z_vpkh(Vtmp1, Z_V20, Z_V21); // pack (src1, src2) -> tmp1
! z_vpkh(Vtmp2, Z_V22, Z_V23); // pack (src3, src4) -> tmp2
! z_vstm(Vtmp1, Vtmp2, 0, Rdst); // store packed string
! add2reg(Rdst, min_vcnt);
!
! z_brct(Rix, VectorLoop);
!
! z_bru(VectorDone);
!
! bind(VectorBreak);
! add2reg(Rsrc, -min_vcnt*2); // Fix Rsrc. It was already advanced for the broken iteration, Rdst and Rix were not.
! // Without this, the loops below would skip min_vcnt chars of the source.
! z_sll(Rix, log_min_vcnt); // # chars not yet processed by VectorLoop, incl. current (broken) iteration.
! z_sr(Z_R0, Rix); // correct # chars processed in total.
!
! bind(VectorDone);
! }
!
! {
! const int min_cnt = 8; // Minimum #characters required to use unrolled loop.
! // Otherwise just do nothing in unrolled loop.
! // Must be multiple of 8.
! const int log_min_cnt = exact_log2(min_cnt);
! Label UnrolledLoop, UnrolledDone, UnrolledBreak;
!
! if (VM_Version::has_DistinctOpnds()) {
! z_srk(Rix, Rcnt, Z_R0); // remaining # chars to compress in unrolled loop
! } else {
! z_lr(Rix, Rcnt);
! z_sr(Rix, Z_R0);
}
! z_sra(Rix, log_min_cnt); // unrolled loop count
! z_brz(UnrolledDone);
!
! bind(UnrolledLoop);
! z_lmg(Z_R0, Z_R1, 0, Rsrc); // overwrites the "# chars processed" count in Z_R0; it is recalculated after the loop.
! if (precise) {
! z_ogr(Z_R1, Z_R0); // check all 8 chars for incompatibility
! z_ngr(Z_R1, Rmask);
! z_brnz(UnrolledBreak);
!
! z_lg(Z_R1, 8, Rsrc); // reload destroyed register
! z_stcmh(Z_R0, 5, 0, Rdst);
! z_stcm(Z_R0, 5, 2, Rdst);
! } else {
! z_stcmh(Z_R0, 5, 0, Rdst); // imprecise mode: the first 4 chars are stored before the check.
! z_stcm(Z_R0, 5, 2, Rdst);
!
! z_ogr(Z_R0, Z_R1);
! z_ngr(Z_R0, Rmask);
! z_brnz(UnrolledBreak);
! }
! z_stcmh(Z_R1, 5, 4, Rdst);
! z_stcm(Z_R1, 5, 6, Rdst);
!
! add2reg(Rsrc, min_cnt*2);
! add2reg(Rdst, min_cnt);
! z_brct(Rix, UnrolledLoop);
!
! z_lgfr(Z_R0, Rcnt); // # chars processed in total after unrolled loop.
! z_nilf(Z_R0, ~(min_cnt-1));
! z_tmll(Rcnt, min_cnt-1);
! z_brnaz(ScalarShortcut); // if all bits zero, there is nothing left to do for scalar loop.
! // Rix == 0 in all cases.
! z_lgfr(result, Rcnt); // all characters processed.
! z_sgfr(Rdst, Rcnt); // restore ptr
! z_sgfr(Rsrc, Rcnt); // restore ptr, double the element count for Rsrc restore
! z_sgfr(Rsrc, Rcnt);
! z_bru(AllDone);
!
! bind(UnrolledBreak);
! z_lgfr(Z_R0, Rcnt); // # chars processed in total after unrolled loop
! z_nilf(Z_R0, ~(min_cnt-1));
! z_sll(Rix, log_min_cnt); // # chars not yet processed by UnrolledLoop, incl. current (broken) iteration.
! z_sr(Z_R0, Rix); // correct # chars processed in total.
if (!precise) {
! z_lgfr(result, Z_R0);
! z_aghi(result, min_cnt/2); // min_cnt/2 characters have already been written
! // but ptrs were not updated yet.
! z_sgfr(Rdst, Z_R0); // restore ptr
! z_sgfr(Rsrc, Z_R0); // restore ptr, double the element count for Rsrc restore
! z_sgfr(Rsrc, Z_R0);
! z_bru(AllDone);
! }
! bind(UnrolledDone); // precise mode falls through to here after a break; the scalar loop locates the offending char.
! }
!
! {
! Label ScalarLoop, ScalarDone, ScalarBreak;
!
! bind(ScalarShortcut);
! z_ltgfr(result, Rcnt); // the mask in result is no longer needed from here on.
! z_brz(AllDone);
!
! {
! //---< Special treatment for very short strings (one or two characters) >---
! // For these strings, we are sure that the above code was skipped.
! // Thus, no registers were modified, register restore is not required.
! Label ScalarDoit, Scalar2Char;
! z_chi(Rcnt, 2);
! z_brh(ScalarDoit);
! z_llh(Z_R1, 0, Z_R0, Rsrc); // does not change the condition code set by z_chi.
! z_bre(Scalar2Char);
! z_tmll(Z_R1, 0xff00);
! z_lghi(result, 0); // cnt == 1, first char invalid, no chars successfully processed
! z_brnaz(AllDone);
! z_stc(Z_R1, 0, Z_R0, Rdst);
! z_lghi(result, 1);
! z_bru(AllDone);
!
! bind(Scalar2Char);
! z_llh(Z_R0, 2, Z_R0, Rsrc);
! z_tmll(Z_R1, 0xff00);
! z_lghi(result, 0); // cnt == 2, first char invalid, no chars successfully processed
! z_brnaz(AllDone);
! z_stc(Z_R1, 0, Z_R0, Rdst);
! z_tmll(Z_R0, 0xff00);
! z_lghi(result, 1); // cnt == 2, second char invalid, one char successfully processed
! z_brnaz(AllDone);
! z_stc(Z_R0, 1, Z_R0, Rdst);
! z_lghi(result, 2);
! z_bru(AllDone);
!
! bind(ScalarDoit);
}
! if (VM_Version::has_DistinctOpnds()) {
! z_srk(Rix, Rcnt, Z_R0); // remaining # chars to compress in scalar loop
! } else {
! z_lr(Rix, Rcnt);
! z_sr(Rix, Z_R0);
! }
! z_lgfr(result, Rcnt); // # processed characters (if all runs ok).
! z_brz(ScalarDone); // condition code still stems from the subtraction above.
! bind(ScalarLoop);
! z_llh(Z_R1, 0, Z_R0, Rsrc);
! z_tmll(Z_R1, 0xff00);
! z_brnaz(ScalarBreak);
! z_stc(Z_R1, 0, Z_R0, Rdst);
! add2reg(Rsrc, 2);
! add2reg(Rdst, 1);
! z_brct(Rix, ScalarLoop);
!
! z_bru(ScalarDone);
!
! bind(ScalarBreak);
! z_sr(result, Rix); // result = cnt - (# chars not processed) = # chars copied.
!
! bind(ScalarDone);
! z_sgfr(Rdst, result); // restore ptr
! z_sgfr(Rsrc, result); // restore ptr, double the element count for Rsrc restore
! z_sgfr(Rsrc, result);
! }
! bind(AllDone);
+ if (precise) {
+ BLOCK_COMMENT("} encode_iso_array");
+ } else {
BLOCK_COMMENT("} string_compress");
! }
return offset() - block_start;
}
// Inflate byte[] to char[].
unsigned int MacroAssembler::string_inflate_trot(Register src, Register dst, Register cnt, Register tmp) {
*** 4995,5051 ****
BLOCK_COMMENT("} string_inflate");
return offset() - block_start;
}
! // Inflate byte[] to char[]. odd_reg contains cnt. Kills src. Also overwrites Z_R0, Z_R1, tmp (write index), even_reg and odd_reg.
! unsigned int MacroAssembler::string_inflate(Register src, Register dst, Register odd_reg,
! Register even_reg, Register tmp) {
! int block_start = offset(); // remember where the emitted block starts; used for the return value.
BLOCK_COMMENT("string_inflate {");
- Label Lloop1, Lloop2, Lslow, Ldone;
- const Register addr1 = src, ind2 = tmp; // aliases: read pointer, write index (in bytes).
! z_sll(odd_reg, 1); // Number of bytes to write. (Must be a positive simm32.)
! clear_reg(ind2); // Index to write.
! z_ahi(odd_reg, -16); // Last possible index for fast loop.
! z_brl(Lslow); // fewer than 16 bytes (8 chars) to write: fast loop not applicable.
- // ind2: index, even_reg: index increment, odd_reg: index limit
clear_reg(Z_R0); // the bytes not touched by the inserts below must be zero (high byte of each char).
clear_reg(Z_R1);
- z_lhi(even_reg, 16); // fast loop advances the write index by 16 bytes.
-
- bind(Lloop1); // 8 Characters per iteration.
- z_icmh(Z_R0, 5, 0, addr1); // mask 5: insert two source bytes into bytes 1 and 3 of the register half.
- z_icmh(Z_R1, 5, 4, addr1);
- z_icm(Z_R0, 5, 2, addr1);
- z_icm(Z_R1, 5, 6, addr1);
- z_aghi(addr1, 8); // 8 source bytes consumed.
- z_stg(Z_R0, Address(dst, ind2));
- z_stg(Z_R1, Address(dst, ind2, 8));
- z_brxle(ind2, even_reg, Lloop1); // ind2 += even_reg; continue while ind2 <= odd_reg.
-
- bind(Lslow);
- // Compute index limit and skip if negative.
- z_ahi(odd_reg, 16-2); // Last possible index for slow loop.
- z_lhi(even_reg, 2); // slow loop advances the write index by one char (2 bytes).
- z_cr(ind2, odd_reg);
- z_brh(Ldone); // index already beyond the limit: nothing left to do.
-
- bind(Lloop2); // 1 Character per iteration.
- z_llc(Z_R0, Address(addr1)); // zero-extending byte load
- z_sth(Z_R0, Address(dst, ind2));
- z_aghi(addr1, 1);
- z_brxle(ind2, even_reg, Lloop2);
! bind(Ldone);
BLOCK_COMMENT("} string_inflate");
return offset() - block_start; // offset() delta across the emitted block (presumably the code size in bytes).
}
// Kills src.
unsigned int MacroAssembler::has_negatives(Register result, Register src, Register cnt,
--- 5216,5647 ----
BLOCK_COMMENT("} string_inflate");
return offset() - block_start;
}
! // Inflate byte[] to char[].
! // Restores: src, dst
! // Uses: cnt
! // Kills: tmp, Z_R0, Z_R1.
! // Note:
! // cnt is signed int. Do not rely on high word!
! // counts # characters, not bytes.
! unsigned int MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
! assert_different_registers(Z_R0, Z_R1, src, dst, cnt, tmp);
BLOCK_COMMENT("string_inflate {");
+ int block_start = offset(); // remember where the emitted block starts; used for the return value.
+
+ Register Rcnt = cnt; // # characters (src: bytes, dst: char (2-byte)), remaining after current loop.
+ Register Rix = tmp; // loop index
+ Register Rsrc = src; // addr(src array)
+ Register Rdst = dst; // addr(dst array)
+ Label ScalarShortcut, AllDone;
+
+ {
+ //---< shortcuts for short strings (very frequent) >---
+ Label skipShortcut, skip4Shortcut;
+ z_ltr(Rcnt, Rcnt); // absolutely nothing to do for strings of len == 0.
+ z_brz(AllDone);
+ clear_reg(Z_R0); // make sure registers are properly initialized.
+ clear_reg(Z_R1);
+ z_chi(Rcnt, 4);
+ z_brne(skip4Shortcut); // 4 characters are very frequent
+ z_icm(Z_R0, 5, 0, Rsrc); // Treat exactly 4 characters specially.
+ z_icm(Z_R1, 5, 2, Rsrc); // mask 5: insert two source bytes into bytes 1 and 3 of the low word.
+ z_stm(Z_R0, Z_R1, 0, Rdst); // store both low words (8 bytes = 4 chars).
+ z_bru(AllDone);
+ bind(skip4Shortcut);
+
+ z_chi(Rcnt, 8);
+ z_brh(skipShortcut); // There's a lot to do...
+ z_lgfr(Z_R0, Rcnt); // remaining #characters (<= 8). Precond for scalar loop.
+ // This does not destroy the "register cleared" state of Z_R0.
+ z_brl(ScalarShortcut); // Just a few characters
+ z_icmh(Z_R0, 5, 0, Rsrc); // Treat exactly 8 characters specially.
+ z_icmh(Z_R1, 5, 4, Rsrc);
+ z_icm(Z_R0, 5, 2, Rsrc);
+ z_icm(Z_R1, 5, 6, Rsrc);
+ z_stmg(Z_R0, Z_R1, 0, Rdst); // store both registers (16 bytes = 8 chars).
+ z_bru(AllDone);
+ bind(skipShortcut);
+ }
+ clear_reg(Z_R0); // make sure register is properly initialized.
+
+ if (VM_Version::has_VectorFacility()) {
+ const int min_vcnt = 32; // Minimum #characters required to use vector instructions.
+ // Otherwise just do nothing in vector mode.
+ // Must be multiple of vector register length (16 bytes = 128 bits).
+ const int log_min_vcnt = exact_log2(min_vcnt);
+ Label VectorLoop, VectorDone;
+
+ assert(VM_Version::has_DistinctOpnds(), "Assumption when has_VectorFacility()");
+ z_srak(Rix, Rcnt, log_min_vcnt); // calculate # vector loop iterations
+ z_brz(VectorDone); // skip if none
+
+ z_sllg(Z_R0, Rix, log_min_vcnt); // remember #chars that will be processed by vector loop
+
+ bind(VectorLoop);
+ z_vlm(Z_V20, Z_V21, 0, Rsrc); // get next 32 characters (single-byte)
+ add2reg(Rsrc, min_vcnt);
+
+ z_vuplhb(Z_V22, Z_V20); // V2 <- (expand) V0(high)
+ z_vupllb(Z_V23, Z_V20); // V3 <- (expand) V0(low)
+ z_vuplhb(Z_V24, Z_V21); // V4 <- (expand) V1(high)
+ z_vupllb(Z_V25, Z_V21); // V5 <- (expand) V1(low)
+ z_vstm(Z_V22, Z_V25, 0, Rdst); // store next 32 chars (4 vector registers = 64 bytes)
+ add2reg(Rdst, min_vcnt*2);
+
+ z_brct(Rix, VectorLoop);
+
+ bind(VectorDone);
+ }
+
+ const int min_cnt = 8; // Minimum #characters required to use unrolled scalar loop.
+ // Otherwise just do nothing in unrolled scalar mode.
+ // Must be multiple of 8.
+ {
+ const int log_min_cnt = exact_log2(min_cnt);
+ Label UnrolledLoop, UnrolledDone;
! if (VM_Version::has_DistinctOpnds()) {
! z_srk(Rix, Rcnt, Z_R0); // remaining # chars to process in unrolled loop
! } else {
! z_lr(Rix, Rcnt);
! z_sr(Rix, Z_R0);
! }
! z_sra(Rix, log_min_cnt); // unrolled loop count
! z_brz(UnrolledDone);
clear_reg(Z_R0); // the bytes not touched by the inserts below must be zero (high byte of each char).
clear_reg(Z_R1);
! bind(UnrolledLoop);
! z_icmh(Z_R0, 5, 0, Rsrc);
! z_icmh(Z_R1, 5, 4, Rsrc);
! z_icm(Z_R0, 5, 2, Rsrc);
! z_icm(Z_R1, 5, 6, Rsrc);
! add2reg(Rsrc, min_cnt);
!
! z_stmg(Z_R0, Z_R1, 0, Rdst);
!
! add2reg(Rdst, min_cnt*2);
! z_brct(Rix, UnrolledLoop);
!
! bind(UnrolledDone);
! z_lgfr(Z_R0, Rcnt); // # chars left over after unrolled loop.
! z_nilf(Z_R0, min_cnt-1);
! z_brnz(ScalarShortcut); // if zero, there is nothing left to do for scalar loop.
! // Rix == 0 in all cases.
! z_sgfr(Z_R0, Rcnt); // negative # characters the ptrs have been advanced previously.
! z_agr(Rdst, Z_R0); // restore ptr, double the element count for Rdst restore.
! z_agr(Rdst, Z_R0);
! z_agr(Rsrc, Z_R0); // restore ptr.
! z_bru(AllDone);
! }
!
! {
! bind(ScalarShortcut);
! // Z_R0 must contain remaining # characters as 64-bit signed int here.
! // register contents is preserved over scalar processing (for register fixup).
!
! {
! // Strings of one or two characters in total: nothing was processed before and no ptr was advanced,
! // so Z_R0 may be used as work register and no fixup is required.
! Label ScalarDefault;
! z_chi(Rcnt, 2);
! z_brh(ScalarDefault);
! z_llc(Z_R0, 0, Z_R0, Rsrc); // 6 bytes
! z_sth(Z_R0, 0, Z_R0, Rdst); // 4 bytes
! z_brl(AllDone); // condition code still stems from z_chi: cnt < 2, done after one char.
! z_llc(Z_R0, 1, Z_R0, Rsrc); // 6 bytes
! z_sth(Z_R0, 2, Z_R0, Rdst); // 4 bytes
! z_bru(AllDone);
! bind(ScalarDefault);
! }
!
! Label CodeTable;
! // The remaining 1..7 characters are handled by branching into a table of llc/sth pairs.
! // Each pair occupies 10 bytes (6 + 4), the table ends at CodeTable. Branching to
! // CodeTable - 10*(# remaining chars) executes exactly the required number of pairs.
! // Do not change size or order of the table instructions.
! // Some comments on Rix calculation:
! // - Rcnt is small, therefore no bits shifted out of low word (sll(g) instructions).
! // - high word of both Rix and Rcnt may contain garbage
! // - the final lngfr takes care of that garbage, extending the sign to high word
! z_sllg(Rix, Z_R0, 2); // calculate 10*Z_R0 = (4*Z_R0 + Z_R0)*2
! z_ar(Rix, Z_R0);
! z_larl(Z_R1, CodeTable);
! z_sll(Rix, 1);
! z_lngfr(Rix, Rix); // ix range: [0..7], after inversion & mult: [-(7*10)..(0*10)].
! z_bc(Assembler::bcondAlways, 0, Rix, Z_R1);
!
! z_llc(Z_R1, 6, Z_R0, Rsrc); // 6 bytes
! z_sth(Z_R1, 12, Z_R0, Rdst); // 4 bytes
!
! z_llc(Z_R1, 5, Z_R0, Rsrc);
! z_sth(Z_R1, 10, Z_R0, Rdst);
!
! z_llc(Z_R1, 4, Z_R0, Rsrc);
! z_sth(Z_R1, 8, Z_R0, Rdst);
!
! z_llc(Z_R1, 3, Z_R0, Rsrc);
! z_sth(Z_R1, 6, Z_R0, Rdst);
!
! z_llc(Z_R1, 2, Z_R0, Rsrc);
! z_sth(Z_R1, 4, Z_R0, Rdst);
!
! z_llc(Z_R1, 1, Z_R0, Rsrc);
! z_sth(Z_R1, 2, Z_R0, Rdst);
!
! z_llc(Z_R1, 0, Z_R0, Rsrc);
! z_sth(Z_R1, 0, Z_R0, Rdst);
! bind(CodeTable);
!
! z_chi(Rcnt, 8); // no fixup for small strings. Rdst, Rsrc were not modified.
! z_brl(AllDone);
!
! z_sgfr(Z_R0, Rcnt); // negative # characters the ptrs have been advanced previously.
! z_agr(Rdst, Z_R0); // restore ptr, double the element count for Rdst restore.
! z_agr(Rdst, Z_R0);
! z_agr(Rsrc, Z_R0); // restore ptr.
! }
! bind(AllDone);
BLOCK_COMMENT("} string_inflate");
+ return offset() - block_start; // offset() delta across the emitted block (presumably the code size in bytes).
+ }
+
+ // Inflate byte[] to char[], length known at compile time.
+ // Restores: src, dst
+ // Kills: tmp, Z_R0, Z_R1.
+ // Note:
+ // len is signed int. Counts # characters, not bytes.
+ unsigned int MacroAssembler::string_inflate_const(Register src, Register dst, Register tmp, int len) {
+ assert_different_registers(Z_R0, Z_R1, src, dst, tmp);
+
+ BLOCK_COMMENT("string_inflate_const {");
+ int block_start = offset(); // remember where the emitted block starts; used for the return value.
+
+ Register Rix = tmp; // loop index
+ Register Rsrc = src; // addr(src array)
+ Register Rdst = dst; // addr(dst array)
+ Label ScalarShortcut, AllDone; // NOTE(review): neither label is bound or referenced in this function.
+ int nprocessed = 0; // # characters for which code has been generated so far.
+ int src_off = 0; // compensate for saved (optimized away) ptr advancement.
+ int dst_off = 0; // compensate for saved (optimized away) ptr advancement.
+ bool restore_inputs = false; // set when a generated loop advances Rsrc/Rdst at runtime.
+ bool workreg_clear = false; // set once code to clear Z_R0 and Z_R1 has been generated.
+
+ if ((len >= 32) && VM_Version::has_VectorFacility()) {
+ const int min_vcnt = 32; // Minimum #characters required to use vector instructions.
+ // Otherwise just do nothing in vector mode.
+ // Must be multiple of vector register length (16 bytes = 128 bits).
+ const int log_min_vcnt = exact_log2(min_vcnt);
+ const int iterations = (len - nprocessed) >> log_min_vcnt;
+ nprocessed += iterations << log_min_vcnt;
+ Label VectorLoop;
+
+ if (iterations == 1) { // single iteration: no loop, address via displacement instead of advancing the ptrs.
+ z_vlm(Z_V20, Z_V21, 0+src_off, Rsrc); // get next 32 characters (single-byte)
+ z_vuplhb(Z_V22, Z_V20); // V2 <- (expand) V0(high)
+ z_vupllb(Z_V23, Z_V20); // V3 <- (expand) V0(low)
+ z_vuplhb(Z_V24, Z_V21); // V4 <- (expand) V1(high)
+ z_vupllb(Z_V25, Z_V21); // V5 <- (expand) V1(low)
+ z_vstm(Z_V22, Z_V25, 0+dst_off, Rdst); // store next 32 chars (4 vector registers = 64 bytes)
+
+ src_off += min_vcnt;
+ dst_off += min_vcnt*2;
+ } else {
+ restore_inputs = true;
+
+ z_lgfi(Rix, len>>log_min_vcnt); // loop count; same value as 'iterations' because nprocessed was 0 when that was computed.
+ bind(VectorLoop);
+ z_vlm(Z_V20, Z_V21, 0, Rsrc); // get next 32 characters (single-byte)
+ add2reg(Rsrc, min_vcnt);
+
+ z_vuplhb(Z_V22, Z_V20); // V2 <- (expand) V0(high)
+ z_vupllb(Z_V23, Z_V20); // V3 <- (expand) V0(low)
+ z_vuplhb(Z_V24, Z_V21); // V4 <- (expand) V1(high)
+ z_vupllb(Z_V25, Z_V21); // V5 <- (expand) V1(low)
+ z_vstm(Z_V22, Z_V25, 0, Rdst); // store next 32 chars (4 vector registers = 64 bytes)
+ add2reg(Rdst, min_vcnt*2);
+
+ z_brct(Rix, VectorLoop);
+ }
+ }
+
+ if (((len-nprocessed) >= 16) && VM_Version::has_VectorFacility()) {
+ const int min_vcnt = 16; // Minimum #characters required to use vector instructions.
+ // Otherwise just do nothing in vector mode.
+ // Must be multiple of vector register length (16 bytes = 128 bits).
+ const int log_min_vcnt = exact_log2(min_vcnt);
+ const int iterations = (len - nprocessed) >> log_min_vcnt;
+ nprocessed += iterations << log_min_vcnt;
+ assert(iterations == 1, "must be!"); // fewer than 32 chars are left after the 32-char vector stage above.
+
+ z_vl(Z_V20, 0+src_off, Z_R0, Rsrc); // get next 16 characters (single-byte)
+ z_vuplhb(Z_V22, Z_V20); // V2 <- (expand) V0(high)
+ z_vupllb(Z_V23, Z_V20); // V3 <- (expand) V0(low)
+ z_vstm(Z_V22, Z_V23, 0+dst_off, Rdst); // store next 32 bytes
+
+ src_off += min_vcnt;
+ dst_off += min_vcnt*2;
+ }
+
+ if ((len-nprocessed) > 8) { // exactly 8 remaining chars are left to the switch below.
+ const int min_cnt = 8; // Minimum #characters required to use unrolled scalar loop.
+ // Otherwise just do nothing in unrolled scalar mode.
+ // Must be multiple of 8.
+ const int log_min_cnt = exact_log2(min_cnt);
+ const int iterations = (len - nprocessed) >> log_min_cnt;
+ nprocessed += iterations << log_min_cnt;
+
+ //---< avoid loop overhead/ptr increment for small # iterations >---
+ if (iterations <= 2) {
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ workreg_clear = true;
+
+ z_icmh(Z_R0, 5, 0+src_off, Rsrc); // mask 5: insert two source bytes into bytes 1 and 3 of the register half.
+ z_icmh(Z_R1, 5, 4+src_off, Rsrc);
+ z_icm(Z_R0, 5, 2+src_off, Rsrc);
+ z_icm(Z_R1, 5, 6+src_off, Rsrc);
+ z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
+
+ src_off += min_cnt;
+ dst_off += min_cnt*2;
+ }
+
+ if (iterations == 2) { // second straight-line copy. The inserts overwrite the same bytes, no need to clear again.
+ z_icmh(Z_R0, 5, 0+src_off, Rsrc);
+ z_icmh(Z_R1, 5, 4+src_off, Rsrc);
+ z_icm(Z_R0, 5, 2+src_off, Rsrc);
+ z_icm(Z_R1, 5, 6+src_off, Rsrc);
+ z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
+
+ src_off += min_cnt;
+ dst_off += min_cnt*2;
+ }
+
+ if (iterations > 2) {
+ Label UnrolledLoop;
+ restore_inputs = true;
+
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ workreg_clear = true;
+
+ z_lgfi(Rix, iterations);
+ bind(UnrolledLoop);
+ z_icmh(Z_R0, 5, 0, Rsrc);
+ z_icmh(Z_R1, 5, 4, Rsrc);
+ z_icm(Z_R0, 5, 2, Rsrc);
+ z_icm(Z_R1, 5, 6, Rsrc);
+ add2reg(Rsrc, min_cnt);
+
+ z_stmg(Z_R0, Z_R1, 0, Rdst);
+ add2reg(Rdst, min_cnt*2);
+
+ z_brct(Rix, UnrolledLoop);
+ }
+ }
+
+ if ((len-nprocessed) > 0) { // 1..8 characters left: straight-line code, selected at code generation time.
+ switch (len-nprocessed) {
+ case 8:
+ if (!workreg_clear) {
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ }
+ z_icmh(Z_R0, 5, 0+src_off, Rsrc);
+ z_icmh(Z_R1, 5, 4+src_off, Rsrc);
+ z_icm(Z_R0, 5, 2+src_off, Rsrc);
+ z_icm(Z_R1, 5, 6+src_off, Rsrc);
+ z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
+ break;
+ case 7:
+ if (!workreg_clear) {
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ }
+ clear_reg(Rix);
+ z_icm(Z_R0, 5, 0+src_off, Rsrc);
+ z_icm(Z_R1, 5, 2+src_off, Rsrc);
+ z_icm(Rix, 5, 4+src_off, Rsrc);
+ z_stm(Z_R0, Z_R1, 0+dst_off, Rdst); // chars 0..3
+ z_llc(Z_R0, 6+src_off, Z_R0, Rsrc); // Z_R0 is free again after the store above.
+ z_st(Rix, 8+dst_off, Z_R0, Rdst); // chars 4..5
+ z_sth(Z_R0, 12+dst_off, Z_R0, Rdst); // char 6
+ break;
+ case 6:
+ if (!workreg_clear) {
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ }
+ clear_reg(Rix);
+ z_icm(Z_R0, 5, 0+src_off, Rsrc);
+ z_icm(Z_R1, 5, 2+src_off, Rsrc);
+ z_icm(Rix, 5, 4+src_off, Rsrc);
+ z_stm(Z_R0, Z_R1, 0+dst_off, Rdst); // chars 0..3
+ z_st(Rix, 8+dst_off, Z_R0, Rdst); // chars 4..5
+ break;
+ case 5:
+ if (!workreg_clear) {
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ }
+ z_icm(Z_R0, 5, 0+src_off, Rsrc);
+ z_icm(Z_R1, 5, 2+src_off, Rsrc);
+ z_llc(Rix, 4+src_off, Z_R0, Rsrc);
+ z_stm(Z_R0, Z_R1, 0+dst_off, Rdst); // chars 0..3
+ z_sth(Rix, 8+dst_off, Z_R0, Rdst); // char 4
+ break;
+ case 4:
+ if (!workreg_clear) {
+ clear_reg(Z_R0);
+ clear_reg(Z_R1);
+ }
+ z_icm(Z_R0, 5, 0+src_off, Rsrc);
+ z_icm(Z_R1, 5, 2+src_off, Rsrc);
+ z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
+ break;
+ case 3:
+ if (!workreg_clear) {
+ clear_reg(Z_R0); // Z_R1 is fully defined by the zero-extending llc below.
+ }
+ z_llc(Z_R1, 2+src_off, Z_R0, Rsrc);
+ z_icm(Z_R0, 5, 0+src_off, Rsrc);
+ z_sth(Z_R1, 4+dst_off, Z_R0, Rdst);
+ z_st(Z_R0, 0+dst_off, Rdst);
+ break;
+ case 2: // llc zero-extends, no need for cleared work registers.
+ z_llc(Z_R0, 0+src_off, Z_R0, Rsrc);
+ z_llc(Z_R1, 1+src_off, Z_R0, Rsrc);
+ z_sth(Z_R0, 0+dst_off, Z_R0, Rdst);
+ z_sth(Z_R1, 2+dst_off, Z_R0, Rdst);
+ break;
+ case 1:
+ z_llc(Z_R0, 0+src_off, Z_R0, Rsrc);
+ z_sth(Z_R0, 0+dst_off, Z_R0, Rdst);
+ break;
+ default:
+ guarantee(false, "Impossible");
+ break;
+ }
+ src_off += len-nprocessed;
+ dst_off += (len-nprocessed)*2;
+ nprocessed = len;
+ }
+
+ //---< restore modified input registers >---
+ // Only the generated loops advance the ptrs. The part covered by src_off/dst_off was addressed via displacement.
+ if ((nprocessed > 0) && restore_inputs) {
+ z_agfi(Rsrc, -(nprocessed-src_off));
+ if (nprocessed < 1000000000) { // avoid int overflow
+ z_agfi(Rdst, -(nprocessed*2-dst_off));
+ } else {
+ z_agfi(Rdst, -(nprocessed-dst_off)); // same total adjustment, split into two steps.
+ z_agfi(Rdst, -nprocessed);
+ }
+ }
+ BLOCK_COMMENT("} string_inflate_const");
return offset() - block_start; // offset() delta across the emitted block (presumably the code size in bytes).
}
// Kills src.
unsigned int MacroAssembler::has_negatives(Register result, Register src, Register cnt,
< prev index next >