;==============================================================================
; The JVM will expect to find this file in one of two places, depending on
; which kind of build you're doing:
;
;   jdk/lib/amd64/boilerplate.ll
;   jre/lib/amd64/boilerplate.ll
;
; The build system's install step copies it to the right location.
;


;==============================================================================
; This file is intended to hold "boilerplate" hand-written IR for various
; IR abstractions and built-in functions.
;
; These functions will be inlined and then removed from the Module before
; code-gen occurs.  See the EnableLateInlining pass for more details.
;
; It is expected that abstraction Functions in this file will have at least
; the attributes of { noinline "azul-late-inline"="1" }.  The value of the
; azul-late-inline attribute indicates which round of inline enabling will
; catch the function in question.  As of now, we expect this sequence of
; LLVM passes:
;   Inlining and iterative optimization
;   Late Inline "0"
;   Safepoint Placement, Prune Deopt States
;   Late Inline "1"
;   Standard LLVM optimizations
;   Late Inline "2"
;   Late placement of GC barriers and safepoints
;   Barrier optimizations
;   EnableLateInlining for GC and safepoint abstraction functions
;   Cleanup optimizations post barrier insertion
;
; As such, Java bytecode abstractions (JBAs) should be
; "azul-late-inline"="0" - anything used by the parser as a helper function
;    (i.e. the name of the JBA doesn't affect optimization)
; "azul-late-inline"="1" - lock and allocation JBAs, (i.e. things
;    which we want to optimize post deopt state pruning)
; "azul-late-inline"="2" - JBA with special optimization semantics
;    (i.e. get_klass_id or call resolution JBAs)
; "azul-late-inline"="3" - GC/Safepoint abstractions and anything which
;    breaks the opaque reference abstraction


;==============================================================================
;; Specify the target data layout.  From the llvm LangRef, only the
;; default value for endianness (llvm defaults to big-endian) doesn't
;; match what we build for.  The other components for datalayout
;; (pointer size, pointer alignment etc.) have default values that
;; match our host except for compressed oops, which are represented as addrspace
;; ni:2 and p2 with 32 bit values (rather than 64 bit).
target datalayout = "e-m:e-ni:1-p2:32:8:8:32-ni:2"

;==============================================================================
; Forward declare these *constants* as global variables.  The VM will rewrite
; these declarations to include a constant initializer and be private linkage.
;

; Do we build AARCH64?
@FalconAARCH64Build = external global i1

; Byte offset into an array oop to find the i32 array length field:
@arrayOopDesc.length_offset_in_bytes = external global i32
@compressed_oops_encoding_mask_ptr = external global ptr

@oopDesc.headerSize = external global i32

;
; Header size of the int[]. In theory this is not required, as the compiler
; should be able to constant fold the same value from the exact known type,
; but we chose to save a bit of compile time here.
@arrayOopDesc.intArrayHeaderSize = external global i32
; Header size of the char[]. Same story as with int[].
@arrayOopDesc.charArrayHeaderSize = external global i32
; Header size of the byte[].
@arrayOopDesc.byteArrayHeaderSize = external global i32
; Header size of the long[]. Same story as with int[].
@arrayOopDesc.longArrayHeaderSize = external global i32
;
; Byte offset into an objArray oop to find the element kid field
@objArrayOopDesc.ekid_offset_in_bytes = external global i32
;
; Byte offset into a method oop that holds the from_compiled_entry offset.
@methodOopDesc.from_compiled_entry_offset_in_bytes = external global i32
;
; Constants used to read the klass id off an oop
@markWord.kid_shift = external global i64
@markWord.kid_mask = external global i64
@markHalfWord.kid_shift = external global i32
@markHalfWord.kid_mask = external global i32
@markHalfWord.kid_offset_bytes = external global i32

; Constants used in the hash code computation logic.
@markWord.preheader_mask_in_place = external global i64
@markWord.hashcode_mask_in_place = external global i64
@HashCode.addr_shift = external global i64
@HashCode.addr_hash_mask = external global i64
@OptimizeIdentityHashForDistribution = external global i1
@UseTaggedAddressForJavaHeap = external global i1
@GPGC_Layout.page_sector_shift = external global i64
@GPGC_Layout.SectorMask = external global i64
@GPGC_Layout.naked_page_bits = external global i64
@GPGC_Layout.naked_page_mask = external global i64
@LogBytesPerGPGCPage = external global i64
@LogBytesPerGPGCPageInfo = external global i64
@GPGC_PageInfo.page_info_base = external global ptr
@GPGC_PageInfo.id_hash_seed_offset_in_bytes = external global i64
@FalconCASObjectLazyLVB = external global i1
@GPGC_PageInfo.cards_base_addr_offset_in_bytes = external global i64
@GPGC_Layout.mid_space_min_object_size_words = external global i64
@FalconNewInstanceVersion = external global i32
@FalconTLABPrefetchDistance = external global i32
@FalconTLABUseFastStosb = external global i1
@FalconTLABMinSizeForFastStosb = external global i64

;
; Constants used by resolve_virtual:
;  @azul.vtable_begin_klass_offset_bytes = private unnamed_addr constant i64 N
;  @azul.vtable_per_entry_size_bytes = private unnamed_addr constant i64 N
; 
;  These aren't "pure" constants (i.e. a direct offset or value from
;  the JVM side) -- they are pre-calculated in dolphinAbstractions.cpp
;  to keep the IR simple.
@azul.vtable_begin_klass_offset_bytes = external global i64
@azul.vtable_per_entry_size_bytes = external global i64

; Number of bytes in a word.  Always 8 for us, but it can help clarify
; intent.
@azul.BytesInAWord = external global i32

;
; The base of the klass table:
@klassTable.klassTableBase = external global ptr
; Well known common entries to the klass table
@klassTable.java_lang_object_class = external global ptr addrspace(1)
; Compressed version of klass oop 
@klassTable.java_lang_object_class_coop = external global ptr addrspace(2)

@java_lang_Class.oop = external global ptr addrspace(1)

@UseCompressedOops.flag = external global i1


@UseLVBs = external global i1
;
; Constants related to card marking
@LogWordsPerGPGCCardMarkSummaryWord = external global i64
@LogBytesPerHeapOop = external global i64
;
; lvb trap mask is a global constant with late resolution.
; IMPORTANT! This LVB trap mask variable is used in orca for aliasing decisions 
; which is involved in correctness of the code generated. 
@lvbTrapMask = external global i64

; These are global constants with late resolution. We don't embed
; raw addresses in the IR so as to avoid run to run variation, which
; hurts compile stashing.
; Note the collector state might change at any moment!
; Some important states are set/unset only at a safepoint, so
; one can rely only on those, and only if there is no safepoint
; between reading the collector state and comparing it with the target one.
; Any action specific to a collector state must complete before
; a safepoint. This is achievable by keeping the logic within an
; "azul-late-inline"="4" abstraction.
@GPGC_NewCollector.collection_state_addr = external global i8
@GPGC_OldCollector.collection_state_addr = external global i8

; Important collector states that are only set/unset at safepoint
@GPGC_Collector.ConcurrentMarking       = external global i8
@GPGC_Collector.ConcurrentRefProcessing = external global i8

;
; From klass.hpp:
;   offsets include the oop header, so they can be used to index
;   directly into an oop.
;
@klassOopDesc.access_flags_offset_bytes = external global i32
@klassOopDesc.modifier_flags_offset_in_bytes = external global i32
@klassOopDesc.super_check_offset_offset_bytes = external global i32
@klassOopDesc.secondary_super_kid_cache_offset_bytes = external global i32
@klassOopDesc.layout_helper_offset_in_bytes = external global i32
@klassOopDesc.java_mirror_offset_in_bytes = external global i32
@klassOopDesc.klassId_offset_in_bytes = external global i32
@klassOopDesc.super_offset_in_bytes = external global i32
@klassOopDesc.trace_id_offset_in_bytes = external global i32
@arrayKlassOopDesc.component_mirror_offset_in_bytes = external global i32
@Klass.layout_helper_needs_slow_path_bit = external global i32
@Klass.layout_helper_size_in_bytes_mask = external global i32
@Klass.object_array_layout_helper = external global i32
@Klass.layout_helper_neutral_value = external global i32
@Klass.layout_helper_log2_element_size_mask = external global i32
@Klass.layout_helper_log2_element_size_shift = external global i32
@Klass.layout_helper_header_size_mask = external global i32
@Klass.layout_helper_header_size_shift = external global i32
@Klass.layout_helper_element_type_mask = external global i32
@Klass.layout_helper_element_type_shift = external global i32

@Thread.reversible_tid_offset_bytes = external global i32
@Thread.self_offset_bytes = external global i32
@Thread.tlab_top_offset_bytes = external global i32
@Thread.tlab_zend_offset_bytes = external global i32
@Thread.tlab_end_offset_bytes = external global i32
@Thread.tlab_zregion_size_bytes = external global i32
@Thread.scoped_value_cache_offset_bytes = external global i32
@Thread.current_thread_offset_bytes = external global i32
@Thread.current_carrier_thread_offset_bytes = external global i32

@Thread.osthread_offset_bytes = external global i32
@Thread.pthread_id_offset_bytes = external global i32
@Thread.interrupted_offset_bytes = external global i32

@Thread.park_permit_offset_bytes = external global i32
@java.lang.Thread.eetop_offset_bytes = external global i32
@Thread.doing_unsafe_access_offset_bytes = external global i32
@Thread.is_disable_suspend_offset_bytes = external global i32

@Thread.held_monitor_count_offset_bytes = external global i32

@oopDesc.lock_info_offset_in_bytes = external global i32
@markWord.rec_mask_in_place = external global i32
@markWord.rec_shift = external global i32
@markWord.dont_care_bit_poison_xor = external global i32
@markWord.tid_mask_in_place = external global i32
@MonitorEnterFastPath.flag = external global i1
@MonitorExitFastPath.flag = external global i1
@SupportHeldMonitorCount = external global i1
@NewInstanceFastPath.flag = external global i1
@NewArrayFastPath.flag = external global i1
@MinObjAlignmentInBytes = external global i32
@UseSafepointsInCopy = external global i1
@UseSafepointsInCopyInliningIntrinsics = external global i1
@MemoryOpsChunkSizeInBytes = external global i64
@UseFastNanoTime.flag = external global i1

@klassOopDesc.init_state_offset_in_bytes = external global i32
@instanceKlass.fully_initialized = external global i32
@instanceKlass.vtable_start_offset_in_bytes = external global i32
@instanceKlass.vtable_length_offset_in_bytes = external global i32
@instanceKlass.itable_length_offset_in_bytes = external global i32
@itableOffsetEntry.interface_offset_in_bytes = external global i32
@itableOffsetEntry.size_bytes = external global i32
@itableOffsetEntry.offset_offset_in_bytes = external global i32
@itableMethodEntry.size_in_bytes = external global i32
@itableMethodEntry.method_offset_in_bytes = external global i32

@java_lang_ref_Reference.referent_offset_in_bytes = external global i32

@java_lang_Class.klass_offset = external global i32
@java_lang_Class.array_klass_offset = external global i32

@java_lang_invoke_MethodHandle.form_offset_in_bytes = external global i32
@java_lang_invoke_LambdaForm.vmentry_offset_in_bytes = external global i32

; valid for jdk8
@java_lang_invoke_MemberName.vmtarget_offset_in_bytes = external global i32
; valid for jdk11+
@java_lang_invoke_MemberName.method_offset_in_bytes = external global i32
@java_lang_invoke_ResolvedMethodName.vmtarget_offset_in_bytes = external global i32

@java_lang_invoke_MemberName.vmindex_offset_in_bytes = external global i32
@java_lang_invoke_MemberName.clazz_offset_in_bytes = external global i32
@ResolvedMethodTableSupported = external global i1

; This is NOT a constant.
; This global is used to ensure a use of the fake_vm_state calls before
; safepoint insertion.  This is required to prevent them being optimized
; away if we mark them as ReadOnly - which we really really want to do
; from a performance perspective.  This global is external so that when
; linking in function bodies for inlining, the linker does not create
; copies of this global to resolve collisions.  Since there is no possibility
; of renamed copies of this variable, we can refer to the anchor by name.
; We purposely do not provide a definition to ensure an error is
; generated if any uses of the meta variable remain after optimization.
@llvm.jvmstate_anchor = external global i32

; Values for the java integer types
@JIntMin = external global i32
@JIntMax = external global i32

@JLongMin = external global i64
@JLongMax = external global i64

; Pre-calculated table with crc values for all possible 8-bit values.
@StubRoutines.CRCTable = external constant [256 x i32], align 4

@AccessFlags_JVM_ACC_INTERFACE_Mask = external global i32
@AccessFlags_JVM_ACC_ABSTRACT_Mask = external global i32
@Primitive_ModifierFlags = external global i32

@AccessFlags_JVM_ACC_IS_HIDDEN_CLASS_Mask = external global i32

@java_lang_String.coder_offset_in_bytes = external global i32
@java_lang_String.value_offset_in_bytes = external global i32

@JDK_VERSION_MAJOR = external global i32
@CompactStrings = external global i1

@DeoptReasons.Reason_unhandled = external global i32

@FalconUseEmptyProfileBoolean = external global i1

@FalconInlineConcurrentGetRefFastPath = external global i1

@UseTrueObjectsForUnsafe = external global i1

@FalconUseOptimizedReallocInPlace = external global i1

@FalconUseInvariantKlassLoad = external global i1

; Whether we should partially inline _vectorizedMismatch using masked ld/st
@FalconPartialInlineVectorizedMismatch = external global i1

@fillerObjKlass.kid = external global i32
@fillerObjKlass.size_bits_mask = external global i64

@sun_security_provider_DigestBase.blockSize_offset_in_bytes = external global i32

; JFR
@JfrTraceIdEpoch.epoch = external global i8
@JfrTraceIdEpoch.tag_state = external global i8
@VOID_TYPE_ID = external global i64

; We create branch-weight metadata to tell the compiler that the LVB slow path
; should be treated as very rarely taken.  This is modeled as a triggered LVB
; happening one out of a million times.
!0 = !{ !"branch_weights", i32 1, i32 1000000 }

; A generic 'true' node, used for the inttoptr instruction that creates oops.
!1 = !{i1 true}


; This is part of the TBAA structure defined in DolphinBuilder.  The root name
; must be the same to merge the two.  We only duplicate the parts we actually
; need here.
!2 = !{!"dolphin-tbaa-access-type"}
!3 = !{!"array_length_access", !2, i64 1}
!4 = !{!"array_ekid_access", !2, i64 1}
!13 = !{!"thread_reversible_tid_access", !2, i64 1}

!14 = !{!"mark_word_access", !2}
!15 = !{!"preheader_access", !2}

!19 = !{!"klass_access_flags_access", !2}
!21 = !{!"itable_access", !2, i64 1}
!22 = !{!"sub_cached_superkid_access", !2}
!23 = !{!"super_check_offset_access", !2}
!26 = !{!"layout_helper_access", !2, i64 1}
!27 = !{!"klass_state_access", !2}

!24 = !{!"klass_oop_klass_id_access", !2, i64 1}
!17 = !{!"from_compiled_entry_access", !2}
 
; This metadata is used in !llvm.loop to force runtime unrolling
; in presence of multiple exits in loop
!28 = !{!28, !29}
!29 = !{!"llvm.loop.unroll.runtime.multi.exit.enable"}

!30 = !{!31, !31, i64 0}
!31 = !{!"klassTable_access", !2}
declare i64 @llvm.x86.bmi.pext.64(i64, i64)
declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
declare i32 @llvm.x86.bmi.pext.32(i32, i32)
declare i32 @llvm.x86.bmi.pdep.32(i32, i32)

; Conversions between the two oop pointer representations used in this file:
; addrspace(1) pointers and the addrspace(2) pointers that the datalayout
; comment near the top of the file describes as 32-bit compressed oops.
; @azul.compress maps addrspace(1) -> addrspace(2); @azul.uncompress maps
; addrspace(2) -> addrspace(1).
; Only declarations are visible here; presumably the bodies are supplied
; elsewhere before late inlining -- TODO confirm.
declare ptr addrspace(2) @azul.compress(ptr addrspace(1))
  noinline readnone nounwind speculatable "azul-late-inline"="4" willreturn "gc-leaf-function" "has-latent-use"

declare ptr addrspace(1) @azul.uncompress(ptr addrspace(2) %coop)
  noinline readnone nounwind speculatable "azul-late-inline"="4" willreturn "gc-leaf-function" "has-latent-use"

; Definition generated by dolphinAbstractions
declare noundef ptr addrspace(1) @azul.poison.oop(ptr addrspace(1) noundef %oop)
    noinline readnone nounwind "azul-late-inline"="4" willreturn "gc-leaf-function" "has-latent-use"
declare noundef ptr addrspace(1) @azul.unpoison.oop(ptr addrspace(1) noundef %oop)
    noinline readnone nounwind "azul-late-inline"="4" willreturn "gc-leaf-function" "has-latent-use"

; Indicates cold paths where inserting a deopt might be profitable.
; Used to postpone speculation decision until Orca.
declare zing_uncommon_trap void @azul.maybe_deoptimize(...) nounwind "consumes-vmstate" "has-latent-use"

;; These two both represent the lvb_trap_handler function, but are typed
;; differently to respect the c heap vs java heap distinction we need for
;; safepoint insertion
declare zing_stub_default ptr addrspace(1) @cHeapLvb.TrapFunc(ptr addrspace(1) %val, 
                                            ptr %addr)
    nounwind argmemonly willreturn "gc-leaf-function"

declare zing_stub_default ptr addrspace(1) @jHeapLvb.TrapFunc(ptr addrspace(1) %val,
                                            ptr addrspace(1) %addr)
    nounwind argmemonly willreturn "gc-leaf-function"

;; These two both represent the lvb_trap_handler_compressed_mode function, but are typed
;; differently to respect the c heap vs java heap distinction we need for
;; safepoint insertion
;; Also, we need these two extra variants for supporting compressed-oops LVB
;; slow path.
declare zing_stub_default ptr addrspace(2) @cHeapLvb.coop.TrapFunc(ptr addrspace(2) %val, 
                                            ptr %addr)
    nounwind argmemonly willreturn "gc-leaf-function"

declare zing_stub_default ptr addrspace(2) @jHeapLvb.coop.TrapFunc(ptr addrspace(2) %val,
                                            ptr addrspace(1) %addr)
    nounwind argmemonly willreturn "gc-leaf-function"

declare void @"DolphinRuntime::verify_possibly_out_of_phase_oop"(ptr addrspace(1))
    nounwind willreturn "gc-leaf-function"

; Does a LVB test depending on the mode represented by @GPGCUseDensePhaseEncoding constant.
declare noundef i1 @LvbTest(i64 noundef %value)
    "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind

; This is a helper for implementing Java heap based LVBs.  It does not have "LVB
; semantics" (the optimizer does not optimize these as LVBs) and is merely a
; code sharing construct.
;
; Don't call this function directly!  Instead use one of the higher level
; abstractions built using this helper.
;
; The definition of this function is generated by dolphinAbstractions
declare ptr addrspace(1) @azul.jHeapLVBHelper(ptr addrspace(1) %value , ptr addrspace(1) %addr)
    alwaysinline nounwind willreturn "gc-leaf-function" "azul-late-inline"="0"

; See dolphinAbstractions.cpp for the definitions of cHeapLvb & jHeapLvb

declare ptr addrspace(1) @jHeapLvb(ptr addrspace(1) %0, ptr addrspace(1) %1)
    nounwind "gc-leaf-function" willreturn noinline "has-latent-use"

; Heal address %addr in place to contain an in-phase oop that is semantically
; equivalent to what it contains currently.  After this call, %addr is
; guaranteed to contain a fixed up oop till the next checkpoint / safepoint.
;
; This function does _not_ have LVB semantics, and can be used as a "side
; effect" that heals *%addr.
define void @azul.healJHeapLocation(ptr addrspace(1) %addr)
      nounwind noinline "azul-late-inline"="4" willreturn "gc-leaf-function" {
entry:
  ; Fetch the reference currently stored at %addr ...
  %current.ref = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %addr)
  ; ... and hand it, together with its location, to the Java-heap LVB helper.
  ; This function returns void, so the oop the helper returns is deliberately
  ; left unused.
  %helper.result.ignored = call ptr addrspace(1) @azul.jHeapLVBHelper(ptr addrspace(1) %current.ref,
                                                                     ptr addrspace(1) %addr)
  ret void
}

; Return the '%bit.offset'th bit in memory starting from address '%base'.
; This helper routine is structured so that given a base address and a 
; bit.offset derived from the address used by an SVB, we should end up
; with a code sequence which looks like the following inlined into the 
; caller (i.e. the svb routine):
;   mov8      %r14, 0x3ffffffffff80 
;   mov8      %rdx, %rcx             <-- Copy address
;   shr8      %rdx, 14               <-- PageNum * BytesPerPageInfo
;   and8      %rdx, %r14             <-- PageInfo offset from base
;   mov8i     %rbp, 0x580040000000   <-- PageInfo base
;   ld8       %r8, (%rdx*1+%rbp+80)  <-- Card base address (also block base)
;   mov8      %r9, %r11              <-- Copy address
;   sub8      %r9, %r8               <-- Offset from base of block
;   shr8i     %rdx, 9                <-- byte index into bitmap
;   ld8       %rdx, (%rdx*8+r8)      <-- byte from bitmap
;   btx8      %rdx, %r9
;   jae       .LBB10_1 <-- use of CF
declare i1 @bittest(ptr %base, i64 %bit.offset)
    "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind

; Perform the generation test needed for an SVB.  This is extracted out
; for readability and optimization.
; %object is the one we're storing into.  %value is what we're storing.
declare i1 @azul.stores_new_to_old(ptr addrspace(1) %object,
                                   ptr addrspace(1) %value)
    nounwind noinline "azul-late-inline"="4" willreturn "gc-leaf-function"

; We do not emit this from the frontend, but the optimizer will
; replace calls to @azul.stores_new_to_old with calls to this function
; when legal.
declare i1 @azul.is_old_gen(ptr addrspace(1) %oop)
    readnone nounwind noinline "azul-late-inline"="4" willreturn "gc-leaf-function" "has-latent-use"

; Utility that conditionally bumps a 64-bit counter held in memory.
;
; Parameters:
;   %flag:   when false, the function does nothing.
;   %addr:   address of the i64 counter to update.
;   %amount: value added to the counter.
;
; The update is an unordered atomic load followed by an unordered atomic
; store; the read-modify-write as a whole is not a single atomic operation.
define void @increment_counter(i1 %flag, ptr %addr, i64 %amount)
    nounwind "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" {
entry:
  br i1 %flag, label %bump, label %done

bump:
  %count.old = load atomic i64, ptr %addr unordered, align 8
  %count.new = add i64 %count.old, %amount
  store atomic i64 %count.new, ptr %addr unordered, align 8
  br label %done

done:
  ret void
}
;=============================================================================
;
; This is a Java bytecode abstraction for capturing how a stored-value-barrier
; (SVB) is performed on an oop stored into the jHeap.  This implementation is
; for GPGC.
;
; %value is the oop being stored.  %addr is the address being stored to.  
; %object is an address %addr is based on.  This might be the base of the 
; object or it might simply be some other interior derived pointer.  We don't
; allow it to be an exterior derived pointer because it could in theory cross
; the generation boundary.
;
; The last argument is an object size. It is zero if the size is unknown.
declare ptr addrspace(1) @svb(ptr addrspace(1) %value, 
                             ptr addrspace(1) %addr,
                             ptr addrspace(1) %object,
                             i32 %objectSize)
    nounwind noinline "azul-late-inline"="3" willreturn "gc-leaf-function" "has-latent-use"

; Helper method used in @svb. If we ever figure out a way to do this without
; inline asm, then just remove this and add the appropriate IR to the 
; generator of @svb
; Atomically sets bit number %bit.index in the bitmap that starts at %addr.
;
; Parameters:
;   %bit.index: index of the bit to set, counted from %addr.
;   %addr:      base address of the bitmap.
;
; The implementation is chosen from the @FalconAARCH64Build flag: a locked
; btsq via inline asm on x86_64, an atomicrmw or on the containing 64-bit
; word on AArch64.
define void @AtomicBtsq(i64 %bit.index, ptr %addr) 
  alwaysinline nounwind willreturn "gc-leaf-function" {
entry:
  ; Pick the target-specific implementation.
  %isAARCH64Build = load i1, ptr @FalconAARCH64Build
  br i1 %isAARCH64Build, label %AARCH64, label %X86_64

X86_64:
  ; NOTE(review): the inline asm constraint string was taken from what clang
  ;       produced and has not been independently validated -- confirm before
  ;       relying on it or changing it.
  ; TODO-PERF: It would be nice to represent this using an atomicrmw on the
  ; containing word/byte.  Such a representation does not appear to lower
  ; properly today, but this could be taught to the LLVM x86 backend.
  call void asm sideeffect "lock btsq $0,($1)", "r,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i64 %bit.index, ptr %addr)
       nounwind willreturn "gc-leaf-function"
  ret void

AARCH64:
  ; This implementation is taken from StubGenerator::gen_svb_card_mark().
  ;  On x86_64 the btsq instruction can process an arbitrary bit index,
  ;  correctly handling a long bit mask.  With an atomicrmw or, the bit index
  ;  must be less than 64, so the index is split into a word index and a bit
  ;  position within that word.
  ; %bit.index / 64 selects the 64-bit word that holds the bit.
  %word.index.i = lshr i64 %bit.index, 6
  %addr.i = getelementptr i64, ptr %addr, i64 %word.index.i
  ; Truncating to i6 keeps %bit.index modulo 64: the bit position in the word.
  %bit.index.trunc = trunc i64 %bit.index to i6
  %bit.index.i64.i = zext i6 %bit.index.trunc to i64
  %single_bit_mask = shl i64 1, %bit.index.i64.i
  atomicrmw or ptr %addr.i, i64 %single_bit_mask acq_rel, align 8
  ret void
}


;==============================================================================
;
; This is a Java bytecode abstraction for capturing how an array length check
; is performed.
;
; Parameters:
;
;   %arrayOop: A pointer to a non-null java array object, of any element type.
;
; Return:
;
;   The length of the array, as an i32.
;
; Function Attrs: alwaysinline readonly "azul-late-inline"="0"
;
define noundef i32 @azul.array_length(ptr addrspace(1) noundef %arrayOop)
    nounwind readonly alwaysinline "azul-late-inline"="0" willreturn "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ; Byte offset of the i32 length field inside an array oop.
  %len.off = load i32, ptr @arrayOopDesc.length_offset_in_bytes
  %len.addr = getelementptr inbounds i8, ptr addrspace(1) %arrayOop, i32 %len.off
  ; The length is read with !invariant.load.  That will become problematic if
  ; the array allocation is ever inlined; see Zilla 3300 for background and
  ; the proposed fix.
  %len = load atomic i32, ptr addrspace(1) %len.addr unordered, align 4, !tbaa !3, !range !5, !invariant.load !{}, !noundef !{}
  ret i32 %len
}

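; The half-open range [0; SINT_MAX - 1) is used for array length: LLVM !range
; metadata excludes its upper bound, so the largest length permitted here is
; SINT_MAX - 2.
; 2147483646 = (2^31) - 1 - 1.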
!5 = !{i32 0, i32 2147483646}

;==============================================================================
;
; This is a Java bytecode abstraction for extracting the kid of the element
; type stored in a reference array
;
; Parameters:
;
;   %arrayOop: A pointer to a non-null java object array object
;
; Return:
;
;   The kid of the element type, as an i32
;
; Function Attrs: alwaysinline readonly "azul-late-inline"="0"
;
define i32 @azul.objarray_element_kid(ptr addrspace(1) %arrayOop)
    nounwind alwaysinline readonly "azul-late-inline"="0" willreturn "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ; Byte offset of the element-kid field inside an objArray oop.
  %elem.kid.off = load i32, ptr @objArrayOopDesc.ekid_offset_in_bytes
  %elem.kid.addr = getelementptr inbounds i8, ptr addrspace(1) %arrayOop, i32 %elem.kid.off
  ; The element kid is read with !invariant.load.  That will become
  ; problematic if the array allocation is ever inlined; see Zilla 3300 for
  ; background and the proposed fix.
  %elem.kid = load atomic i32, ptr addrspace(1) %elem.kid.addr unordered, align 4, !tbaa !4, !invariant.load !{}
  ret i32 %elem.kid
}

; Loads the from_compiled_entry pointer out of the given method oop.
define ptr @azul.get_compiled_entry(ptr addrspace(1) %methodOop)
      nounwind noinline readonly argmemonly "azul-late-inline"="2" willreturn "gc-leaf-function" {
entry:
  ; Byte offset of the from_compiled_entry field inside a method oop.
  %entry.off = load i32, ptr @methodOopDesc.from_compiled_entry_offset_in_bytes
  %entry.addr = getelementptr inbounds i8, ptr addrspace(1) %methodOop, i32 %entry.off
  %entry.ptr = load atomic ptr, ptr addrspace(1) %entry.addr unordered, align 8, !tbaa !17
  ret ptr %entry.ptr
}

; Extracts the klass id (kid) of %oop from its mark half-word:
;   kid = (half_word >> kid_shift) & kid_mask
define noundef i32 @azul.get_klass_id(ptr addrspace(1) nocapture %oop)
      nounwind noinline readonly argmemonly "invariant-object-property"="0"
      "azul-late-inline"="2" willreturn "gc-leaf-function" "has-latent-use" {
entry:
  ; Constants locating the kid inside the mark half-word.
  %half.off = load i32, ptr @markHalfWord.kid_offset_bytes
  %shift = load i32, ptr @markHalfWord.kid_shift
  %mask = load i32, ptr @markHalfWord.kid_mask

  ; Read the 32-bit half-word of the header that carries the kid.
  %half.addr = getelementptr i8, ptr addrspace(1) %oop, i32 %half.off
  %half = load atomic i32, ptr addrspace(1) %half.addr unordered, align 4, !tbaa !14

  ; Shift the kid down, then mask off the remaining high bits.
  %half.shifted = lshr i32 %half, %shift
  %kid = and i32 %half.shifted, %mask
  ret i32 %kid
}

; These abstractions are defined in dolphinAbstractions. Various functions for
; loading an oop. All these are loads from java heap.
declare ptr addrspace(1) @azul.load_ref_p1_with_itable_tbaa(ptr addrspace(1))
         "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind
declare ptr addrspace(1) @azul.load_ref_p1_with_vtable_tbaa(ptr addrspace(1))
         "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind
declare ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1))
         "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind

; The following are abstractions for loading from c-heap.
declare ptr addrspace(1) @azul.load_ref_p0(ptr)
         "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind


;; Given a kid, return the corresponding klassOop
define ptr addrspace(1) @azul.load_klass(i32 %kid)
      nounwind alwaysinline readonly "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  ; Index the klass table, treated as an array of addrspace(1) pointers, by kid.
  %table = load ptr, ptr @klassTable.klassTableBase
  %slot = getelementptr inbounds ptr addrspace(1), ptr %table, i32 %kid

  ; The two paths below differ only in whether the table load is tagged
  ; !invariant.load, as selected by @FalconUseInvariantKlassLoad.
  %use.invariant = load i1, ptr @FalconUseInvariantKlassLoad
  br i1 %use.invariant, label %load.invariant, label %load.plain

load.invariant:
  %klass.inv = load atomic ptr addrspace(1), ptr %slot unordered, align 8, !tbaa !30, !invariant.load !{}, !azul.get_klass_by_kid !{}
  ret ptr addrspace(1) %klass.inv

load.plain:
  %klass.plain = load atomic ptr addrspace(1), ptr %slot unordered, align 8, !tbaa !30, !azul.get_klass_by_kid !{}
  ret ptr addrspace(1) %klass.plain
}

;; Given a kid, return the AccessFlags associated with that klass.  This is
;;  mainly here so it can be constant folded.
define i32 @azul.get_access_flags(i32 %kid)
      nounwind alwaysinline readonly "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  ; kid -> klassOop -> access flags; both steps are delegated to helpers.
  %klass = call ptr addrspace(1) @azul.load_klass(i32 %kid)
  %flags = call i32 @azul.get_access_flags_klass(ptr addrspace(1) %klass)
  ret i32 %flags
}

;; Given a klassOop, return the i32 AccessFlags value stored in it.
define i32 @azul.get_access_flags_klass(ptr addrspace(1) %klassOop)
      nounwind alwaysinline readonly "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  ; Byte offset of the access-flags field inside a klassOop.
  %flags.off = load i32, ptr @klassOopDesc.access_flags_offset_bytes
  %flags.addr = getelementptr inbounds i8, ptr addrspace(1) %klassOop, i32 %flags.off
  %flags = load atomic i32, ptr addrspace(1) %flags.addr unordered, align 4, !tbaa !19
  ret i32 %flags
}

; TODO: We currently cannot add the invariant-object-property on the receiver because we can LVB a non-oop as part of retrieving the caller (see ZVM-21564).
define ptr addrspace(1) @azul.resolve_virtual(ptr addrspace(1) %receiver, i64 %vtable_index)
    nounwind noinline readonly "azul-late-inline"="2" willreturn "gc-leaf-function" "foldable-function" {
entry:
  ; receiver -> kid -> klassOop
  %recv.kid = call i32 @azul.get_klass_id(ptr addrspace(1) %receiver)
  %recv.klass = call ptr addrspace(1) @azul.load_klass(i32 %recv.kid)

  ; Byte offset of the selected entry from the start of the klassOop:
  ;   vtable_index * per-entry size + offset of the first vtable entry
  %vtable.start = load i64, ptr @azul.vtable_begin_klass_offset_bytes
  %entry.size = load i64, ptr @azul.vtable_per_entry_size_bytes
  %index.scaled = mul nuw nsw i64 %vtable_index, %entry.size
  %entry.off = add nuw nsw i64 %index.scaled, %vtable.start

  ; Load the callee reference stored in that vtable slot.
  %entry.addr = getelementptr i8, ptr addrspace(1) %recv.klass, i64 %entry.off
  %target = call ptr addrspace(1) @azul.load_ref_p1_with_vtable_tbaa(ptr addrspace(1) %entry.addr)
  ret ptr addrspace(1) %target
}

;; Does an itable lookup.  iKlass_kid is the klass-id corresponding to iKlass.
;; iKlass is the interface that contains the method being invoked, and
;; itable_index is the index of the method in the vtable corresponding to
;; the interface.
;;
;; IMPORTANT: since this function loads oops, it needs to be inlined
;; (via azul-late-inline) before we transform loads to LVBs.
;;
;; Returns either a valid methodOop or null if the receiver's class
;; does not implement the given interface.
define ptr addrspace(1) @azul.resolve_interface(ptr addrspace(1) %receiver, i32 %iKlass_kid, i32 %itable_index)
    nounwind noinline readonly "azul-late-inline"="2" willreturn "gc-leaf-function"
    "foldable-function" "invariant-object-property"="0" {
 entry:
;; Read every layout constant (field offsets and entry sizes) from its global
;; up front, then resolve the receiver's klassOop.
  %vtable_start_offset_in_bytes = load i32, ptr @instanceKlass.vtable_start_offset_in_bytes
  %vtable_length_offset_in_bytes = load i32, ptr @instanceKlass.vtable_length_offset_in_bytes
  %itable_length_offset_in_bytes = load i32, ptr @instanceKlass.itable_length_offset_in_bytes
  %interface_offset_in_bytes = load i32, ptr @itableOffsetEntry.interface_offset_in_bytes
  %itable_offset_table_entry_size_bytes = load i32, ptr @itableOffsetEntry.size_bytes
  %offset_entry_vtable_offset_in_bytes = load i32, ptr @itableOffsetEntry.offset_offset_in_bytes
  %itable_method_entry_size_in_bytes = load i32, ptr @itableMethodEntry.size_in_bytes
  %itable_method_entry_offset_in_bytes = load i32, ptr @itableMethodEntry.method_offset_in_bytes
  %bytes_in_a_word = load i32, ptr @azul.BytesInAWord
  %receiver_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %receiver)
  %receiver_klass = call ptr addrspace(1) @azul.load_klass(i32 %receiver_kid)
  br label %check_itable_len

;; Normally last itable entry is an "end" marker. But if %receiver doesn't
;; implement any interface it has zero itable entries and as a result there is
;; no "end" marker. So we need to make sure itable is not empty to proceed...
check_itable_len:
  %itable_length_addr = getelementptr inbounds i8, ptr addrspace(1) %receiver_klass, i32 %itable_length_offset_in_bytes
  %itable_length_in_words = load atomic i32, ptr addrspace(1) %itable_length_addr unordered, align 4
  %is_zero_len = icmp eq i32 %itable_length_in_words, 0
  br i1 %is_zero_len, label %iklass_null, label %get_itable_start

get_itable_start:
;; Resolve the interface klassOop we are searching for, and compute the byte
;; length of the receiver's vtable (stored in words) so we can step over it.
  %iKlass = call ptr addrspace(1) @azul.load_klass(i32 %iKlass_kid)
  %vtable_start = getelementptr inbounds i8, ptr addrspace(1) %receiver_klass, i32 %vtable_start_offset_in_bytes
  %vtable_length_addr = getelementptr inbounds i8, ptr addrspace(1) %receiver_klass, i32 %vtable_length_offset_in_bytes
  %vtable_length_in_words = load atomic i32, ptr addrspace(1) %vtable_length_addr unordered, align 4
  %vtable_length_in_bytes = mul nsw i32 %vtable_length_in_words, %bytes_in_a_word

;; The klassItable is right after the klassVtable.
  %itable_start = getelementptr i8, ptr addrspace(1) %vtable_start, i32 %vtable_length_in_bytes
  br label %loop

;; Format of an itable
;;
;;    ---- offset table ---
;;    klassOop of interface 1             \_
;;    offset to vtable from start of oop  / offset table entry
;;    ...
;;    klassOop of interface n             \_
;;    offset to vtable from start of oop  / offset table entry
;;    --- vtable for interface 1 ---
;;    methodOop                           \_ method table entry
;;    ...
;;    methodOop                           \_ method table entry
;;    -- vtable for interface 2 ---
;;    ...

;; Linear scan over the offset table.  A null interface klassOop is the "end"
;; marker and means the receiver does not implement %iKlass; the !prof !7
;; weights mark that exit as the cold edge.
 loop:
  %offset_table_iterator = phi ptr addrspace(1) [ %itable_start, %get_itable_start ], [ %offset_table_next, %continue_after_null_check ]
  %offset_table_iklass_addr_i8 = getelementptr inbounds i8, ptr addrspace(1) %offset_table_iterator, i32 %interface_offset_in_bytes
  %offset_table_iklass = call ptr addrspace(1) @azul.load_ref_p1_with_itable_tbaa(ptr addrspace(1) %offset_table_iklass_addr_i8)
  %offset_table_iklass_is_null = icmp eq ptr addrspace(1) %offset_table_iklass, null
  br i1 %offset_table_iklass_is_null, label %iklass_null, label %continue_after_null_check, !prof !7

continue_after_null_check:
;; %offset_table_next is computed unconditionally but only consumed by the
;; phi when we go around the loop again.
  %offset_table_iklass_matches = icmp eq ptr addrspace(1) %offset_table_iklass, %iKlass
  %offset_table_next = getelementptr inbounds i8, ptr addrspace(1) %offset_table_iterator, i32 %itable_offset_table_entry_size_bytes
  br i1 %offset_table_iklass_matches, label %found_entry, label %loop

 found_entry:
;; %offset_table_iterator now points to a matching offset table entry.
;; The callee lives at:
;;   receiver_klass + vtable_offset
;;                  + itable_index * method_entry_size
;;                  + method_offset_within_entry
  %vtable_offset_addr = getelementptr inbounds i8, ptr addrspace(1) %offset_table_iterator, i32 %offset_entry_vtable_offset_in_bytes
  %vtable_offset = load atomic i32, ptr addrspace(1) %vtable_offset_addr unordered, align 4, !tbaa !21
  %vtable = getelementptr inbounds i8, ptr addrspace(1) %receiver_klass, i32 %vtable_offset
  %method_entry_offset_bytes = mul nsw i32 %itable_method_entry_size_in_bytes, %itable_index
  %method_table_entry = getelementptr inbounds i8, ptr addrspace(1) %vtable, i32 %method_entry_offset_bytes
  %callee_addr_i8 = getelementptr inbounds i8, ptr addrspace(1) %method_table_entry, i32 %itable_method_entry_offset_in_bytes
  %callee = call ptr addrspace(1) @azul.load_ref_p1_with_itable_tbaa(ptr addrspace(1) %callee_addr_i8)
  ret ptr addrspace(1) %callee

;; Reached both for an empty itable and for hitting the "end" marker.
 iklass_null:
  ret ptr addrspace(1) null
}

!7 = !{!"branch_weights", i32 1, i32 1048575}

; Load klassOop from java.lang.Class
define ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %java_mirror)
     "azul-late-inline"="0" alwaysinline nounwind readonly willreturn "gc-leaf-function" {
  ; The klass field's byte offset inside the mirror comes from the global
  ; @java_lang_Class.klass_offset; the load is tagged so later passes can
  ; recognise it as a klass-from-java.lang.Class load.
  %field_offset = load i32, ptr @java_lang_Class.klass_offset
  %field_addr = getelementptr inbounds i8, ptr addrspace(1) %java_mirror, i32 %field_offset
  %klass = load atomic ptr addrspace(1), ptr addrspace(1) %field_addr unordered, align 8, !azul.klass_from_java_lang_Class !{}
  ret ptr addrspace(1) %klass
}

define ptr addrspace(1) @azul.get_member_name_vmtarget(ptr addrspace(1) %member_name) 
    nounwind alwaysinline readonly "azul-late-inline"="0" "gc-leaf-function"="true" {
  ; Returns the vmtarget reference reachable from %member_name.  The flag
  ; @ResolvedMethodTableSupported selects where that field is read from:
  ; either one hop away (via the object stored in MemberName's `method`
  ; field) or directly inside %member_name.
  %rmt_supported = load i1, ptr @ResolvedMethodTableSupported
  br i1 %rmt_supported, label %is.rmt.supported, label %not.rmt.supported

is.rmt.supported:
  ; Load the object held in MemberName's `method` field, then form the address
  ; of the vmtarget field inside that object.
  %rmethod_name_offset_in_bytes = load i32, ptr @java_lang_invoke_MemberName.method_offset_in_bytes
  %rmethod_name_addr = getelementptr inbounds i8, ptr addrspace(1) %member_name, i32 %rmethod_name_offset_in_bytes
  %rmethod_name = load ptr addrspace(1), ptr addrspace(1) %rmethod_name_addr

  %vmtarget_in_rmethod_name_offset_in_bytes = load i32, ptr @java_lang_invoke_ResolvedMethodName.vmtarget_offset_in_bytes
  %vmtarget_in_rmethod_name_addr = getelementptr inbounds i8, ptr addrspace(1) %rmethod_name, i32 %vmtarget_in_rmethod_name_offset_in_bytes
  br label %load.vmtarget

not.rmt.supported:
  ; The vmtarget field sits directly inside %member_name.
  %vmtarget_in_member_name_offset_in_bytes = load i32, ptr @java_lang_invoke_MemberName.vmtarget_offset_in_bytes
  %vmtarget_in_member_name_addr = getelementptr inbounds i8, ptr addrspace(1) %member_name, i32 %vmtarget_in_member_name_offset_in_bytes
  br label %load.vmtarget

load.vmtarget:
  ; Both paths only compute an address; the single load happens here.
  %vmtarget_addr = phi ptr addrspace(1) [%vmtarget_in_rmethod_name_addr, %is.rmt.supported], [%vmtarget_in_member_name_addr, %not.rmt.supported]
  %vmtarget = load ptr addrspace(1), ptr addrspace(1) %vmtarget_addr
  ret ptr addrspace(1) %vmtarget
}

define ptr addrspace(1) @azul.resolve_method_handle(ptr addrspace(1) %method_handle)
    nounwind noinline readonly "azul-late-inline"="2" "gc-leaf-function"="true" {
  ; Step 1: method_handle.form  -> the LambdaForm object.
  %form_field_offset = load i32, ptr @java_lang_invoke_MethodHandle.form_offset_in_bytes
  %form_field_addr = getelementptr inbounds i8, ptr addrspace(1) %method_handle, i32 %form_field_offset
  %form = load ptr addrspace(1), ptr addrspace(1) %form_field_addr

  ; Step 2: form.vmentry -> the MemberName object.
  %vmentry_field_offset = load i32, ptr @java_lang_invoke_LambdaForm.vmentry_offset_in_bytes
  %vmentry_field_addr = getelementptr inbounds i8, ptr addrspace(1) %form, i32 %vmentry_field_offset
  %vmentry = load ptr addrspace(1), ptr addrspace(1) %vmentry_field_addr

  ; Step 3: extract the vmtarget from that MemberName.
  %target = call ptr addrspace(1) @azul.get_member_name_vmtarget(ptr addrspace(1) %vmentry)
  ret ptr addrspace(1) %target
}

declare zing_partial_subtype_check i32 @"StubRoutines::partial_subtype_check()"(ptr addrspace(1),
    ptr addrspace(1)) nounwind readonly argmemonly willreturn "gc-leaf-function" "vmstate-idempotent"="true"

;; Check if %sub_kid is a subtype of %super_kid (i.e. sub_kid is equal
;; to super_kid or is a proper subtype of super_kid).  This is a slightly
;; modified copy of gen_full_subtype_check from stubGenerator_x86_64.cpp,
;; transcribed to LLVM IR.  The only major change is the addition of the
;; equality fastpath.
define i1 @azul.is_subtype_of(i32 %super_kid, i32 %sub_kid) 
    nounwind noinline readonly "azul-late-inline"="1" willreturn "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ;; Layout constants are read from globals before the fast-path test.
  %super_check_offset_offset = load i32, ptr @klassOopDesc.super_check_offset_offset_bytes
  %secondary_super_kid_cache_offset = load i32, ptr @klassOopDesc.secondary_super_kid_cache_offset_bytes

  ;; Equality fast path: identical klass-ids need no klass loads at all.
  %is_same = icmp eq i32 %super_kid, %sub_kid
  br i1 %is_same, label %ret_true, label %subtype_check

subtype_check:
  %super_klass = call ptr addrspace(1) @azul.load_klass(i32 %super_kid)
  %sub_klass = call ptr addrspace(1) @azul.load_klass(i32 %sub_kid)

  ;; The super klass stores the byte offset at which a sub klass would record
  ;; it; read that offset, then read the kid found at that offset in %sub_klass.
  %super_check_offset_addr = getelementptr inbounds i8, ptr addrspace(1) %super_klass, i32 %super_check_offset_offset
  %super_check_offset = load atomic i32, ptr addrspace(1) %super_check_offset_addr unordered, align 4, !tbaa !23

  %sub_cached_superkid_addr = getelementptr inbounds i8, ptr addrspace(1) %sub_klass, i32 %super_check_offset
  %sub_cached_superkid = load atomic i32, ptr addrspace(1) %sub_cached_superkid_addr unordered, align 4, !tbaa !22
  %is_subtype = icmp eq i32 %sub_cached_superkid, %super_kid
  br i1 %is_subtype, label %ret_true, label %continue_check

continue_check:
  ;; A miss is only inconclusive when the slot we probed was the secondary
  ;; super cache; for any other offset the answer is a definite "no".
  %has_secondary_supers = icmp eq i32 %super_check_offset, %secondary_super_kid_cache_offset
  br i1 %has_secondary_supers, label %check_secondary_supers, label %ret_false

check_secondary_supers:
  ;; Fall back to the out-of-line stub for the secondary supers search.
  %full_check_result = call zing_partial_subtype_check i32 @"StubRoutines::partial_subtype_check()"(
              ptr addrspace(1) %super_klass, ptr addrspace(1) %sub_klass)
  ;; Note: the partial subtype check routine returns zero on match
  %result = icmp eq i32 %full_check_result, 0
  ret i1 %result

ret_true:
  ret i1 true

ret_false:
  ret i1 false
}

declare zing_stub_default void @"StubRoutines::dolphin_install_exception_exit_adapter()"(i64) 
    willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind

;; We don't need to make the anchor stores (as for other functions in DolphinRuntime) since this is
;; a leaf function.
declare i64 @"DolphinRuntime::get_frame_size"(ptr) willreturn "gc-leaf-function" nounwind

;; %parentAddr is a PC from the CodeBlob executing in the current physical frame.  This can be used
;; to locate the CodeBlob itself, and to fetch meta-information about the codeblob.
define void @azul.unwind_prepare(ptr %parentAddr)
    nounwind noinline "azul-late-inline"="4" willreturn "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ; Look up the frame size for the code containing %parentAddr and pass it on
  ; to the stub that installs the exception exit adapter.
  %frame_bytes = call i64 @"DolphinRuntime::get_frame_size"(ptr %parentAddr)
  call zing_stub_default void @"StubRoutines::dolphin_install_exception_exit_adapter()"(i64 %frame_bytes)
  ret void
}

;; ---------------------------------------------------------------------------------------------
;; These are intrinsics starting from here, and they will be later moved to an intrinsic.ll file
;; 

;; === Math intrinsics ===
;; List intrinsics in the same order as they are declared in vmSymbols.hpp
;; -- This makes it easier to spot missing ones.

define zing double @_dsignum(double %a) {
  ; NaN is handed back unchanged.
  %nan = fcmp uno double %a, 0.0
  br i1 %nan, label %passthrough, label %ordered

ordered:
  ; So is a zero (the comparison holds for both +0.0 and -0.0).
  %zero = fcmp ueq double %a, 0.0
  br i1 %zero, label %passthrough, label %nonzero

nonzero:
  ; Everything else collapses to -1.0 or 1.0 depending on its sign.
  %below_zero = fcmp ult double %a, 0.0
  br i1 %below_zero, label %negative, label %positive

negative:
  ret double -1.0

positive:
  ret double 1.0

passthrough:
  ret double %a
}

define zing float @_fsignum(float %a) {
  ; NaN is handed back unchanged.
  %nan = fcmp uno float %a, 0.0
  br i1 %nan, label %passthrough, label %ordered

ordered:
  ; So is a zero (the comparison holds for both +0.0 and -0.0).
  %zero = fcmp ueq float %a, 0.0
  br i1 %zero, label %passthrough, label %nonzero

nonzero:
  ; Everything else collapses to -1.0 or 1.0 depending on its sign.
  %below_zero = fcmp ult float %a, 0.0
  br i1 %below_zero, label %negative, label %positive

negative:
  ret float -1.0

positive:
  ret float 1.0

passthrough:
  ret float %a
}


declare double @llvm.fabs.f64(double %Val)

define zing double @_dabs(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Absolute value via the llvm.fabs.f64 intrinsic.
  %abs = call double @llvm.fabs.f64(double %operand)
  ret double %abs
}

declare float @llvm.fabs.f32(float %Val)

define zing float @_fabs(float %operand)
    nounwind alwaysinline readnone {
entry:
  ; Absolute value via the llvm.fabs.f32 intrinsic.
  %abs = call float @llvm.fabs.f32(float %operand)
  ret float %abs
}

declare double @llvm.sin.f64(double %Val)

define zing double @_dsin(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the llvm.sin.f64 intrinsic.
  %sine = call double @llvm.sin.f64(double %operand)
  ret double %sine
}

declare double @llvm.cos.f64(double %Val)

define zing double @_dcos(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the llvm.cos.f64 intrinsic.
  %cosine = call double @llvm.cos.f64(double %operand)
  ret double %cosine
}

declare double @llvm.sqrt.f64(double %Val)

define zing double @_dsqrt(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the llvm.sqrt.f64 intrinsic.
  %root = call double @llvm.sqrt.f64(double %operand)
  ret double %root
}

define zing double @_strict_dsqrt(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Same lowering as _dsqrt: a plain call to llvm.sqrt.f64.
  %root = call double @llvm.sqrt.f64(double %operand)
  ret double %root
}

; Attributes must match the stub definition in dolphinStubCompiler.cpp
declare double @"StubRoutines::dlog()"(double %Val)
  mustprogress nofree norecurse nosync nounwind readnone speculatable willreturn memory(none) "azul-generatable" "gc-leaf-function"

define zing double @_dlog(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the StubRoutines::dlog() stub.
  %log_value = call double @"StubRoutines::dlog()"(double %operand)
  ret double %log_value
}

; Attributes must match the stub definition in dolphinStubCompiler.cpp
declare double @"StubRoutines::dlog10()"(double %Val)
  mustprogress nofree norecurse nosync nounwind readnone speculatable willreturn memory(none) "azul-generatable" "gc-leaf-function"

define zing double @_dlog10(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the StubRoutines::dlog10() stub.
  %log10_value = call double @"StubRoutines::dlog10()"(double %operand)
  ret double %log10_value
}

; Attributes must match the stub definition in dolphinStubCompiler.cpp
declare double @"StubRoutines::dpow()"(double %x, double %y)
  mustprogress nofree norecurse nosync nounwind readnone speculatable willreturn memory(none) "azul-generatable" "gc-leaf-function" "specializable-function-id"

define zing double @_dpow(double %x, double %y)
  nounwind alwaysinline readnone {
entry:
  ; Forwards both operands, in order, to the StubRoutines::dpow() stub.
  %power = call double @"StubRoutines::dpow()"(double %x, double %y)
  ret double %power
}

; Attributes must match the stub definition in dolphinStubCompiler.cpp
declare double @"StubRoutines::dexp()"(double %Val)
  mustprogress nofree norecurse nosync nounwind readnone speculatable willreturn memory(none) "azul-generatable" "gc-leaf-function"

define zing double @_dexp(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the StubRoutines::dexp() stub.
  %exp_value = call double @"StubRoutines::dexp()"(double %operand)
  ret double %exp_value
}

;; There is no fast Math.tan() on AARCH64 yet, see JDK-8189106.
declare double @"StubRoutines::dtan()"(double %Val)
  mustprogress nofree nosync nounwind readnone speculatable willreturn memory(none) "azul-generatable" "gc-leaf-function"

define zing double @_dtan(double %operand)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the StubRoutines::dtan() stub.
  %tan_value = call double @"StubRoutines::dtan()"(double %operand)
  ret double %tan_value
}

define zing i32 @_min(i32 %operand1, i32 %operand2)
    nounwind alwaysinline readnone {
entry:
  ; Signed minimum: keep %operand1 when it is <= %operand2.
  %first_is_le = icmp sle i32 %operand1, %operand2
  %smaller = select i1 %first_is_le, i32 %operand1, i32 %operand2
  ret i32 %smaller
}

define zing i32 @_max(i32 %operand1, i32 %operand2)
    nounwind alwaysinline readnone {
entry:
  ; Signed maximum: keep %operand1 when it is >= %operand2.
  %first_is_ge = icmp sge i32 %operand1, %operand2
  %larger = select i1 %first_is_ge, i32 %operand1, i32 %operand2
  ret i32 %larger
}

;; Returns the upper 64 bits of the 128-bit product of %a and %b, with both
;; operands interpreted as unsigned 64-bit integers.
define i64 @_unsignedMultiplyHigh(i64 %a, i64 %b) {
  %op1 = zext i64 %a to i128
  %op2 = zext i64 %b to i128
  ;; (2^64 - 1)^2 < 2^128, so the product can never wrap as an unsigned i128
  ;; and `nuw` is sound.  It CAN exceed 2^127 - 1 (e.g. both operands
  ;; 0xFFFFFFFFFFFFFFFF), so `nsw` must not be claimed here: doing so would
  ;; turn the result into poison for large operands.
  %op = mul nuw i128 %op1, %op2
  ;; Logical shift: the high half of an unsigned product.
  %tmp = lshr i128 %op, 64
  %res = trunc i128 %tmp to i64
  ret i64 %res
}

;; Returns the upper 64 bits of the 128-bit product of %a and %b, with both
;; operands interpreted as signed 64-bit integers.
define i64 @_multiplyHigh(i64 %a, i64 %b) {
  %op1 = sext i64 %a to i128
  %op2 = sext i64 %b to i128
  ;; The magnitude of the product is at most 2^126, so it always fits a signed
  ;; i128 and `nsw` is sound.  `nuw` must not be claimed: a sign-extended
  ;; negative operand is >= 2^127 when viewed as unsigned, so the unsigned
  ;; product wraps (e.g. -1 * 2) and the result would become poison.
  %op = mul nsw i128 %op1, %op2
  ;; Arithmetic shift: the high half of a signed product keeps its sign.
  %tmp = ashr i128 %op, 64
  %res = trunc i128 %tmp to i64
  ret i64 %res
}

;; == fp maximum/minimum intrinsic implementation ==
declare double @llvm.maximum.f64(double %a, double %b)
declare double @llvm.minimum.f64(double %a, double %b)
declare float  @llvm.maximum.f32(float %a, float %b)
declare float  @llvm.minimum.f32(float %a, float %b)

define double @_maxD(double %a, double %b) alwaysinline nounwind memory(none) {
  ; Forwards to the llvm.maximum.f64 intrinsic.
  %larger = call double @llvm.maximum.f64(double %a, double %b)
  ret double %larger
}

define float @_maxF(float %a, float %b) alwaysinline nounwind memory(none) {
  ; Forwards to the llvm.maximum.f32 intrinsic.
  %larger = call float @llvm.maximum.f32(float %a, float %b)
  ret float %larger
}

define double @_minD(double %a, double %b) alwaysinline nounwind memory(none) {
  ; Forwards to the llvm.minimum.f64 intrinsic.
  %smaller = call double @llvm.minimum.f64(double %a, double %b)
  ret double %smaller
}

define float @_minF(float %a, float %b) alwaysinline nounwind memory(none) {
  ; Forwards to the llvm.minimum.f32 intrinsic.
  %smaller = call float @llvm.minimum.f32(float %a, float %b)
  ret float %smaller
}

declare double @llvm.floor.f64(double %Val)

define zing double @_floor(double %a)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the llvm.floor.f64 intrinsic.
  %floored = call double @llvm.floor.f64(double %a)
  ret double %floored
}

declare double @llvm.ceil.f64(double %Val)

define zing double @_ceil(double %a)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the llvm.ceil.f64 intrinsic.
  %ceiled = call double @llvm.ceil.f64(double %a)
  ret double %ceiled
}

declare double @llvm.rint.f64(double %Val)

define zing double @_rint(double %a)
    nounwind alwaysinline readnone {
entry:
  ; Forwards to the llvm.rint.f64 intrinsic.
  %rounded = call double @llvm.rint.f64(double %a)
  ret double %rounded
}

declare double @llvm.fma.f64(double %a, double %b, double %c)

define zing double @_fmaD(double %a, double %b, double %c)
    nounwind alwaysinline readnone {
entry:
  ; Forwards all three operands, in order, to the llvm.fma.f64 intrinsic.
  %fused = call double @llvm.fma.f64(double %a, double %b, double %c)
  ret double %fused
}

declare float @llvm.fma.f32(float %a, float %b, float %c)

define zing float @_fmaF(float %a, float %b, float %c)
    nounwind alwaysinline readnone {
entry:
  ; Forwards all three operands, in order, to the llvm.fma.f32 intrinsic.
  %fused = call float @llvm.fma.f32(float %a, float %b, float %c)
  ret float %fused
}

define zing i32 @_divideUnsigned_i(i32 %dividend, i32 %divisor)
    alwaysinline {
; Unsigned 32-bit division with three cases, dispatched on %divisor:
;   == 0            -> deoptimize (no division is performed here)
;   sign bit set    -> branch-free fast path; the quotient can only be 0 or 1
;   otherwise       -> plain udiv
entry:
  %is_zero = icmp eq i32 %divisor, 0
  br i1 %is_zero, label %zero, label %not_zero

not_zero:
  ; A "negative" divisor is an unsigned value >= 2^31.
  %is_neg = icmp slt i32 %divisor, 0
  br i1 %is_neg, label %negative, label %positive

negative:
  ; Fastpath for case 'divisor < 0'. Same implementation as in
  ; j.l.Long::divideUnsigned (or OpenJDK's `udivmodI` masm), which
  ; in turn is taken from Hacker's Delight 2nd ed. sec. 9.3
  ; quot = (dividend & ~(dividend - divisor)) >>> 31
  %sub = sub i32 %dividend, %divisor
  %not = xor i32 %sub, -1
  %tmp = and i32 %dividend, %not
  ; Logical shift leaves just the top bit, i.e. a quotient of 0 or 1.
  %res = lshr i32 %tmp, 31
  ret i32 %res

positive:
  ; For divisor >= 0 use plain `udiv` instruction.
  %div = udiv i32 %dividend, %divisor
  ret i32 %div

zero:
  ; Division by zero is not handled in compiled code: deoptimize with
  ; Reason_unhandled and return whatever the deoptimize call yields.
  %reasonUnhandled = load i32, ptr @DeoptReasons.Reason_unhandled
  %ret = call zing_uncommon_trap i32(...) @llvm.experimental.deoptimize.isI32(i32 %reasonUnhandled)
  "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i32 %ret
}

define zing i32 @_remainderUnsigned_i(i32 %dividend, i32 %divisor)
    alwaysinline {
; Unsigned 32-bit remainder with three cases, dispatched on %divisor:
;   == 0            -> deoptimize (no division is performed here)
;   sign bit set    -> branch-free fast path
;   otherwise       -> plain urem
entry:
  %is_zero = icmp eq i32 %divisor, 0
  br i1 %is_zero, label %zero, label %not_zero

not_zero:
  ; A "negative" divisor is an unsigned value >= 2^31.
  %is_neg = icmp slt i32 %divisor, 0
  br i1 %is_neg, label %negative, label %positive

negative:
  ; Fastpath for case 'divisor < 0'. Same implementation as in
  ; j.l.Long::remainderUnsigned (or OpenJDK's `udivmodI` masm), which
  ; in turn is taken from Hacker's Delight 2nd ed. sec. 9.3
  ; rem = dividend - (((dividend & ~(dividend - divisor)) >> 31) & divisor);
  %sub = sub i32 %dividend, %divisor
  %not = xor i32 %sub, -1
  %tmp = and i32 %dividend, %not
  ; Arithmetic shift smears the top bit into an all-ones / all-zeros mask,
  ; so the divisor is subtracted either once or not at all.
  %quot = ashr i32 %tmp, 31
  %and = and i32 %quot, %divisor
  %res = sub i32 %dividend, %and
  ret i32 %res

positive:
  ; For divisor >= 0 use plain `urem` instruction.
  %rem = urem i32 %dividend, %divisor
  ret i32 %rem

zero:
  ; Division by zero is not handled in compiled code: deoptimize with
  ; Reason_unhandled and return whatever the deoptimize call yields.
  %reasonUnhandled = load i32, ptr @DeoptReasons.Reason_unhandled
  %ret = call zing_uncommon_trap i32(...) @llvm.experimental.deoptimize.isI32(i32 %reasonUnhandled)
  "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i32 %ret
}

;; ==== Other intrinsics ====

define void @_dolphin_uncommon_trap() {
  ; Unconditionally deoptimizes, using the reason code stored in
  ; @DeoptReasons.Reason_unhandled.
  %reasonUnhandled = load i32, ptr @DeoptReasons.Reason_unhandled
  call zing_uncommon_trap void(...) @llvm.experimental.deoptimize.isVoid(i32 %reasonUnhandled) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret void
}

declare void @llvm.prefetch.p0(ptr, i32, i32, i32) nounwind 

define void @azul.prefetch_addr1(ptr addrspace(1) %obj, i64 %offset, i32 %rw)
  noinline nounwind "azul-late-inline"="3" willreturn "gc-leaf-function" argmemonly {

  ; Issues a prefetch for the byte at %obj + %offset.  %rw == 0 requests a
  ; read prefetch; any other value requests a write prefetch.
  ; llvm.prefetch.p0 only takes address-space-0 pointers, hence the cast
  ; (tagged with !verifier_exception).
  %ptr = addrspacecast ptr addrspace(1) %obj to ptr, !verifier_exception !1
  %offptr = getelementptr i8, ptr %ptr, i64 %offset
  %cmp_rw = icmp eq i32 %rw, 0
  br i1 %cmp_rw, label %read, label %write

write:
  ; llvm.prefetch operands: address, rw (1 = write), locality, cache type.
  call void @llvm.prefetch.p0(ptr %offptr, i32 1, i32 1, i32 1)
  ret void

read:
  ; Same locality and cache type as above, with rw = 0 (read).
  call void @llvm.prefetch.p0(ptr %offptr, i32 0, i32 1, i32 1)
  ret void
}

define zing void @_exclusive(ptr addrspace(1) %obj, i64 %offset) 
  alwaysinline {
  ; Write prefetch (rw = 1) of %obj + %offset.
  call void @azul.prefetch_addr1(ptr addrspace(1) %obj, i64 %offset, i32 1)
  ret void
}

define zing void @_shared(ptr addrspace(1) %obj, i64 %offset) 
  alwaysinline {
  ; Read prefetch (rw = 0) of %obj + %offset.
  call void @azul.prefetch_addr1(ptr addrspace(1) %obj, i64 %offset, i32 0)
  ret void
}

declare void @llvm.assume(i1)
define zing void @_assume(i32 %cond.i32) alwaysinline {
  ; Only the lowest bit of %cond.i32 survives the truncation and is what
  ; gets passed to llvm.assume.
  %assumed = trunc i32 %cond.i32 to i1
  call void @llvm.assume(i1 %assumed)
  ret void
}

declare i64 @"os::javaTimeMillis"() nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" willreturn "gc-leaf-function" "vmstate-idempotent"="true"
declare i64 @"os::javaTimeNanos_slow"() nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" willreturn "gc-leaf-function" "vmstate-idempotent"="true"
declare i64 @"os::javaTimeNanos_fast"() nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" willreturn "gc-leaf-function" "vmstate-idempotent"="true"
declare i64 @"JfrTime::counterTime"() nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" willreturn "gc-leaf-function" "vmstate-idempotent"="true"

; Note: We need the alwaysinline for the vmstate-idempotent to be accurate
define zing i64 @_currentTimeMillis() alwaysinline nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" "vmstate-idempotent"="true" {
entry:
  ; Forwards to os::javaTimeMillis.
  %millis = call i64 @"os::javaTimeMillis"()
  ret i64 %millis
}

; Note: We need the alwaysinline for the vmstate-idempotent to be accurate
define zing i64 @_nanoTime() alwaysinline nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" "vmstate-idempotent"="true" {
entry:
  ; @UseFastNanoTime.flag selects which of the two javaTimeNanos entry
  ; points is called.
  %fast_enabled = load i1, ptr @UseFastNanoTime.flag
  br i1 %fast_enabled, label %fast, label %slow

fast:
  %nanos_fast = call i64 @"os::javaTimeNanos_fast"()
  ret i64 %nanos_fast

slow:
  %nanos_slow = call i64 @"os::javaTimeNanos_slow"()
  ret i64 %nanos_slow
}

define zing i64 @_counterTime() alwaysinline nounwind inaccessiblememonly
  "azul-trivial-deadness" "compute-time" "vmstate-idempotent"="true" {
entry:
  ; Forwards to JfrTime::counterTime.
  %ticks = call i64 @"JfrTime::counterTime"()
  ret i64 %ticks
}

declare i32 @"StubRoutines::checkcast_arraycopy()"(
          i64,              ;; index to 1st dest element, including all offsets 
          ptr addrspace(1) readonly, ;; derived ptr to src start, 
          i64,              ;; length
          i64,              ;; dest element klass supercheck offset
          i64,              ;; dest element kid
          ptr addrspace(1)) ;; dest
        nounwind willreturn "gc-leaf-function" argmemonly
declare void @"StubRoutines::object_arraycopy()"(
          ptr addrspace(1), ptr addrspace(1) readonly, i64) 
          nounwind willreturn "gc-leaf-function" argmemonly

declare void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(
        ptr addrspace(1),       ;; destination
        ptr addrspace(1),       ;; source
        i64,                    ;; length of the copy, in bytes
        i32)                    ;; element size
        nounwind argmemonly willreturn "gc-leaf-function"


declare void @llvm.memmove.element.unordered.atomic.p0.p0.i64(
        ptr,       ;; destination
        ptr,       ;; source
        i64,                    ;; length of the copy, in bytes
        i32)                    ;; element size
        nounwind argmemonly willreturn "gc-leaf-function"

declare void @llvm.memmove.element.unordered.atomic.p1.p0.i64(
        ptr addrspace(1),       ;; destination
        ptr,       ;; source
        i64,                    ;; length of the copy, in bytes
        i32)                    ;; element size
        nounwind argmemonly willreturn "gc-leaf-function"
declare void @llvm.memmove.element.unordered.atomic.p0.p1.i64(
        ptr,       ;; destination
        ptr addrspace(1),       ;; source
        i64,                    ;; length of the copy, in bytes
        i32)                    ;; element size
        nounwind argmemonly willreturn "gc-leaf-function"
declare void @llvm.memmove.element.unordered.atomic.p1.p1.i64(
        ptr addrspace(1),       ;; destination
        ptr addrspace(1),       ;; source
        i64,                    ;; length of the copy, in bytes
        i32)                    ;; element size
        nounwind argmemonly willreturn "gc-leaf-function"

declare void @llvm.memset.element.unordered.atomic.p0.i64(
        ptr,       ;; destination
        i8,        ;; value
        i64,                    ;; length to memset, in bytes
        i32)                    ;; element size
        nounwind memory(argmem: write) willreturn "gc-leaf-function"
declare void @llvm.memset.element.unordered.atomic.p1.i64(
        ptr addrspace(1),       ;; destination
        i8,                     ;; value
        i64,                    ;; length to memset, in bytes
        i32)                    ;; element size
        nounwind memory(argmem: write) willreturn "gc-leaf-function"

declare <32 x i8> @llvm.masked.load.v32i8.p1(
        ptr addrspace(1) nocapture,
        i32 immarg,
        <32 x i1>,
        <32 x i8>)
        nocallback nofree nosync nounwind willreturn memory(argmem: read) "gc-leaf-function"

declare void @llvm.masked.store.v32i8.p1(
        <32 x i8>, ptr addrspace(1) nocapture,
        i32 immarg,
        <32 x i1>)
        nocallback nofree nosync nounwind willreturn memory(argmem: write) "gc-leaf-function"

declare void @azul.element_memcpy_ref(ptr addrspace(1) %dst_addr, ptr addrspace(1) %src_addr, i64 %length.bytes) 
    "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind
declare void @azul.element_memmove_ref(ptr addrspace(1) %dst_addr, ptr addrspace(1) %src_addr, i64 %length.bytes) 
    "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind

declare i32 @azul.memcmp(ptr addrspace(1) %lhs, ptr addrspace(1) %rhs, i64 %length.bytes, i64 %element.size)
    nounwind argmemonly willreturn "gc-leaf-function" readonly

declare zing_stub_default void @"StubRoutines::dolphin_arraycopy_slow()"(
        i64 %javaThread, ptr addrspace(1) readonly %src, i64 %src_pos,
        ptr addrspace(1) %dest, i64 %dest_pos, i64 %len)
        "consumes-caller-vmstate" "azul-deopt-on-throw" "azul-allow-gcptrs-in-regs"

!9 = !{ !"branch_weights", i32 1048575, i32 1 }

define zing void @_arraycopy(ptr addrspace(1) readonly %src, i32 %src_pos, 
                             ptr addrspace(1) %dest, i32 %dest_pos, 
                             i32 %length) alwaysinline {
entry:
  ; Intrinsic entry point: forwards every argument unchanged to the
  ; azul.arraycopy abstraction.  The call carries an empty "deopt" operand
  ; bundle.
  call void @azul.arraycopy(ptr addrspace(1) readonly %src, i32 %src_pos,
                            ptr addrspace(1) %dest, i32 %dest_pos, 
                            i32 %length) [ "deopt"() ]
  ret void
}

; Note: This JBA is used both to implement the corresponding arraycopy
; intrinsic and by other intrinsics (such as Object::clone).
define void @azul.arraycopy(ptr addrspace(1) readonly %src, i32 %src_pos,
                           ptr addrspace(1) %dest, i32 %dest_pos,
                           i32 %length)
   "alwaysinline-top-level" "azul-late-inline"="0" "consumes-caller-vmstate" "azul-deopt-on-throw" {
entry:
  %src_not_null = icmp ne ptr addrspace(1) %src, null
  %dest_not_null = icmp ne ptr addrspace(1) %dest, null
  %not_null = and i1 %src_not_null, %dest_not_null
  br i1 %not_null, label %not.null, label %array.copy.noreturn, !prof !9

not.null:
  ;; Are layout helpers equal?
  %src_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %src)
  %dest_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %dest)
  %src_lh = call i32 @azul.load_layout_helper(i32 %src_kid)
  %dest_lh = call i32 @azul.load_layout_helper(i32 %dest_kid)
  %lhs_equal = icmp eq i32 %src_lh, %dest_lh
  br i1 %lhs_equal, label %lhs.equal, label %array.copy.noreturn, !prof !9

lhs.equal:
  ;; Is src (dest) an array?
  %lh_neutral_value = load i32, ptr @Klass.layout_helper_neutral_value
  %is_src_array = icmp slt i32 %src_lh, %lh_neutral_value
  br i1 %is_src_array, label %both.arrays, label %array.copy.noreturn, !prof !9

both.arrays:
  %src_pos_non_negative = icmp sge i32 %src_pos, 0
  %dest_pos_non_negative = icmp sge i32 %dest_pos, 0
  %length_non_negative = icmp sge i32 %length, 0
  %pos_non_negative = and i1 %src_pos_non_negative, %dest_pos_non_negative
  %non_negative = and i1 %length_non_negative, %pos_non_negative
  br i1 %non_negative, label %non.negative, label %array.copy.noreturn, !prof !9

non.negative:
  ;; Is (src_pos + length <= src.length) && (dest_pos + length <= dest.length)?
  
  ;; Both src_pos and length are non-negative, so the result never wraps.
  ;; But it can overflow signed i32, so use unsigned comparison.
  %src_limit = add i32 %src_pos, %length
  %src_length = call i32 @azul.array_length(ptr addrspace(1) %src)
  %src_within_bound = icmp ule i32 %src_limit, %src_length

  %dest_limit = add i32 %dest_pos, %length
  %dest_length = call i32 @azul.array_length(ptr addrspace(1) %dest)
  %dest_within_bound = icmp ule i32 %dest_limit, %dest_length

  %within_bounds = and i1 %src_within_bound, %dest_within_bound
  br i1 %within_bounds, label %within.bounds, label %array.copy.noreturn, !prof !9

within.bounds:
  %log2_element_size = call i32 @azul.layout_helper_log2_element_size(i32 %src_lh)
  %log2_element_size.zext = zext i32 %log2_element_size to i64
  %length.zext = zext i32 %length to i64
  %length.bytes = shl i64 %length.zext, %log2_element_size.zext

  %UseSafepointsInCopy.i1 = load i1, ptr @UseSafepointsInCopy
  %UseSafepointsInCopyInliningIntrinsics.i1 = load i1, ptr @UseSafepointsInCopyInliningIntrinsics
  %need_length_check = and i1 %UseSafepointsInCopy.i1, %UseSafepointsInCopyInliningIntrinsics.i1
  br i1 %need_length_check, label %check.length, label %no.check.length

check.length:
  %MemoryOpsChunkSizeInBytes.i64 = load i64, ptr @MemoryOpsChunkSizeInBytes
  %over_threshold = icmp ugt i64 %length.bytes, %MemoryOpsChunkSizeInBytes.i64
  br label %no.check.length

no.check.length:
  %use_safepoints = phi i1 [%over_threshold, %check.length], [false, %within.bounds]
  %header_size = call i32 @azul.layout_helper_header_size(i32 %src_lh)
  %header_size.zext = zext i32 %header_size to i64
  %src_pos.zext = zext i32 %src_pos to i64
  %dest_pos.zext = zext i32 %dest_pos to i64

  ;; src_ptr = src + header_size + (src_pos << log2_element_size)
  %src_pos_bytes = shl i64 %src_pos.zext, %log2_element_size.zext
  %src_offset = add i64 %src_pos_bytes, %header_size.zext
  %src_ptr = getelementptr inbounds i8, ptr addrspace(1) %src, i64 %src_offset

  ;; dest_ptr = dest + header_size + (dest_pos << log2_element_size)
  %dest_pos_bytes = shl i64 %dest_pos.zext, %log2_element_size.zext
  %dest_offset = add i64 %dest_pos_bytes, %header_size.zext
  %dest_ptr = getelementptr inbounds i8, ptr addrspace(1) %dest, i64 %dest_offset

  ;; Is src (dest) an object array?  Note that we really only need to test 
  ;; the tag bits, but since all object arrays share the same bits for the
  ;; other subfields, we can cheat and just use a 32 bit comparison.  This 
  ;; avoids the need to shift and mask.
  %object_array_lh = load i32, ptr @Klass.object_array_layout_helper
  %src_is_objarray = icmp eq i32 %src_lh, %object_array_lh
  br i1 %src_is_objarray, label %objarray.copy, label %check.overlap

objarray.copy:
  %kids_equal = icmp eq i32 %src_kid, %dest_kid
  br i1 %kids_equal, label %fast.objarray.copy, label %kids.non.equal

kids.non.equal:
  ;; Is src element type a subtype of dest element type?
  %src_ekid_compatible_check = call i32 @azul.objarray_element_kid(ptr addrspace(1) %src)
  %dest_ekid_compatible_check = call i32 @azul.objarray_element_kid(ptr addrspace(1) %dest)
  %is_compatible = call i1 @azul.is_subtype_of(i32 %dest_ekid_compatible_check, i32 %src_ekid_compatible_check)
  br i1 %is_compatible, label %fast.objarray.copy, label %dispatch.copy

fast.objarray.copy:
  br label %check.overlap

check.overlap:
  ;; We now need to check the two arrays for overlap. If they overlap,
  ;; then the copy must be done with memmove, else we can do it with
  ;; memcpy.
  %diff_array = icmp ne ptr addrspace(1) %src, %dest
  br i1 %diff_array, label %dispatch.copy, label %check.offset.overlap, !prof !9

check.offset.overlap:
  %src_is_first = icmp ule i32 %src_pos, %dest_pos
  br i1 %src_is_first, label %src.first, label %dest.first

src.first:
  ; no overlap if: src_pos + length <= dest_pos
  ; Note: We know from above that src_limit & dest_pos are non-negative
  %no_overlap.1 = icmp ule i32 %src_limit, %dest_pos
  br label %dispatch.copy

dest.first:
  ; no overlap if: dest_pos + length <= src_pos
  ; Note: We know from above that dest_limit & src_pos are non-negative
  %no_overlap.2 = icmp ule i32 %dest_limit, %src_pos
  br label %dispatch.copy

dispatch.copy:
  %have_no_overlap = phi i1 [ true, %check.overlap ],
                            [ true, %kids.non.equal ],
                            [ %no_overlap.1, %src.first ],
                            [ %no_overlap.2, %dest.first ]
  %is_checkcast_copy = phi i1 [ false, %check.overlap ],
                              [ true, %kids.non.equal ],  
                              [ false, %src.first ], 
                              [ false, %dest.first ]
  br i1 %src_is_objarray, label %dispatch.objarray.copy, label %dispatch.primitive.copy

dispatch.objarray.copy:
  br i1 %use_safepoints, label %dispatch.fast.safepoint.objarray.copy, label %dispatch.fast.objarray.copy

dispatch.fast.safepoint.objarray.copy:
  br i1 %is_checkcast_copy, label %do.safepoint.checkast.objarray.copy, label %do.safepoint.objarray.copy

dispatch.primitive.copy:
  br i1 %have_no_overlap, label %do.safepoint.primitive.memcpy, label %do.safepoint.primitive.memmove, !prof !9

do.safepoint.primitive.memcpy:
  switch i32 %log2_element_size, label %safepoint.primitive.copy.8
                        [ i32 0, label %safepoint.primitive.copy.1
                          i32 1, label %safepoint.primitive.copy.2
                          i32 2, label %safepoint.primitive.copy.4 ]

safepoint.primitive.copy.1:
  call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i8) align 1 %dest_ptr,
       ptr addrspace(1) elementtype(i8) align 1 %src_ptr,
       i64 %length.bytes,
       i32 1) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

safepoint.primitive.copy.2:

  call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i16) align 2 %dest_ptr,
       ptr addrspace(1) elementtype(i16) align 2 %src_ptr,
       i64 %length.bytes,
       i32 2) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

safepoint.primitive.copy.4:
  call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i32) align 4 %dest_ptr,
       ptr addrspace(1) elementtype(i32) align 4 %src_ptr,
       i64 %length.bytes,
       i32 4) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

safepoint.primitive.copy.8:
  call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i64) align 8 %dest_ptr,
       ptr addrspace(1) elementtype(i64) align 8 %src_ptr,
       i64 %length.bytes,
       i32 8) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

do.safepoint.primitive.memmove:
  switch i32 %log2_element_size, label %safepoint.primitive.move.8
                        [ i32 0, label %safepoint.primitive.move.1
                          i32 1, label %safepoint.primitive.move.2
                          i32 2, label %safepoint.primitive.move.4 ]

safepoint.primitive.move.1:
  call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i8) align 1 %dest_ptr,
       ptr addrspace(1) elementtype(i8) align 1 %src_ptr,
       i64 %length.bytes,
       i32 1) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

safepoint.primitive.move.2:
  call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i16) align 2 %dest_ptr,
       ptr addrspace(1) elementtype(i16) align 2 %src_ptr,
       i64 %length.bytes,
       i32 2) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

safepoint.primitive.move.4:
  call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i32) align 4 %dest_ptr,
       ptr addrspace(1) elementtype(i32) align 4 %src_ptr,
       i64 %length.bytes,
       i32 4) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

safepoint.primitive.move.8:
  call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) elementtype(i64) align 8 %dest_ptr,
       ptr addrspace(1) elementtype(i64) align 8 %src_ptr,
       i64 %length.bytes,
       i32 8) "azul-allow-gcptrs-in-regs" [ "deopt"() ]
  br label %done

do.safepoint.objarray.copy:
  br i1 %have_no_overlap, label %do.safepoint.objarray.memcpy, label %do.safepoint.objarray.memmove, !prof !9

do.safepoint.objarray.memcpy:
  call void @"StubRoutines::object_arraycopy_bytes_disjoint_safepoint()"(
       ptr addrspace(1) %dest,
       i64 %dest_offset,
       ptr addrspace(1) %src,
       i64 %src_offset,
       i64 %length.bytes) [ "deopt"() ]
  br label %done

do.safepoint.objarray.memmove:
  call void @"StubRoutines::object_arraycopy_bytes_safepoint()"(
       ptr addrspace(1) %dest,
       i64 %dest_offset,
       ptr addrspace(1) %src,
       i64 %src_offset,
       i64 %length.bytes) [ "deopt"() ]
  br label %done

do.safepoint.checkast.objarray.copy:
  call void @"StubRoutines::checkcast_arraycopy_safepoint()"(
       ptr addrspace(1) %dest,
       i64 %dest_offset,
       ptr addrspace(1) %src,
       i64 %src_offset,
       i64 %length.bytes) [ "deopt"() ]
  br label %done

dispatch.fast.objarray.copy:
  br i1 %is_checkcast_copy, label %do.fast.checkcast.copy, label %do.fast.objarray.copy

do.fast.objarray.copy:
  br i1 %have_no_overlap, label %do.fast.objarray.memcpy, label %do.fast.objarray.memmove, !prof !9

do.fast.objarray.memcpy:
  call void @azul.element_memcpy_ref(ptr addrspace(1) %dest_ptr, ptr addrspace(1) %src_ptr, i64 %length.bytes)
  br label %done

do.fast.objarray.memmove:
  call void @azul.element_memmove_ref(ptr addrspace(1) %dest_ptr, ptr addrspace(1) %src_ptr, i64 %length.bytes)
  br label %done

do.fast.checkcast.copy:
; NB! In compressed Oops mode, oop size is 4 bytes and not 8. So, we need to
; divide the dest_offset by 8 in regular mode (i.e. shr by 3) and divide by 4
; (i.e. shr by 2) in compressed mode, to get the dest_index.
  %logBytesPerHeapOop = load i64, ptr @LogBytesPerHeapOop
  %dest_index = lshr i64 %dest_offset, %logBytesPerHeapOop

  %dest_ekid = call i32 @azul.objarray_element_kid(ptr addrspace(1) %dest)
  %dest_ekid.zext = zext i32 %dest_ekid to i64
  %dest_element_klass = call ptr addrspace(1) @azul.load_klass(i32 %dest_ekid)
  %super_check_offset_offset = load i32, ptr @klassOopDesc.super_check_offset_offset_bytes
  %super_check_offset_addr = getelementptr inbounds i8, ptr addrspace(1) %dest_element_klass, i32 %super_check_offset_offset
  %super_check_offset = load atomic i32, ptr addrspace(1) %super_check_offset_addr unordered, align 4, !tbaa !23
  %super_check_offset.zext = zext i32 %super_check_offset to i64

  %result = call i32 @"StubRoutines::checkcast_arraycopy()"(
          i64 %dest_index,
          ptr addrspace(1) %src_ptr,
          i64 %length.zext,
          i64 %super_check_offset.zext,
          i64 %dest_ekid.zext,
          ptr addrspace(1) %dest)

  %succeed = icmp eq i32 %result, 0
  br i1 %succeed, label %done, label %checkcast.copy.failed, !prof !9

checkcast.copy.failed:
  ;; n elements were copied, but next element can't be stored in the dest array.
  ;; Call @"StubRoutines::dolphin_arraycopy_slow()" with updated arguments to throw an exception.
  %n = xor i32 -1, %result
  %next_src_pos = add i32 %src_pos, %n
  %next_dest_pos = add i32 %dest_pos, %n
  %next_length = sub i32 %length, %n
  br label %slow.array.copy

slow.array.copy:
  %current_thread = call i64 @azul.get_current_thread()
  %slow_src_pos.zext = zext i32 %next_src_pos to i64
  %slow_dest_pos.zext = zext i32 %next_dest_pos to i64
  %slow_length.zext = zext i32 %next_length to i64
  call zing_stub_default void @"StubRoutines::dolphin_arraycopy_slow()"(i64 %current_thread,
                                    ptr addrspace(1) %src, i64 %slow_src_pos.zext,
                                    ptr addrspace(1) %dest, i64 %slow_dest_pos.zext,
                                    i64 %slow_length.zext) [ "deopt"() ]
  br label %done

array.copy.noreturn:
  %reasonUnhandled = load i32, ptr @DeoptReasons.Reason_unhandled
  call zing_uncommon_trap void(...) @llvm.experimental.deoptimize.isVoid(i32 %reasonUnhandled) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret void

done:  
  ret void
}

;; Object-array copy stub, "disjoint" + "safepoint" flavour.  The arraycopy
;; fast path in this file picks it when %use_safepoints is set and the ranges
;; were found not to overlap.  It is called with byte offsets that already
;; include the array header, and with a length in bytes.
declare void @"StubRoutines::object_arraycopy_bytes_disjoint_safepoint()"(
        ptr addrspace(1) %dst,            ;; array being written
        i64 %dst_pos,                     ;; byte offset inside %dst
        ptr addrspace(1) readonly %src,   ;; array being read
        i64 %src_pos,                     ;; byte offset inside %src
        i64 %length)                      ;; number of bytes to copy
        "azul-allow-gcptrs-in-regs" "azul-deopt-on-throw" "consumes-caller-vmstate"
;; Object-array copy stub, "safepoint" flavour.  The arraycopy fast path in
;; this file picks it when %use_safepoints is set and the ranges may overlap.
;; It is called with byte offsets that already include the array header, and
;; with a length in bytes.
declare void @"StubRoutines::object_arraycopy_bytes_safepoint()"(
        ptr addrspace(1) %dst,            ;; array being written
        i64 %dst_pos,                     ;; byte offset inside %dst
        ptr addrspace(1) readonly %src,   ;; array being read
        i64 %src_pos,                     ;; byte offset inside %src
        i64 %length)                      ;; number of bytes to copy
        "azul-allow-gcptrs-in-regs" "azul-deopt-on-throw" "consumes-caller-vmstate"

;; Checkcast object-array copy stub, "safepoint" flavour.  The arraycopy fast
;; path in this file picks it when both %use_safepoints and %is_checkcast_copy
;; are set.  It is called with byte offsets that already include the array
;; header, and with a length in bytes.
declare void @"StubRoutines::checkcast_arraycopy_safepoint()"(
        ptr addrspace(1) %dst,            ;; array being written
        i64 %dst_pos,                     ;; byte offset inside %dst
        ptr addrspace(1) readonly %src,   ;; array being read
        i64 %src_pos,                     ;; byte offset inside %src
        i64 %length)                      ;; number of bytes to copy
        "azul-allow-gcptrs-in-regs" "azul-deopt-on-throw" "consumes-caller-vmstate"

;; Reads the java mirror reference out of a klass.
;;   %klass  - klass to read from; the mirror field lives at the byte offset
;;             published in @klassOopDesc.java_mirror_offset_in_bytes.
;;   returns - the loaded mirror reference (the load is tagged !nonnull).
define ptr addrspace(1) @azul.get_mirror_from_instance_klass_oop(ptr addrspace(1) %klass)
    alwaysinline nounwind readonly willreturn
    "azul-late-inline"="0" "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %mirror.offset = load i32, ptr @klassOopDesc.java_mirror_offset_in_bytes
  %mirror.slot = getelementptr i8, ptr addrspace(1) %klass, i32 %mirror.offset

  ;; Once a java class is initialized, its mirror must be a non-null reference
  ;; to the associated java.lang.Class.  We can assume that 'this' is
  ;; initialized or we couldn't have made this call: the instance itself could
  ;; only have been created by a `new`, which forces class initialization.
  %java.mirror = load atomic ptr addrspace(1), ptr addrspace(1) %mirror.slot unordered, align 8, !nonnull !{}, !azul.java_lang_Class_from_klass !{}
  ret ptr addrspace(1) %java.mirror
}

;; Returns the java mirror of the klass of %this.
;;
;; Steps: fetch the klass id of %this, index the klass table with it to get
;; the klass, then read the mirror out of that klass.  The klass-table load is
;; emitted twice - with and without !invariant.load - and the copy to use is
;; selected by @FalconUseInvariantKlassLoad.
define ptr addrspace(1) @azul.get_java_lang_class(ptr addrspace(1) %this)
    noinline nounwind readonly willreturn
    "azul-late-inline"="1" "foldable-function" "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %this.kid = call i32 @azul.get_klass_id(ptr addrspace(1) %this)

  ; FIXME: The load that @azul.load_klass returns needs !nonnull metadata once
  ; it has been inlined.  Propagating metadata from a callsite back onto the
  ; load returned by the callee is not supported yet (through Orca), so the
  ; body of that function is expanded by hand here and the metadata is put
  ; directly on the load.
  %use.invariant = load i1, ptr @FalconUseInvariantKlassLoad
  %table.base = load ptr, ptr @klassTable.klassTableBase
  %table.slot = getelementptr inbounds ptr addrspace(1), ptr %table.base, i32 %this.kid
  br i1 %use.invariant, label %load.invariant, label %load.plain

load.invariant:
  %klass.invariant = load atomic ptr addrspace(1), ptr %table.slot unordered, align 8, !tbaa !30, !invariant.load !{}, !azul.get_klass_by_kid !{}, !nonnull !{}
  br label %have.klass

load.plain:
  %klass.plain = load atomic ptr addrspace(1), ptr %table.slot unordered, align 8, !tbaa !30, !azul.get_klass_by_kid !{}, !nonnull !{}
  br label %have.klass

have.klass:
  %this.klass = phi ptr addrspace(1) [ %klass.invariant, %load.invariant ], [ %klass.plain, %load.plain ]
  %java.mirror = call ptr addrspace(1) @azul.get_mirror_from_instance_klass_oop(ptr addrspace(1) %this.klass)
  ret ptr addrspace(1) %java.mirror
}

;; Thin `zing` calling-convention wrapper: forwards %this to
;; @azul.get_java_lang_class and returns its result, declared nonnull.
define zing nonnull ptr addrspace(1) @_getClass(ptr addrspace(1) %this)
    alwaysinline nounwind readonly {
entry:
  %java.mirror = call ptr addrspace(1) @azul.get_java_lang_class(ptr addrspace(1) %this)
  ret ptr addrspace(1) %java.mirror
}

;; Reads the 32-bit klass id stored in a klass.
;;   %klassOop - klass to read from; the id field lives at the byte offset
;;               published in @klassOopDesc.klassId_offset_in_bytes.
;;   returns   - the loaded klass id.
define i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %klassOop)
    alwaysinline nounwind readonly willreturn
    "azul-late-inline"="0" "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %kid.offset = load i32, ptr @klassOopDesc.klassId_offset_in_bytes
  %kid.slot = getelementptr inbounds i8, ptr addrspace(1) %klassOop, i32 %kid.offset
  %kid = load atomic i32, ptr addrspace(1) %kid.slot unordered, align 4, !tbaa !24, !azul.get_kid_from_klass !{}
  ret i32 %kid
}

; Intrinsic for jdk.jfr.internal.JVM::getClassId
;
; Returns the trace id for %lang.class.  Three cases:
;   * %lang.class has a klassOop:
;       OR the "used this epoch" bit into the klass's trace-id word (bit value
;       2 when @JfrTraceIdEpoch.epoch is 1, bit value 1 otherwise), store the
;       word back, and return it shifted right by 16 (TRACE_ID_SHIFT).
;   * no klassOop (primitive type) and no array klassOop either:
;       return the value of @VOID_TYPE_ID.
;   * no klassOop but an array klassOop:
;       return (array klass trace-id word >> 16) + 1, after making sure
;       @JfrTraceIdEpoch.tag_state is non-zero.
;
; Unlike its neighbours, this function is not readonly: it writes the klass
; trace-id word and may write @JfrTraceIdEpoch.tag_state.
define zing i64 @_getClassId(ptr addrspace(1) %lang.class)
    nounwind alwaysinline willreturn "gc-leaf-function" "vmstate-idempotent"="true" {
 entry:
  ; Get KlassOop from lang.class
  %klassOop = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  ; The same field offset is used for the klass and for the array klass below.
  %traceid.offset = load i32, ptr @klassOopDesc.trace_id_offset_in_bytes

  ; If klassOop is null we have a primitive type
  %klassOopIsNull = icmp eq ptr addrspace(1) %klassOop, null
  br i1 %klassOopIsNull, label %primitive, label %not_primitive, !prof !{ !"branch_weights", i32 1, i32 1024 }

 not_primitive:
  %klass_traceid_addr = getelementptr inbounds i8, ptr addrspace(1) %klassOop, i32 %traceid.offset
  %klass_traceid_raw = load atomic i64, ptr addrspace(1) %klass_traceid_addr unordered, align 8

  ; SET_USED_THIS_EPOCH
  %epoch.i8 = load atomic i8, ptr @JfrTraceIdEpoch.epoch unordered, align 1
  ; 0 - first epoch
  ; 1 - second epoch
  %epoch_bit = icmp eq i8 %epoch.i8, 1
  %epoch_mask = select i1 %epoch_bit, i64 2, i64 1
  ; Plain load / or / store: the update of the trace-id word is not an atomic
  ; read-modify-write.
  %klass_traceid.used = or i64 %klass_traceid_raw, %epoch_mask
  store atomic i64 %klass_traceid.used, ptr addrspace(1) %klass_traceid_addr unordered, align 8

  ; 16 - TRACE_ID_SHIFT
  %klass_traceid = lshr i64 %klass_traceid.used, 16
  ret i64 %klass_traceid

 primitive:
  ; Check_array_klass_not_null
  %arrayKlassOop = call ptr addrspace(1) @azul.get_array_klass_oop(ptr addrspace(1) %lang.class)
  %arrayKlassOop.is.null = icmp eq ptr addrspace(1) %arrayKlassOop, null
  br i1 %arrayKlassOop.is.null, label %void_class, label %non_void_class

 void_class:
  %void_trace_id = load i64, ptr @VOID_TYPE_ID
  ret i64 %void_trace_id

 non_void_class:
  ; The id of a primitive type is derived from its array klass: shifted
  ; trace id of the array klass, plus one.
  %prim_traceid_addr = getelementptr inbounds i8, ptr addrspace(1) %arrayKlassOop, i32 %traceid.offset
  %prim_traceid_raw = load atomic i64, ptr addrspace(1) %prim_traceid_addr unordered, align 8
  ; 16 - TRACE_ID_SHIFT
  %prim_traceid.tmp = lshr i64 %prim_traceid_raw, 16
  %prim_traceid = add i64 %prim_traceid.tmp, 1

  ; JfrTraceIdEpoch::set_changed_tag_state()
  ; Only write the flag when it is currently zero.
  %tag_state.i8 = load atomic i8, ptr @JfrTraceIdEpoch.tag_state acquire, align 1
  %tag_state.true = icmp ne i8 %tag_state.i8, 0
  br i1 %tag_state.true, label %signaled, label %not_signaled

 not_signaled:
  store atomic i8 1, ptr @JfrTraceIdEpoch.tag_state release, align 1
  br label %signaled

 signaled:
  ret i64 %prim_traceid
}

;; Returns 1 when %obj is an instance of the class described by %lang.class,
;; 0 otherwise.
;;   %lang.class - class to test against.
;;   %obj        - object to test; a null %obj yields 0.
;; A %lang.class without a klassOop (a primitive type) also yields 0.
define zing i32 @_isInstance(ptr addrspace(1) %lang.class, ptr addrspace(1) %obj)
    alwaysinline nounwind readonly willreturn
    "azul-generatable" "azul-inlining-candidate"
    "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %obj.is.null = icmp eq ptr addrspace(1) %obj, null
  br i1 %obj.is.null, label %exit, label %have.obj

have.obj:
  ; A null klassOop means %lang.class describes a primitive type.
  %target.klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %target.is.prim = icmp eq ptr addrspace(1) %target.klass, null
  br i1 %target.is.prim, label %exit, label %subtype.check

subtype.check:
  ; Argument order of is_subtype_of: candidate supertype first, then subtype.
  %target.kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %target.klass)
  %obj.kid = call i32 @azul.get_klass_id(ptr addrspace(1) %obj)
  %is.sub = call i1 @azul.is_subtype_of(i32 %target.kid, i32 %obj.kid)
  %is.sub.i32 = zext i1 %is.sub to i32
  br label %exit

exit:
  %answer = phi i32 [ 0, %entry ], [ 0, %have.obj ], [ %is.sub.i32, %subtype.check ]
  ret i32 %answer
}

;; Returns 1 when %lang.class has no klassOop (which is how a primitive type
;; is recognised here), 0 otherwise.
define zing i32 @_isPrimitive(ptr addrspace(1) %lang.class)
    alwaysinline nounwind readonly willreturn
    "azul-generatable" "azul-inlining-candidate"
    "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  ; i1 true -> 1, i1 false -> 0
  %answer = zext i1 %no.klass to i32
  ret i32 %answer
}

;; Returns true when the klass behind %lang.class has any bit of
;; @AccessFlags_JVM_ACC_INTERFACE_Mask set in its access flags.  A %lang.class
;; without a klassOop (a primitive type) yields false.
define i1 @azul.is_interface(ptr addrspace(1) %lang.class)
   alwaysinline nounwind readonly willreturn
   "azul-late-inline"="0" "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %interface.mask = load i32, ptr @AccessFlags_JVM_ACC_INTERFACE_Mask
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  br i1 %no.klass, label %exit, label %test.flags

test.flags:
  %flags = call i32 @azul.get_access_flags_klass(ptr addrspace(1) %klass)
  %flag.bits = and i32 %flags, %interface.mask
  %flag.set = icmp ne i32 %flag.bits, 0
  br label %exit

exit:
  %answer = phi i1 [ false, %entry ], [ %flag.set, %test.flags ]
  ret i1 %answer
}

;; java.lang.Class.isInterface()
;; Widens the i1 answer of @azul.is_interface to an i32 (1 or 0).
define zing i32 @_isInterface(ptr addrspace(1) %lang.class)
    alwaysinline nounwind readonly willreturn
    "azul-generatable" "azul-inlining-candidate"
    "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %is.iface = call i1 @azul.is_interface(ptr addrspace(1) %lang.class)
  %answer = zext i1 %is.iface to i32
  ret i32 %answer
}

;; java.lang.Class.isHidden()
;; Returns 1 when the klass behind %lang.class has any bit of
;; @AccessFlags_JVM_ACC_IS_HIDDEN_CLASS_Mask set in its access flags, else 0.
;; A %lang.class without a klassOop (a primitive type) yields 0.
define zing i32 @_isHidden(ptr addrspace(1) %lang.class)
    alwaysinline nounwind readonly willreturn
    "azul-generatable" "azul-inlining-candidate"
    "gc-leaf-function" "vmstate-idempotent"="true"
{
entry:
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  br i1 %no.klass, label %exit, label %test.flags

test.flags:
  %hidden.mask = load i32, ptr @AccessFlags_JVM_ACC_IS_HIDDEN_CLASS_Mask
  %flags = call i32 @azul.get_access_flags_klass(ptr addrspace(1) %klass)
  %flag.bits = and i32 %flags, %hidden.mask
  %flag.set = icmp ne i32 %flag.bits, 0
  %flag.set.i32 = zext i1 %flag.set to i32
  br label %exit

exit:
  %answer = phi i32 [ 0, %entry ], [ %flag.set.i32, %test.flags ]
  ret i32 %answer
}

;; Returns true when %lang.class describes an array class.
;; The test is: layout helper of the klass (looked up by klass id) is
;; signed-less-than @Klass.layout_helper_neutral_value.  A %lang.class without
;; a klassOop (a primitive type) yields false.
define i1 @azul.is_array_helper(ptr addrspace(1) %lang.class)
   alwaysinline nounwind readonly willreturn
   "azul-late-inline"="0" "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  br i1 %no.klass, label %exit, label %test.layout

test.layout:
  %kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %klass)
  %layout = call i32 @azul.load_layout_helper(i32 %kid)
  %neutral = load i32, ptr @Klass.layout_helper_neutral_value
  %below.neutral = icmp slt i32 %layout, %neutral
  br label %exit

exit:
  %answer = phi i1 [ false, %entry ], [ %below.neutral, %test.layout ]
  ret i1 %answer
}

;; Widens the i1 answer of @azul.is_array_helper to an i32 (1 or 0).
define zing i32 @_isArray(ptr addrspace(1) %lang.class)
  alwaysinline nounwind readonly willreturn
  "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %is.arr = call i1 @azul.is_array_helper(ptr addrspace(1) %lang.class)
  %answer = zext i1 %is.arr to i32
  ret i32 %answer
}

;; Returns the mirror of the superclass of %lang.class, or null.
;;   * no klassOop (primitive type)      -> null
;;   * interface                         -> null
;;   * array class                       -> the mirror loaded from the
;;                                          java_lang_object_class global
;;   * anything else                     -> mirror of the klass stored in the
;;                                          super field, or null if that
;;                                          field is null
define zing ptr addrspace(1) @_getSuperclass(ptr addrspace(1) %lang.class)
  alwaysinline nounwind readonly willreturn
  "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ;; VM constant, read up front.
  %super.offset = load i32, ptr @klassOopDesc.super_offset_in_bytes
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  br i1 %no.klass, label %no.superclass, label %test.interface

test.interface:
  %is.iface = call i1 @azul.is_interface(ptr addrspace(1) %lang.class)
  br i1 %is.iface, label %no.superclass, label %test.array

test.array:
  %is.arr = call i1 @azul.is_array_helper(ptr addrspace(1) %lang.class)
  br i1 %is.arr, label %array.super, label %load.super

array.super:
  ;; Two globals hold the mirror; which one to read depends on whether
  ;; compressed oops are in use.
  %coops = load i1, ptr @UseCompressedOops.flag
  %object.mirror.slot = select i1 %coops, ptr @klassTable.java_lang_object_class_coop, ptr @klassTable.java_lang_object_class
  %object.mirror = call ptr addrspace(1) @azul.load_ref_p0(ptr %object.mirror.slot)
  ret ptr addrspace(1) %object.mirror

load.super:
  %super.slot = getelementptr inbounds i8, ptr addrspace(1) %klass, i32 %super.offset
  ;; TODO-PERF: Aliasing facts...
  %super.klass = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %super.slot)
  %no.super = icmp eq ptr addrspace(1) %super.klass, null
  br i1 %no.super, label %no.superclass, label %super.mirror

super.mirror:
  %java.mirror = call ptr addrspace(1) @azul.get_mirror_from_instance_klass_oop(ptr addrspace(1) %super.klass)
  ret ptr addrspace(1) %java.mirror

no.superclass:
  ret ptr addrspace(1) null
}

;; Returns the component mirror of %lang.class when it is an array class
;; (per @azul.is_array_helper), null otherwise.  The mirror is read from the
;; klass at @arrayKlassOopDesc.component_mirror_offset_in_bytes.
define zing ptr addrspace(1) @_getComponentType(ptr addrspace(1) %lang.class)
  alwaysinline nounwind readonly willreturn
  "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ;; VM constant, read up front.
  %component.offset = load i32, ptr @arrayKlassOopDesc.component_mirror_offset_in_bytes
  %is.arr = call i1 @azul.is_array_helper(ptr addrspace(1) %lang.class)
  br i1 %is.arr, label %load.component, label %not.array

load.component:
  ;; is_array_helper already fetched the klass once; fetching it again here
  ;; keeps the code readable and the duplicate should fold away during
  ;; optimization.
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %component.slot = getelementptr inbounds i8, ptr addrspace(1) %klass, i32 %component.offset
  ;; TODO-PERF: Aliasing facts...
  %component = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %component.slot)
  ret ptr addrspace(1) %component

not.array:
  ret ptr addrspace(1) null
}

;; java.lang.Class.getModifiers()
;; Returns the modifier-flags word of the klass behind %lang.class.  When
;; %lang.class has no klassOop (a primitive type) the value of
;; @Primitive_ModifierFlags is returned instead.
define zing i32 @_getModifiers(ptr addrspace(1) %lang.class)
  alwaysinline nounwind readonly willreturn
  "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  br i1 %no.klass, label %from.constant, label %from.klass

from.klass:
  %flags.offset = load i32, ptr @klassOopDesc.modifier_flags_offset_in_bytes
  %flags.slot = getelementptr inbounds i8, ptr addrspace(1) %klass, i32 %flags.offset
  %klass.flags = load atomic i32, ptr addrspace(1) %flags.slot unordered, align 4, !tbaa !19
  ret i32 %klass.flags

from.constant:
  %prim.flags = load i32, ptr @Primitive_ModifierFlags
  ret i32 %prim.flags
}

;; Returns the access flags of the klass behind %lang.class, as reported by
;; @azul.get_access_flags_klass.  When %lang.class has no klassOop (a
;; primitive type) the value of @Primitive_ModifierFlags is returned instead.
define zing i32 @_getClassAccessFlags(ptr addrspace(1) %lang.class)
  alwaysinline nounwind readonly willreturn
  "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  %klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %lang.class)
  %no.klass = icmp eq ptr addrspace(1) %klass, null
  br i1 %no.klass, label %from.constant, label %from.klass

from.klass:
  %klass.flags = call i32 @azul.get_access_flags_klass(ptr addrspace(1) %klass)
  ret i32 %klass.flags

from.constant:
  %prim.flags = load i32, ptr @Primitive_ModifierFlags
  ret i32 %prim.flags
}

;; Intrinsic implementation of Class.isAssignableFrom(Class<?> cls).
;; Specification:
;; https://docs.oracle.com/javase/8/docs/api/java/lang/Class.html#isAssignableFrom-java.lang.Class-
;;
;;   %super_class - the receiver ('this'); declared nonnull, not checked here.
;;   %sub_class   - the argument; null takes the throw_npe_and_deoptimize path.
;;   returns      - 1 or 0, see the table below.
;;
;; With P = "has no klassOop (primitive)" and N = "has a klassOop":
;;   super  sub   result
;;     P     N    0
;;     N     P    0
;;     P     P    1 iff %super_class and %sub_class are the same reference
;;     N     N    result of is_subtype_of(super kid, sub kid)
define zing i32 @_isAssignableFrom(ptr addrspace(1) nonnull %super_class, ptr addrspace(1) %sub_class)
  alwaysinline {
entry:
  ; %super_class is the 'this' of a non-static call, so the caller has already
  ; null-checked it; only %sub_class needs a check.
  %arg.is.null = icmp eq ptr addrspace(1) %sub_class, null
  br i1 %arg.is.null, label %npe, label %classify

classify:
  %super.klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %super_class)
  %sub.klass = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %sub_class)

  ; A null klassOop means the class describes a primitive type.
  %super.prim = icmp eq ptr addrspace(1) %super.klass, null
  %sub.prim = icmp eq ptr addrspace(1) %sub.klass, null
  br i1 %super.prim, label %super.is.prim, label %super.is.ref

super.is.prim:
  br i1 %sub.prim, label %prim.vs.prim, label %mismatch

super.is.ref:
  br i1 %sub.prim, label %mismatch, label %ref.vs.ref

prim.vs.prim:
  %same.class = icmp eq ptr addrspace(1) %super_class, %sub_class
  %same.class.i32 = zext i1 %same.class to i32
  ret i32 %same.class.i32

ref.vs.ref:
  %super.kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %super.klass)
  %sub.kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %sub.klass)
  %is.sub = call i1 @azul.is_subtype_of(i32 %super.kid, i32 %sub.kid)
  %is.sub.i32 = zext i1 %is.sub to i32
  ret i32 %is.sub.i32

mismatch:
  ret i32 0

npe:
  %thread = call i64 @azul.get_current_thread()
  call void @azul.throw_npe_and_deoptimize(i64 %thread) [ "deopt"() ]
  unreachable
}

;; Address-derived part of the object hash code:
;;   addrHash = (intptr_t(oop) >> HashCode::addr_shift) & HashCode::addr_hash_mask
;; computed in 64 bits and then truncated to i32.
define i32 @azul.object_hashcode.addr_hash(ptr addrspace(1) %oop)
       noinline nounwind readnone willreturn
       "azul-late-inline"="4" "gc-leaf-function" {
entry:
  %shift = load i64, ptr @HashCode.addr_shift
  %mask = load i64, ptr @HashCode.addr_hash_mask

  ; Raw address bits of the reference.
  %oop.bits = ptrtoint ptr addrspace(1) %oop to i64, !verifier_exception !1
  %oop.shifted = lshr i64 %oop.bits, %shift
  %hash.wide = and i64 %oop.shifted, %mask
  %hash = trunc i64 %hash.wide to i32
  ret i32 %hash
}

;; Page-derived part of the object hash code:
;;   pageHash = GPGC_PageInfo::_page_info[GPGC_Layout::addr_to_PageNum(oop)]._hash
;;
;;   %oop          - reference whose page is looked up (only its address bits
;;                   are used; the object itself is not read).
;;   %pageInfoBase - base address of the page-info array.
;;   returns       - the i32 found at the id-hash-seed offset of the entry.
;;
;; When @UseTaggedAddressForJavaHeap is set, the page number is first turned
;; into a page-info index by moving its sector bits down next to the naked
;; page number.
define i32 @azul.object_hashcode.page_hash(ptr addrspace(1) readnone %oop, ptr %pageInfoBase)
       argmemonly noinline nounwind readonly willreturn
       "azul-late-inline"="4" "gc-leaf-function" {
entry:
  ;; VM constants.  The first two are log2 values and are used as shift counts.
  %log.page.bytes = load i64, ptr @LogBytesPerGPGCPage
  %log.info.bytes = load i64, ptr @LogBytesPerGPGCPageInfo
  %seed.offset = load i64, ptr @GPGC_PageInfo.id_hash_seed_offset_in_bytes
  %tagged.heap = load i1, ptr @UseTaggedAddressForJavaHeap

  %oop.bits = ptrtoint ptr addrspace(1) %oop to i64, !verifier_exception !1
  %page.num = lshr i64 %oop.bits, %log.page.bytes
  br i1 %tagged.heap, label %tagged.index, label %read.hash

tagged.index:
  %sector.shift = load i64, ptr @GPGC_Layout.page_sector_shift
  %sector.mask = load i64, ptr @GPGC_Layout.SectorMask
  %naked.bits = load i64, ptr @GPGC_Layout.naked_page_bits
  %naked.mask = load i64, ptr @GPGC_Layout.naked_page_mask
  ; index = (((page.num >> sector.shift) & sector.mask) << naked.bits)
  ;         | (page.num & naked.mask)
  %sector.raw = lshr i64 %page.num, %sector.shift
  %sector = and i64 %sector.raw, %sector.mask
  %sector.placed = shl i64 %sector, %naked.bits
  %naked.page = and i64 %page.num, %naked.mask
  %tagged.idx = or i64 %sector.placed, %naked.page
  br label %read.hash

read.hash:
  %info.idx = phi i64 [ %page.num, %entry ], [ %tagged.idx, %tagged.index ]
  %info.offset = shl i64 %info.idx, %log.info.bytes
  %info.addr = getelementptr inbounds i8, ptr %pageInfoBase, i64 %info.offset
  %seed.addr = getelementptr inbounds i8, ptr %info.addr, i64 %seed.offset
  %page.hash = load atomic i32, ptr %seed.addr unordered, align 4
  ret i32 %page.hash
}

;; Technically object_hashcode doesn't need the second argument: it is always
;; the constant pointer from the @GPGC_PageInfo.page_info_base global.  We pass
;; it as an argument so that object_hashcode can be marked argmemonly.
define i32 @azul.object_hashcode(ptr addrspace(1) %oop, ptr %pageInfoBase)
       argmemonly noinline nounwind willreturn "gc-leaf-function" "vmstate-idempotent"="true"
       "invariant-object-property"="0" "azul-late-inline"="3" "specializable-function-id" "specialize-by-arg"="0" {
;; Common fast path for @_identityHashCode and @_hashCode
;;
;; Returns the identity hash of %oop.  If the mark word has both the preheader
;; and the hashcode bits set, the hash is read from the preheader (the 64-bit
;; slot immediately before the object).  Otherwise it is computed from the
;; object's address and its page hash, stored into the preheader when one
;; exists, and the hashcode bit is set in the mark word.
;; %pageInfoBase is forwarded unchanged to @azul.object_hashcode.page_hash.

;; IMPORTANT: This function needs to be atomic with respect to safepoints (i.e. there can be no
;; safepoints within the function body, before or after inlining) since the has-preheader bit can
;; change at a safepoint.  This is problematic because, for instance, the following can happen:
;;
;;  %hasPreheaderAndHashCode = i1 false // hashcode bit is set, but preheader is not
;;  << safepoint >> // The object now has the preheader bit set, and has been relocated.
;;                  // The correct hashcode is now stored in the preheader, and it is no
;;                  // longer correct to compute the hashcode based on the object's address,
;;                  // making the following branch incorrect.
;;  br i1 %hasPreheaderAndHashCode, label %read_from_preheader, label %slow_path
;;
;; This is why this function is marked "azul-late-inline"="3" (exactly like the monitorenter and the
;; monitorexit JBAs).

create_vm_constants:
  %preheaderMask = load i64, ptr @markWord.preheader_mask_in_place
  %hashcodeMask = load i64, ptr @markWord.hashcode_mask_in_place
  br label %entry

entry:
  ;; Address of the 64-bit slot immediately preceding the object.
  %preheaderAddr = getelementptr i64, ptr addrspace(1) %oop, i32 -1

  ;; We're racing with other mutator threads filling in the preheader and setting the preheader and
  ;; hashcode bits in the mark word.  Mutator threads filling in the hash code in the preheader
  ;; ensure the preheader hash is filled in *before* the hash is "published" using the preheader and
  ;; hashcode bits.  Here we have to do an acquire load to avoid speculating the preheader hash load
  ;; above the `%hasPreheaderAndHashCode` check.
  ;;
  ;; Note: I do not think the `acquire` is needed today in practice, since x86 does not do load-load
  ;; reordering, and LLVM will not hoist the preheader load over control flow.  It exists mostly to
  ;; convey intent.
  ;;
  %markWord = load atomic i64, ptr addrspace(1) %oop acquire, align 8, !tbaa !14

  ;; A bit group counts as "set" only when every bit of its mask is set.
  %hasPreheaderBits = and i64 %markWord, %preheaderMask
  %hasPreheader = icmp eq i64 %hasPreheaderBits, %preheaderMask
  %hasHashCodeBits = and i64 %markWord, %hashcodeMask
  %hasHashCode = icmp eq i64 %hasHashCodeBits, %hashcodeMask
  %hasPreheaderAndHashCode = and i1 %hasPreheader, %hasHashCode
  br i1 %hasPreheaderAndHashCode, label %read_from_preheader, label %compute_hashcode, !prof !{ !"branch_weights", i32 64, i32 1 }

read_from_preheader:
  %preheaderHashCode = load atomic i32, ptr addrspace(1) %preheaderAddr unordered, align 4, !tbaa !15
  ret i32 %preheaderHashCode

compute_hashcode:
  ; Turn the oop into a hashcode:
  ;  Form a hash of the offset in the page
  %addrHash = call i32 @azul.object_hashcode.addr_hash(ptr addrspace(1) %oop)
  ;  Get a PageInfo hash value
  %pageHash = call i32 @azul.object_hashcode.page_hash(ptr addrspace(1) %oop,
                                                       ptr %pageInfoBase)
  ;  And then XOR the two together
  %identityHashRaw = xor i32 %addrHash, %pageHash

  %optimizeIdentityHashForDistribution = load i1, ptr @OptimizeIdentityHashForDistribution
  br i1 %optimizeIdentityHashForDistribution, label %finalize_hashcode, label %computed_hashcode

finalize_hashcode:
  ; Finalize the hash to improve hash quality: three xor-shift rounds with two
  ; multiplies in between, then clear the sign bit so the result is non-negative.
  %firstShift = lshr i32 %identityHashRaw, 15
  %firstXor = xor i32 %identityHashRaw, %firstShift
  %firstMul = mul i32 %firstXor, u0x2c1b3c6d
  %secondShift = lshr i32 %firstMul, 12
  %secondXor = xor i32 %firstMul, %secondShift
  %secondMul = mul i32 %secondXor, u0x297a2d39
  %thirdShift = lshr i32 %secondMul, 15
  %thirdXor = xor i32 %secondMul, %thirdShift
  %identityHashFinalized = and i32 %thirdXor, u0x7fffffff
  br label %computed_hashcode

computed_hashcode:
  %identityHash = phi i32  [ %identityHashRaw, %compute_hashcode ], [ %identityHashFinalized, %finalize_hashcode ]
  br i1 %hasPreheader, label %has_preheader, label %no_preheader

has_preheader:
  ; Store the calculated hash code and then mark that the hash code is present.
  store atomic i32 %identityHash, ptr addrspace(1) %preheaderAddr unordered, align 4, !tbaa !15
  br label %no_hashcode

no_preheader:
  ; Without a preheader there is nothing to store; only set the hashcode bit
  ; if it is not already set.
  br i1 %hasHashCode, label %exit, label %no_hashcode

no_hashcode:
  ; Publish the hashcode bit.  The RMW must carry *release* semantics so the
  ; preheader store in %has_preheader is ordered before the bit becomes visible
  ; to the acquire load of the mark word in %entry on another thread; an
  ; acquire-only RMW does not order the earlier store.  `acq_rel` keeps the
  ; previous acquire behavior as well.
  atomicrmw or ptr addrspace(1) %oop, i64 %hashcodeMask acq_rel
  br label %exit

exit:
  ret i32 %identityHash
}

define zing i32 @_identityHashCode(ptr addrspace(1) %oop) alwaysinline {
entry:
  ; A null reference hashes to 0; every other reference goes through the
  ; shared @azul.object_hashcode path.
  %isNull = icmp eq ptr addrspace(1) %oop, null
  br i1 %isNull, label %is_null, label %not_null

is_null:
  ret i32 0

not_null:
  ; The page-info base is loaded here and passed as an argument to
  ; @azul.object_hashcode.
  %infoBase = load ptr, ptr @GPGC_PageInfo.page_info_base
  %hash = call i32 @azul.object_hashcode(ptr addrspace(1) %oop, ptr %infoBase)
  ret i32 %hash
}

define zing i32 @_hashCode(ptr addrspace(1) %this) alwaysinline {
entry:
  ; No null check is performed on %this; it is handed straight to the shared
  ; @azul.object_hashcode path together with the page-info base.
  %infoBase = load ptr, ptr @GPGC_PageInfo.page_info_base
  %hash = call i32 @azul.object_hashcode(ptr addrspace(1) %this, ptr %infoBase)
  ret i32 %hash
}

define zing i64 @_nativeThreadCurrent()
   nounwind alwaysinline
{
load_vm_constants:
  ; Field offsets supplied by the VM.
  %osthreadOff = load i32, ptr @Thread.osthread_offset_bytes
  %pthreadIdOff = load i32, ptr @Thread.pthread_id_offset_bytes
  br label %entry

entry:
  ; Fetch the OSThread pointer from the thread-local area (address space 256).
  %osthreadSlot = getelementptr inbounds i8, ptr addrspace(256) null, i32 %osthreadOff
  %osthread = load ptr, ptr addrspace(256) %osthreadSlot, !invariant.load !{}

  ; Read the 64-bit '_pthread_id' field at OSThread + pthread_id offset.
  %pthreadIdSlot = getelementptr inbounds i8, ptr %osthread, i32 %pthreadIdOff
  %pthreadId = load atomic i64, ptr %pthreadIdSlot unordered, align 8, !invariant.load !{}
  ret i64 %pthreadId
}

; This will be expanded into a call to DolphinRuntime::thread_is_interrupted
; with appropriate stores to the frame anchor by dolphinAbstractions.cpp.  
declare i32 @azul.thread_is_interrupted.slow(i64 %javaThread,
   ptr addrspace(1) %thrObj, i32 %clear_interrupted) "consumes-caller-vmstate" 

; Fast path to avoid a VM call to a private native non-static
; thrObj.isInterrupted(boolean clear_int):
;   (thrObj == Thread.current() && (!TLS._osthread._interrupted || !clear_int))
;   ? TLS._osthread._interrupted
;   : thrObj.isInterrupted(clear_int)
; We avoid making the call into the VM in the common case when the thrObj is
; currentThread and if the interrupt bit is false.
; Even if the interrupt bit is true but the clear_int argument
; is false, we again avoid the VM call.
; However, if thrObj is NOT the current thread, we must call into the VM because
; some locking must be done around querying the OS-specific thread information.
define zing i32 @_isInterrupted(ptr addrspace(1) %thrObj, i32 %clear_int)
   nounwind alwaysinline 
{
; Returns 0 or 1 on the inline fast paths, or whatever
; @azul.thread_is_interrupted.slow returns on the slow path.
;   %thrObj    - the thread object being queried (may be null)
;   %clear_int - non-zero if the interrupted state should be cleared
entry:
  ; A null %thrObj takes the fast "not interrupted" exit.
  %oop.is.null = icmp eq ptr addrspace(1) %thrObj, null
  br i1 %oop.is.null, label %fast_return_false, label %load_vm_constants

load_vm_constants:
  %osthread.Offset = load i32, ptr @Thread.osthread_offset_bytes
  %interrupted_offset = load i32, ptr @Thread.interrupted_offset_bytes
  br label %continue

continue:
  ; Only the current thread's own flag is read inline; a query about any
  ; other thread goes to the slow path.
  %curr_thr_java_obj = call ptr addrspace(1) @azul.get_current_thread_java_object()
  %is_current = icmp eq ptr addrspace(1) %thrObj, %curr_thr_java_obj
  br i1 %is_current, label %in_current_thread, label %slow_path

in_current_thread:
  ; get the pointer to thread local OSThread instance
  %osthread.Address = getelementptr inbounds i8, ptr addrspace(256) null, i32 %osthread.Offset
  %osthread.base = load ptr, ptr addrspace(256) %osthread.Address

  ; get the '_interrupted' flag located at OSThread[*interrupted_offset]
  %OSThread.interrupted.Address = getelementptr inbounds i8, ptr %osthread.base, i32 %interrupted_offset
  ; Use atomic load to ensure that it's not possible to move the load of
  ; TLS._osthread._interrupted out of the function (see JDK-8003135).
  %OSThread.interrupted = load atomic i32, ptr %OSThread.interrupted.Address seq_cst, align 4

  ; Flag clear: answer "false" without calling into the VM.
  %not_interrupted = icmp eq i32 %OSThread.interrupted, 0
  br i1 %not_interrupted, label %fast_return_false, label %interrupted_bit_is_set

interrupted_bit_is_set:
  ; Flag set: if the caller does not want it cleared we can answer "true"
  ; inline; clearing is left to the slow path.
  %clear = icmp ne i32 %clear_int, 0
  br i1 %clear, label %slow_path, label %fast_return_true 

slow_path:
  ; Hand the query (with the original %clear_int) to the slow-path abstraction.
  %current_thread = call i64 @azul.get_current_thread()
  %res = call i32 @azul.thread_is_interrupted.slow(
      i64 %current_thread, ptr addrspace(1) %thrObj, i32 %clear_int) [ "deopt"() ]
  ret i32 %res

 fast_return_true:
  ret i32 1

 fast_return_false:
  ret i32 0
}

; This will be expanded into a call to DolphinRuntime::notify by dolphinAbstractions.cpp.
declare void @azul.notify(i64 %thread, ptr addrspace(1) %obj) "azul-deopt-on-throw" "consumes-caller-vmstate"

; This will be expanded into a call to DolphinRuntime::notifyAll by dolphinAbstractions.cpp.
declare void @azul.notifyAll(i64 %thread, ptr addrspace(1) %obj) "azul-deopt-on-throw" "consumes-caller-vmstate"

define void @_notify(ptr addrspace(1) %obj) {
entry:
  ; Forward %obj to the @azul.notify abstraction together with the current
  ; thread handle.
  %self = call i64 @azul.get_current_thread()
  call void @azul.notify(i64 %self, ptr addrspace(1) %obj) [ "deopt"() ]
  ret void
}

define void @_notifyAll(ptr addrspace(1) %obj) {
entry:
  ; Forward %obj to the @azul.notifyAll abstraction together with the current
  ; thread handle.
  %self = call i64 @azul.get_current_thread()
  call void @azul.notifyAll(i64 %self, ptr addrspace(1) %obj) [ "deopt"() ]
  ret void
}

; This will be expanded into a call to DolphinRuntime::thread_park
; with appropriate stores to the frame anchor by dolphinAbstractions.cpp.  
declare void @azul.thread_park_slow(i64 %jt, i32 %isAbs, i64 %time)
  "consumes-caller-vmstate" "azul-deopt-on-throw"

; sun.misc.Unsafe.park(boolean isAbsolute, long time)
; NB: the code below is analogous to LibraryCallKit::inline_sun_misc_Unsafe_park()
define zing void @_park(ptr addrspace(1) %this, i32 %isAbs, i64 %time)
  nounwind alwaysinline
{
; Fast path: if the current thread's park permit equals 1, consume it (store 0,
; then a seq_cst fence) and return.  Otherwise call @azul.thread_park_slow with
; %isAbs and %time.  %this is not used by this function.
load_vm_constants:
  %park_permit_offset = load i32, ptr @Thread.park_permit_offset_bytes
  br label %continue

 continue:
  ; get the '_park_permit' flag located at JavaThread[*park_permit_offset]
  %park_permit.Address = getelementptr inbounds i8, ptr addrspace(256) null, i32 %park_permit_offset
  ; Original note: no stronger atomic load is needed here because of the
  ; dependent compare below (the load is only `unordered`).
  %park_permit = load atomic i32, ptr addrspace(256) %park_permit.Address unordered, align 4

  %permit.is.available = icmp eq i32 %park_permit, 1
  br i1 %permit.is.available, label %fast_path, label %slow_call

 fast_path:
  ; erase the permit
  store i32 0, ; NB: it's NOT required to be atomic
        ptr addrspace(256) %park_permit.Address, align 4
  ; However, the store that consumes the permit must not linger in a store
  ; buffer while we go around and test the user's condition - without
  ; an unparking thread realizing that we ate the permit, i.e.
  ;    ParkThread does: store(0->permit), fence,  load(user_condition)
  ;  UnParkThread does: store(user_condition), fence, store(1->permit)
  fence seq_cst
  ret void

 slow_call:
  ; No permit available: defer to the slow-path abstraction.
  %jt = call i64 @azul.get_current_thread()
  call void @azul.thread_park_slow(i64 %jt, i32 %isAbs, i64 %time)  [ "deopt"() ]
  ret void
}

; Maps to JavaThread::unpark_wrapper
declare void @azul.thread_unpark_slow(i64 %jt) nounwind willreturn "gc-leaf-function"

; sun.misc.Unsafe.unpark(Object thread)
; NB: the code below is analogous to LibraryCallKit::inline_sun_misc_Unsafe_unpark()
define zing void @_unpark(ptr addrspace(1) %this, ptr addrspace(1) %thr_oop)
  nounwind alwaysinline
{
; Returns immediately when %thr_oop is null, when its native JavaThread handle
; is 0, or when that thread's park permit already equals 1.  Otherwise calls
; @azul.thread_unpark_slow.  %this is not used by this function.
 entry:
  %oop.is.null = icmp eq ptr addrspace(1) %thr_oop, null
  br i1 %oop.is.null, label %fast_return, label %continue

 continue:
  ; A zero native handle is treated as "thread dead or stopped": nothing to do.
  %jt2unpark = call i64 @azul.get_native_JavaThread(ptr addrspace(1) %thr_oop)
  %jt.dead.or.stopped = icmp eq i64 %jt2unpark, 0
  br i1 %jt.dead.or.stopped, label %fast_return, label %check_park_permit

 check_park_permit:
  %park_permit_offset = load i32, ptr @Thread.park_permit_offset_bytes
  ; get the '_park_permit' flag located at JavaThread[*park_permit_offset]
  %jt2unpark.ptr = inttoptr i64 %jt2unpark to ptr
  %jthread.park_permit.Address = getelementptr inbounds i8, ptr %jt2unpark.ptr, i32 %park_permit_offset
  ; Use atomic load to guarantee a total ordering for the load of JT.park_permit
  %jthread.park_permit = load atomic i32, ptr %jthread.park_permit.Address seq_cst, align 4
  ; If the permit is already set there is nothing to hand over.
  %permit.is.set = icmp eq i32 %jthread.park_permit, 1
  br i1 %permit.is.set, label %fast_return, label %slow_call

 slow_call:
  call void @azul.thread_unpark_slow(i64 %jt2unpark)
  ret void

 fast_return:
  ret void
}

; get the native JavaThread* from the eetop field in java.lang.Thread
define i64 @azul.get_native_JavaThread(ptr addrspace(1) %thr_oop)
  readonly nounwind alwaysinline
  "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  ; Read the 64-bit 'eetop' field of the java.lang.Thread object %thr_oop;
  ; it holds the native JavaThread* as an integer.
  %eetopOff = load i32, ptr @java.lang.Thread.eetop_offset_bytes
  %eetopSlot = getelementptr inbounds i8, ptr addrspace(1) %thr_oop, i32 %eetopOff
  %nativeThread = load atomic i64, ptr addrspace(1) %eetopSlot unordered, align 8
  ret i64 %nativeThread
}

;; static Object[] java.lang.Thread::scopedValueCache()
define zing ptr addrspace(1) @_scopedValueCache()
   nounwind readonly
   alwaysinline {
entry:
  ; Load the scoped-value cache reference from the thread-local area
  ; (address space 256) at the VM-provided offset.
  %cacheOff = load i32, ptr @Thread.scoped_value_cache_offset_bytes
  %cacheSlot = getelementptr i8, ptr addrspace(256) null, i32 %cacheOff
  %cache = load atomic ptr addrspace(1), ptr addrspace(256) %cacheSlot unordered, align 8
  ret ptr addrspace(1) %cache
}

;; static void java.lang.Thread::setScopedValueCache(Object[] cache)
define zing void @_setScopedValueCache(ptr addrspace(1) %cache)
  alwaysinline nounwind willreturn "azul-late-inline"="0" "gc-leaf-function" {
entry:
  ; Store %cache into the scoped-value cache slot of the thread-local area
  ; (address space 256) at the VM-provided offset.
  %cacheOff = load i32, ptr @Thread.scoped_value_cache_offset_bytes
  %cacheSlot = getelementptr i8, ptr addrspace(256) null, i32 %cacheOff
  store atomic ptr addrspace(1) %cache, ptr addrspace(256) %cacheSlot unordered, align 8
  ret void
}

define zing nonnull ptr addrspace(1) @_currentCarrierThread()
   nounwind readonly
   alwaysinline {
 entry:
  ; Thin wrapper: delegate to the carrier-thread accessor abstraction.
  %carrier = call ptr addrspace(1) @azul.get_current_carrier_thread_java_object()
  ret ptr addrspace(1) %carrier
}

define nonnull ptr addrspace(1) @azul.get_current_carrier_thread_java_object()
  readonly nounwind alwaysinline "azul-late-inline"="0" willreturn "gc-leaf-function" {
 entry:
  ; Load the carrier java.lang.Thread reference from the thread-local area
  ; (address space 256) at the VM-provided offset.
  %carrierOff = load i32, ptr @Thread.current_carrier_thread_offset_bytes
  %carrierSlot = getelementptr i8, ptr addrspace(256) null, i32 %carrierOff
  ;; The reference is tagged !nonnull: the current carrier thread is never null.
  %carrier = load atomic ptr addrspace(1), ptr addrspace(256) %carrierSlot unordered, align 8, !nonnull !{}
  ret ptr addrspace(1) %carrier
}

define zing nonnull ptr addrspace(1) @_currentThread()
   nounwind readonly
   alwaysinline {
 entry:
  ; Thin wrapper: delegate to the current-thread accessor abstraction.
  %current = call ptr addrspace(1) @azul.get_current_thread_java_object()
  ret ptr addrspace(1) %current
}

;; void java.lang.Thread::setCurrentThread(Thread thread)
define zing void @_setCurrentThread(ptr addrspace(1) %this, ptr addrspace(1) %theThread)
  alwaysinline nounwind willreturn "azul-late-inline"="0" "gc-leaf-function" {
 entry:
  ; Store %theThread into the current-thread slot of the thread-local area
  ; (address space 256) at the VM-provided offset.  %this is not used.
  ;
  ; (An earlier version also called
  ; @azul.get_current_carrier_thread_java_object() here, but the result was
  ; never used; the callee is readonly/nounwind/willreturn, so the call had no
  ; observable effect and has been dropped.)
  %currentThreadOffsetInBytes = load i32, ptr @Thread.current_thread_offset_bytes
  %currentThreadAddress = getelementptr i8, ptr addrspace(256) null, i32 %currentThreadOffsetInBytes
  store atomic ptr addrspace(1) %theThread, ptr addrspace(256) %currentThreadAddress unordered, align 8

  ret void
}

define nonnull ptr addrspace(1) @azul.get_current_thread_java_object()
  readonly nounwind alwaysinline "azul-late-inline"="0" willreturn "gc-leaf-function" {
 entry:
  ; Load the current java.lang.Thread reference from the thread-local area
  ; (address space 256) at the VM-provided offset.
  %threadOff = load i32, ptr @Thread.current_thread_offset_bytes
  %threadSlot = getelementptr i8, ptr addrspace(256) null, i32 %threadOff
  ;; The reference is tagged !nonnull: the current thread is never null.
  %thread = load atomic ptr addrspace(1), ptr addrspace(256) %threadSlot unordered, align 8, !nonnull !{}
  ret ptr addrspace(1) %thread
}

; It is essential that during Escape-Analysis passes, we ensure that 
; %object escapes through this function. Hence we use inline assembly
; to ensure that the optimizer does not eliminate %object.
define zing void @_ensureMaterializedForStackWalk(ptr addrspace(1) %object)
  alwaysinline nounwind willreturn "azul-late-inline"="0" "gc-leaf-function" {
entry:
  ; Empty inline-asm statement that takes %object as an input operand and
  ; declares a memory clobber.  It emits no instructions; its only purpose is
  ; to be an opaque, side-effecting use of %object.
  call void asm sideeffect "", "imr,~{memory}"(ptr addrspace(1) %object) nounwind willreturn "gc-leaf-function"
  ret void
}

; While the collection state can generally change outside a safepoint, the 
; transitions InitialMarkSafepoint->ConcurrentMarking->WeakRefSafepoint->ConcurrentRefProcessing
; are done inside safepoints.
; caller must be "azul-late-inline"="4" to avoid possible safepoints between
; read and use of the collector state
define i8 @azul.get_collector_state(ptr addrspace(1) %oop)
    "azul-late-inline"="0" alwaysinline nounwind readonly willreturn "gc-leaf-function"  {
entry:
  ; Choose the old- or new-generation collector state location according to
  ; @azul.is_old_gen(%oop), then return the byte currently stored there.
  %in.old.gen = call i1 @azul.is_old_gen(ptr addrspace(1) %oop)
  %state.addr = select i1 %in.old.gen, ptr @GPGC_OldCollector.collection_state_addr,
                                       ptr @GPGC_NewCollector.collection_state_addr
  %state = load atomic i8, ptr %state.addr unordered, align 1
  ret i8 %state
}

; caller must be "azul-late-inline"="4" to avoid possible safepoints between
; read and use of the collector state
define i1 @azul.is_in_concurrent_marking(i8 %collection_state)
    "azul-late-inline"="0" alwaysinline nounwind readonly willreturn "gc-leaf-function"  {
entry:
  ; True iff %collection_state equals the VM's ConcurrentMarking state value.
  %marking.state = load i8, ptr @GPGC_Collector.ConcurrentMarking
  %is.marking = icmp eq i8 %collection_state, %marking.state
  ret i1 %is.marking
}

; caller must be "azul-late-inline"="4" to avoid possible safepoints between
; read and use of the collector state
define i1 @azul.is_in_concurrent_ref_processing(i8 %collection_state)
    "azul-late-inline"="0" alwaysinline nounwind readonly willreturn "gc-leaf-function"  {
entry:
  ; True iff %collection_state equals the VM's ConcurrentRefProcessing state value.
  %refproc.state = load i8, ptr @GPGC_Collector.ConcurrentRefProcessing
  %is.refproc = icmp eq i8 %collection_state, %refproc.state
  ret i1 %is.refproc
}

declare ptr addrspace(1) @"StubRoutines::concurrentGetRef()"(ptr addrspace(1)) 
    willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind

declare i32 @"GPGC_Collector::java_lang_ref_Referent_refersTo"(ptr addrspace(1), ptr addrspace(1), i32) 
    willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind

define zing ptr addrspace(1) @_concurrentGetReferent(ptr addrspace(1) %this)
   alwaysinline {
entry:
  ; Route through the @azul.concurrentGetReferent abstraction so that the
  ; actual lowering is delayed until late inline 4.
  %referent = call ptr addrspace(1) @azul.concurrentGetReferent(ptr addrspace(1) %this)
  ret ptr addrspace(1) %referent
}

; Use this API to load an oop for late-inline-3 or 4 functions
; when lvb are not getting automatically generated
define ptr addrspace(1) @azul.load_with_explicit_lvb(ptr addrspace(1) %addr)
   "azul-late-inline"="4" willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind {
entry:
  ; Step 1: load (and uncompress) the reference stored at %addr.
  %raw = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %addr)
  ; Step 2: apply the load-value barrier explicitly.  The raw value is passed
  ; as-is (the LVB helper accepts a poisoned oop) along with its source address.
  %barriered = call ptr addrspace(1) @jHeapLvb(ptr addrspace(1) %raw, ptr addrspace(1) %addr)
  ret ptr addrspace(1) %barriered
}

; Late inline is used to prevent safepoint from appearing - 
; paths it chooses depend on the collector state, while a safepoint
; would let it change
define ptr addrspace(1) @azul.concurrentGetReferent(ptr addrspace(1) %this)
   "azul-late-inline"="4" willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind
{
; Returns the referent of the Reference object %this.  When the inline fast
; path is enabled and the collector is not in concurrent reference
; processing, the referent field is loaded with an explicit LVB; in every
; other case the StubRoutines::concurrentGetRef() stub is called.
entry:
  ; Note: we can do much better than this -- the call to
  ; StubRoutines::concurrentGetRef() is semantically a load and
  ; can be optimized like a load.  The only gotcha is that the call
  ; (the "load") blows %rdi and %rsi.  We could still model these
  ; loads as loads via address space X and lower them to a call before
  ; codegen.
  ;
  ; Address of the referent field inside %this.
  %referent_offset_in_bytes = load i32, ptr @java_lang_ref_Reference.referent_offset_in_bytes
  %addr = getelementptr i8, ptr addrspace(1) %this, i32 %referent_offset_in_bytes

  ; VM flag gating the inline fast path.
  %enabled = load i1, ptr @FalconInlineConcurrentGetRefFastPath
  br i1 %enabled, label %fast.test, label %slow.path

 fast.test:
  ; The collector state is looked up using the Reference object itself.
  %collection_state                = call i8 @azul.get_collector_state(ptr addrspace(1) %this)
  %is_in_concurrent_ref_processing = call i1 @azul.is_in_concurrent_ref_processing(i8 %collection_state)
  br i1 %is_in_concurrent_ref_processing, label %slow.path, label %lvb.path
  
 lvb.path:
  ; lvb does not get generated for late-inline-4, so we call a load with explicit lvb
  %res = call ptr addrspace(1) @azul.load_with_explicit_lvb(ptr addrspace(1) %addr)
  ret ptr addrspace(1) %res

 slow.path:
  ; The stub receives the address of the referent field, not %this.
  %result = call ptr addrspace(1) @"StubRoutines::concurrentGetRef()"(ptr addrspace(1) %addr)
  ret ptr addrspace(1) %result
}

define zing i32 @azul.Reference_refersTo0_impl(ptr addrspace(1) %this, ptr addrspace(1) %obj, i32 %is_phantom)
   "azul-late-inline"="4" willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind {
; Shared implementation of the refersTo0 intrinsics: returns 1 if the referent
; of the Reference object %this is %obj and 0 otherwise on the inline paths,
; or the result of the runtime helper on the slow path.
;   %this       - the Reference object
;   %obj        - the object to compare the referent against
;   %is_phantom - only forwarded to the runtime helper on the slow path
 entry:
  ; compute referent location
  %referent_offset_in_bytes = load i32, ptr @java_lang_ref_Reference.referent_offset_in_bytes
  %addr = getelementptr i8, ptr addrspace(1) %this, i32 %referent_offset_in_bytes

  ; first check if the referent needs a trap at all
  ; - load & uncompress. LVB is not applied as function is late-inline-4
  ; - unpoison and cast to i64
  ; - do the trap mask check
  %oop.raw            = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %addr)
  %oop.raw.unpoisoned = call ptr addrspace(1) @azul.unpoison.oop(ptr addrspace(1) %oop.raw)
  %oop.raw.unpoisoned.as_int = ptrtoint ptr addrspace(1) %oop.raw.unpoisoned to i64, !verifier_exception !1
  %needs.lvb          = call i1 @LvbTest(i64 %oop.raw.unpoisoned.as_int)
  br i1 %needs.lvb, label %out.of.phase.path, label %in.phase.path

 in.phase.path:
  ; oop is in phase and we can directly compare it with reference value
  %result.in.phase.path.i1  = icmp eq ptr addrspace(1) %oop.raw.unpoisoned, %obj
  %result.in.phase.path.i32 = zext i1 %result.in.phase.path.i1 to i32
  ret i32 %result.in.phase.path.i32

  ; oop is not in phase, we may need special "weak" lvb, depending on the gc state
 out.of.phase.path:
  ; Only when reference and referent are in the same gen, collector has a chance to collect the referent.
  ; Thus we can base generational-depending tests on the reference object itself.
  %collection_state = call i8 @azul.get_collector_state(ptr addrspace(1) %this)

  ; We must not strengthen liveness of the referent during the refersTo test.
  ; However, it only makes a difference when the collector is doing concurrent marking.
  %is_in_concurrent_marking = call i1 @azul.is_in_concurrent_marking(i8 %collection_state)
  br i1 %is_in_concurrent_marking, label %slow.path, label %may.keep.alive.path

 may.keep.alive.path:
  ; When in concurrent ref processing, we cannot use the lvb directly to access
  ; the referent, as it might already be considered for clean up.
  %does_concurrent_ref_processing = call i1 @azul.is_in_concurrent_ref_processing(i8 %collection_state)
  br i1 %does_concurrent_ref_processing, label %slow.path, label %lvb.path

 lvb.path:
  ; Reload the referent with an explicit LVB and compare the result with %obj.
  %res  = call ptr addrspace(1) @azul.load_with_explicit_lvb(ptr addrspace(1) %addr)
  %result.i1  = icmp eq ptr addrspace(1) %res, %obj
  %result.i32 = zext i1 %result.i1 to i32
  ret i32 %result.i32

 slow.path:
  ; Collector is either in ConcurrentMarking or ConcurrentRefProcessing, so access needs special handling
  %result = call i32 @"GPGC_Collector::java_lang_ref_Referent_refersTo"(ptr addrspace(1) %this, ptr addrspace(1) %obj, i32 %is_phantom)
  ret i32 %result
}

define zing i32 @_Reference_refersTo0(ptr addrspace(1) %this, ptr addrspace(1) %obj)
   alwaysinline {
entry:
  ; Non-phantom variant: call the shared implementation with is_phantom = 0.
  %refers = call zing i32 @azul.Reference_refersTo0_impl(ptr addrspace(1) %this, ptr addrspace(1) %obj, i32 0)
  ret i32 %refers
}

define zing i32 @_PhantomReference_refersTo0(ptr addrspace(1) %this, ptr addrspace(1) %obj)
   alwaysinline {
 entry:
  ; Phantom variant: call the shared implementation with is_phantom = 1.
  %refers = call zing i32 @azul.Reference_refersTo0_impl(ptr addrspace(1) %this, ptr addrspace(1) %obj, i32 1)
  ret i32 %refers
}

;; Implementation of the intrinsic for the Character.reverseBytes(char)
;; For more info see:
;; https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#reverseBytes-char-
define zing i32 @_reverseBytes_c(i32 %arg)
    nounwind alwaysinline readnone {
entry:
  ; Swap the two low-order bytes of %arg; the result is zero-extended to i32.
  %low16 = trunc i32 %arg to i16
  %swapped = call i16 @llvm.bswap.i16(i16 %low16)
  %widened = zext i16 %swapped to i32
  ret i32 %widened
}
declare i16 @llvm.bswap.i16(i16 %arg) willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind readnone

;; Implementation of the intrinsic for the Short.reverseBytes(short)
;; For more info see:
;; https://docs.oracle.com/javase/8/docs/api/java/lang/Short.html#reverseBytes-short-
define zing i32 @_reverseBytes_s(i32 %arg)
    nounwind alwaysinline readnone {
entry:
  ; Swap the two low-order bytes of %arg; the result is sign-extended to i32.
  %low16 = trunc i32 %arg to i16
  %swapped = call i16 @llvm.bswap.i16(i16 %low16)
  %widened = sext i16 %swapped to i32
  ret i32 %widened
}

;; Implementation of the intrinsic for the Integer.reverseBytes(int)
;; For more info see:
;; https://docs.oracle.com/javase/8/docs/api/java/lang/Integer.html#reverseBytes-int-
define zing i32 @_reverseBytes_i(i32 %arg)
    nounwind alwaysinline readnone {
entry:
  ; Byte-swap all four bytes of %arg.
  %swapped = call i32 @llvm.bswap.i32(i32 %arg)
  ret i32 %swapped
}
declare i32 @llvm.bswap.i32(i32 %arg) willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind readnone

;; Implementation of the intrinsic for the Long.reverseBytes(long)
;; For more info see:
;; https://docs.oracle.com/javase/8/docs/api/java/lang/Long.html#reverseBytes-long-
define zing i64 @_reverseBytes_l(i64 %arg)
    nounwind alwaysinline readnone {
entry:
  ; Byte-swap all eight bytes of %arg.
  %swapped = call i64 @llvm.bswap.i64(i64 %arg)
  ret i64 %swapped
}
declare i64 @llvm.bswap.i64(i64 %arg) willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind readnone

; Implementation of java.lang.Integer::numberOfLeadingZeros
define zing i32 @_numberOfLeadingZeros_i(i32 %arg)
   nounwind alwaysinline readnone {
entry:
  ;; The `false` flag makes a zero input well-defined: ctlz(0) == 32.
  %lz = call i32 @llvm.ctlz.i32(i32 %arg, i1 false)
  ret i32 %lz
}

; Implementation of java.lang.Long::numberOfLeadingZeros
define zing i32 @_numberOfLeadingZeros_l(i64 %arg)
   nounwind alwaysinline readnone {
entry:
  ;; The `false` flag makes a zero input well-defined: ctlz(0) == 64.
  %lz = call i64 @llvm.ctlz.i64(i64 %arg, i1 false)
  ;; The count is narrowed to the i32 return type.
  %lz.i32 = trunc i64 %lz to i32
  ret i32 %lz.i32
}

; Implementation of java.lang.Integer::numberOfTrailingZeros
define zing i32 @_numberOfTrailingZeros_i(i32 %arg)
   nounwind alwaysinline readnone {
entry:
  ;; The `false` flag makes a zero input well-defined: cttz(0) == 32.
  %tz = call i32 @llvm.cttz.i32(i32 %arg, i1 false)
  ret i32 %tz
}

; Implementation of java.lang.Long::numberOfTrailingZeros
define zing i32 @_numberOfTrailingZeros_l(i64 %arg)
   nounwind alwaysinline readnone {
entry:
  ;; The `false` flag makes a zero input well-defined: cttz(0) == 64.
  %tz = call i64 @llvm.cttz.i64(i64 %arg, i1 false)
  ;; The count is narrowed to the i32 return type.
  %tz.i32 = trunc i64 %tz to i32
  ret i32 %tz.i32
}

; Implementation of java.lang.Integer::bitCount
define zing i32 @_bitCount_i(i32 %arg)
   nounwind alwaysinline readnone {
entry:
  ; Number of one-bits in %arg.
  %ones = call i32 @llvm.ctpop.i32(i32 %arg)
  ret i32 %ones
}

; Implementation of java.lang.Long::bitCount
define zing i32 @_bitCount_l(i64 %arg)
   nounwind alwaysinline readnone {
entry:
  ; Number of one-bits in %arg, narrowed to the i32 return type.
  %ones = call i64 @llvm.ctpop.i64(i64 %arg)
  %ones.i32 = trunc i64 %ones to i32
  ret i32 %ones.i32
}

declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
declare i32 @llvm.cttz.i32(i32, i1)
declare i64 @llvm.cttz.i64(i64, i1)
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64 %arg)

;; Implementation of the intrinsic for the Float.floatToRawIntBits(float)
;; For more info see:
;; https://docs.oracle.com/javase/7/docs/api/java/lang/Float.html#floatToRawIntBits(float)
define zing i32 @_floatToRawIntBits(float %arg)
    nounwind alwaysinline readnone {
entry:
  ; Plain bit reinterpretation; NaN payloads are passed through unchanged.
  %bits = bitcast float %arg to i32
  ret i32 %bits
}

;; Implementation of the intrinsic for the Float.floatToIntBits(float)
;; (Do not confuse with Float.floatToRawIntBits)
;; For more info see:
;; https://docs.oracle.com/javase/7/docs/api/java/lang/Float.html#floatToIntBits(float)
define zing i32 @_floatToIntBits(float %arg)
    nounwind alwaysinline readnone {
entry:
  ; `fcmp uno` against a non-NaN constant is true exactly when %arg is NaN.
  %arg.unordered = fcmp uno float %arg, 0.0
  br i1 %arg.unordered, label %nan, label %not_nan, !prof !8

not_nan:
  ; Every non-NaN value is returned as its raw bit pattern.
  %bits = bitcast float %arg to i32
  ret i32 %bits

nan:
  ; Every NaN collapses to the single pattern 0x7fc00000 (= 2143289344).
  ret i32 2143289344
}

declare i16 @llvm.convert.to.fp16.f32(float)
;; short Float.floatToFloat16(float val)
define zing i32 @_floatToFloat16(float %val)
    nounwind alwaysinline readnone {
entry:
  ; Convert %val to 16-bit half-precision bits, then sign-extend that 16-bit
  ; pattern to the i32 return type.
  %half.bits = call i16 @llvm.convert.to.fp16.f32(float %val)
  %widened = sext i16 %half.bits to i32
  ret i32 %widened
}

declare float @llvm.convert.from.fp16.f32(i16)
;; float Float.float16ToFloat(short val)
define zing float @_float16ToFloat(i32 %val)
    nounwind alwaysinline readnone {
entry:
  ; Only the low 16 bits of %val carry the half-precision pattern.
  %half.bits = trunc i32 %val to i16
  %widened = call float @llvm.convert.from.fp16.f32(i16 %half.bits)
  ret float %widened
}

declare i32 @llvm.bitreverse.i32(i32)
;; int Integer.reverse(int val)
define zing i32 @_reverse_i(i32 %val)
    nounwind alwaysinline readnone {
entry:
  ; Reverse the order of all 32 bits of %val.
  %flipped = call i32 @llvm.bitreverse.i32(i32 %val)
  ret i32 %flipped
}

declare i64 @llvm.bitreverse.i64(i64)
;; long Long.reverse(long val)
define zing i64 @_reverse_l(i64 %val)
    nounwind alwaysinline readnone {
entry:
  ; Reverse the order of all 64 bits of %val.
  %flipped = call i64 @llvm.bitreverse.i64(i64 %val)
  ret i64 %flipped
}

declare i1 @llvm.is.fpclass.f32(float, i32)
declare i1 @llvm.is.fpclass.f64(double, i32)

define zing i32 @_floatIsInfinite(float %arg)
    nounwind alwaysinline readnone {
entry:
  ;; Class mask 516 = 512 + 4 = 0000_0010_0000_0100b, i.e. the
  ;; "positive infinity" and "negative infinity" bits of the mask described at
  ;; https://llvm.org/docs/LangRef.html#llvm-is-fpclass-intrinsic
  %is.inf = call i1 @llvm.is.fpclass.f32(float %arg, i32 516)
  ;; Widen the boolean to 0 / 1.
  %is.inf.i32 = zext i1 %is.inf to i32
  ret i32 %is.inf.i32
}

define zing i32 @_doubleIsInfinite(double %arg)
    nounwind alwaysinline readnone {
entry:
  ;; Class mask 516 = 512 + 4 = 0000_0010_0000_0100b, i.e. the
  ;; "positive infinity" and "negative infinity" bits of the mask described at
  ;; https://llvm.org/docs/LangRef.html#llvm-is-fpclass-intrinsic
  %is.inf = call i1 @llvm.is.fpclass.f64(double %arg, i32 516)
  ;; Widen the boolean to 0 / 1.
  %is.inf.i32 = zext i1 %is.inf to i32
  ret i32 %is.inf.i32
}

;; Implementation of the intrinsic for the Float.intBitsToFloat(int)
;; For more info see:
;; https://docs.oracle.com/javase/7/docs/api/java/lang/Float.html#intBitsToFloat(int)
define zing float @_intBitsToFloat(i32 %arg)
    nounwind alwaysinline readnone {
entry:
  ; Reinterpret the 32-bit pattern as a float; no value conversion happens.
  %as.float = bitcast i32 %arg to float
  ret float %as.float
}

;; Implementation of the intrinsic for the Double.doubleToRawLongBits(double)
;; For more info see:
;; https://docs.oracle.com/javase/7/docs/api/java/lang/Double.html#doubleToRawLongBits(double)
define zing i64 @_doubleToRawLongBits(double %arg)
    nounwind alwaysinline readnone {
entry:
  ; Plain bit reinterpretation; NaN payloads are passed through unchanged.
  %bits = bitcast double %arg to i64
  ret i64 %bits
}

;; Implementation of the intrinsic for the Double.doubleToLongBits(double)
;; (Do not confuse with Double.doubleToRawLongBits)
;; For more info see:
;; https://docs.oracle.com/javase/7/docs/api/java/lang/Double.html#doubleToLongBits(double)
define zing i64 @_doubleToLongBits(double %arg)
    nounwind alwaysinline readnone {
entry:
  ; `fcmp uno` against a non-NaN constant is true exactly when %arg is NaN.
  %arg.unordered = fcmp uno double %arg, 0.0
  br i1 %arg.unordered, label %nan, label %not_nan, !prof !8

not_nan:
  ; Every non-NaN value is returned as its raw bit pattern.
  %bits = bitcast double %arg to i64
  ret i64 %bits

nan:
  ; Every NaN collapses to the single pattern 0x7ff8000000000000L
  ; (= 9221120237041090560).
  ret i64 9221120237041090560
}

;; Implementation of the intrinsic for the Double.longBitsToDouble(long)
;; For more info see:
;; https://docs.oracle.com/javase/7/docs/api/java/lang/Double.html#longBitsToDouble(long)
define zing double @_longBitsToDouble(i64 %arg)
    nounwind alwaysinline readnone {
entry:
  ; Reinterpret the 64-bit pattern as a double; no value conversion happens.
  %as.double = bitcast i64 %arg to double
  ret double %as.double
}

define ptr addrspace(1) @azul.get_array_klass_oop(ptr addrspace(1) %lang.class)
     "azul-late-inline"="0" alwaysinline nounwind readonly willreturn "gc-leaf-function" {
entry:
  ; Load the reference held in the array-klass field of the java.lang.Class
  ; object %lang.class, at the VM-provided offset.
  %field.off = load i32, ptr @java_lang_Class.array_klass_offset
  %field.addr = getelementptr inbounds i8, ptr addrspace(1) %lang.class, i32 %field.off
  %array.klass = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %field.addr)
  ret ptr addrspace(1) %array.klass
}

define i32 @azul.layout_helper_element_type(i32 %lh)
    nounwind alwaysinline readonly "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  ; Extract the element-type field from layout helper %lh:
  ;   (%lh >> element_type_shift) & element_type_mask
  ; using a logical (unsigned) shift.
  %field.shift = load i32, ptr @Klass.layout_helper_element_type_shift
  %field.mask = load i32, ptr @Klass.layout_helper_element_type_mask
  %lh.shifted = lshr i32 %lh, %field.shift
  %elem.type = and i32 %lh.shifted, %field.mask
  ret i32 %elem.type
}

; Slow-path runtime stub for @_newArray.  It is called below with the current
; thread, the mirror and the zero-extended length, and returns the new array.
declare zing_stub_default nonnull noalias ptr addrspace(1)
@"StubRoutines::dolphin_reflect_newarray()"(i64 %javaThread, 
                       ptr addrspace(1), 
                       i64)  "consumes-caller-vmstate" "azul-deopt-on-throw"

; Creates an array of %length elements for the type described by %mirror.
; Paths taken, in the order they are checked:
;   - %mirror is null                 -> azul.throw_npe_and_deoptimize
;   - %length is negative             -> azul.throw_nase_and_deoptimize
;   - the mirror's array klass reference is null
;                                     -> runtime stub (slowpath)
;   - otherwise                       -> inline allocation via azul.new_array
define zing nonnull noalias ptr addrspace(1)
@_newArray(ptr addrspace(1) %mirror,
           i32 %length) "alwaysinline-top-level" "azul-deopt-on-throw" {
entry:
  ; The thread value is needed on every path, including the throwing ones.
  %current.thread = call i64 @azul.get_current_thread()
  br label %check_mirror_not_null

check_mirror_not_null:
  ; Throw NullPointerException in case of null mirror.
  %mirror.not.null = icmp ne ptr addrspace(1) %mirror, null
  br i1 %mirror.not.null, label %check_length_not_negative, label %throw_npe, !prof !9

check_length_not_negative:
  ; Throw NegativeArraySizeException in case of negative length.
  %length.not.negative =  icmp sge i32 %length, 0
  br i1 %length.not.negative, label %check_array_klass_not_null, label %throw_nase, !prof !9

check_array_klass_not_null:
  ; A null array klass reference in the mirror is delegated to the runtime stub.
  %arrayKlassOop = call ptr addrspace(1) @azul.get_array_klass_oop(ptr addrspace(1) %mirror)
  %arrayKlassOop.is.null = icmp eq ptr addrspace(1) %arrayKlassOop, null
  br i1 %arrayKlassOop.is.null, label %slowpath, label %check_primitive, !prof !7

check_primitive:
  %klassOop = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %mirror)
  ; If klassOop is null we have a primitive type like int.class.
  %klassOop.is.null = icmp eq ptr addrspace(1) %klassOop, null
  br i1 %klassOop.is.null, label %fastpath, label %non_primitive

non_primitive:
  ; Only a non-primitive element type has a klass to take the klass id from.
  %elem.kid.nonprim = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %klassOop)
  br label %fastpath

fastpath:
  ; Element klass id: 0 when coming from the primitive case, otherwise the id
  ; computed in %non_primitive.
  %elem.kid = phi i32 [ 0, %check_primitive ], [ %elem.kid.nonprim, %non_primitive ]

  ; Everything else azul.new_array needs is derived from the array klass'
  ; layout helper.
  %array.kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %arrayKlassOop)
  %layout.helper = call i32 @azul.load_layout_helper(i32 %array.kid)
  %element.shift = call i32 @azul.layout_helper_log2_element_size(i32 %layout.helper)
  %header.size = call i32 @azul.layout_helper_header_size(i32 %layout.helper)
  %basictype = call i32 @azul.layout_helper_element_type(i32 %layout.helper)

  %newArray.fast = call ptr addrspace(1) @azul.new_array(i64 %current.thread,
          i32 %array.kid, i32 %elem.kid, i32 %basictype,
          i32 %length, i32 %header.size, i32 %element.shift, i64 0)  [ "deopt"() ]
  ret ptr addrspace(1) %newArray.fast

slowpath:
  ; %length passed the sign check above, so zero extension keeps its value.
  %length.zext = zext i32 %length to i64
  %newArray.slow = call zing_stub_default ptr addrspace(1)
        @"StubRoutines::dolphin_reflect_newarray()"(i64 %current.thread,
                                                    ptr addrspace(1) %mirror,
                                                    i64 %length.zext) [ "deopt"() ]
  ret ptr addrspace(1) %newArray.slow

throw_npe:
  ; The throw helper is followed by 'unreachable': control does not come back.
  call void @azul.throw_npe_and_deoptimize(i64 %current.thread) [ "deopt"() ]
  unreachable

throw_nase:
  ; The throw helper is followed by 'unreachable': control does not come back.
  call void @azul.throw_nase_and_deoptimize(i64 %current.thread) [ "deopt"() ]
  unreachable
}

; Returns the length of %array.  A null reference and an object that is not
; an array are not handled inline: both deoptimize to the state before the
; call so that the interpreter throws the exception.
define zing i32 @_getLength(ptr addrspace(1) %array)
  alwaysinline {
entry:
  %array.present = icmp ne ptr addrspace(1) %array, null
  br i1 %array.present, label %check.kind, label %bail.out, !prof !9

check.kind:
  ; The object is treated as an array when its layout helper is signed-less
  ; than @Klass.layout_helper_neutral_value.
  %neutral = load i32, ptr @Klass.layout_helper_neutral_value
  %klass.id = call i32 @azul.get_klass_id(ptr addrspace(1) %array)
  %layout = call i32 @azul.load_layout_helper(i32 %klass.id)
  %layout.is.array = icmp slt i32 %layout, %neutral
  br i1 %layout.is.array, label %read.length, label %bail.out, !prof !9

read.length:
  %len = call i32 @azul.array_length(ptr addrspace(1) %array)
  ret i32 %len

bail.out:
  ; Exceptional case: deoptimize (Reason_unhandled) to before the call; the
  ; value returned is whatever the deoptimize intrinsic produces.
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %deopt.result = call zing_uncommon_trap i32(...) @llvm.experimental.deoptimize.isI32(i32 %reason) "azul-need-deopt-before-call" [ "deopt"() ]
  ret i32 %deopt.result
}

; Copies %length bytes with an element-wise (1 byte) unordered-atomic memmove.
; Source and destination are classified independently:
;   non-null object -> Java heap (addrspace(1) pointer)
;   null object     -> C heap, the *_pos value is the raw address
; which gives the four memmove variants below.  %this is not used.
define void @copyMemoryHelper(ptr addrspace(1) %this,
                              ptr addrspace(1) readonly %src, i64 %src_pos,
                              ptr addrspace(1) %dest, i64 %dest_pos,
                              i64 %length) alwaysinline {
entry:
  %dest.in.java = icmp ne ptr addrspace(1) %dest, null
  %src.in.java = icmp ne ptr addrspace(1) %src, null
  br i1 %src.in.java, label %java.src, label %native.src

java.src:
  ; The source is resolved through azul.get_base_oop/azul.get_byte_offset.
  ; The destination does not need this: it can only be a primitive array or
  ; null, so it can't be a mirror.
  %from.base = call ptr addrspace(1) @azul.get_base_oop(ptr addrspace(1) %src, i64 %src_pos)
  %from.offset = call i64 @azul.get_byte_offset(ptr addrspace(1) %src, i64 %src_pos)
  %from.java = getelementptr i8, ptr addrspace(1) %from.base, i64 %from.offset
  br i1 %dest.in.java, label %java.to.java, label %java.to.native

native.src:
  %from.native = inttoptr i64 %src_pos to ptr
  br i1 %dest.in.java, label %native.to.java, label %native.to.native

java.to.java:
  %to.java.a = getelementptr i8, ptr addrspace(1) %dest, i64 %dest_pos
  call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) align 1 elementtype(i8) %to.java.a,
       ptr addrspace(1) align 1 elementtype(i8) %from.java,
       i64 %length, i32 1)
  ret void

java.to.native:
  %to.native.a = inttoptr i64 %dest_pos to ptr
  call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(
       ptr align 1 elementtype(i8) %to.native.a,
       ptr addrspace(1) align 1 elementtype(i8) %from.java,
       i64 %length, i32 1)
  ret void

native.to.java:
  %to.java.b = getelementptr i8, ptr addrspace(1) %dest, i64 %dest_pos
  call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(
       ptr addrspace(1) align 1 elementtype(i8) %to.java.b,
       ptr align 1 elementtype(i8) %from.native,
       i64 %length, i32 1)
  ret void

native.to.native:
  %to.native.b = inttoptr i64 %dest_pos to ptr
  call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(
       ptr align 1 elementtype(i8) %to.native.b,
       ptr align 1 elementtype(i8) %from.native,
       i64 %length, i32 1)
  ret void
}

; Unsafe.copyMemory.  prims/unsafe.cpp enforces that the destination is
; either a type array or a C allocation, so this is not re-checked here.
; The source can be any memory.
; The copy itself is done by @copyMemoryHelper, bracketed by setting and
; clearing the thread's '_doing_unsafe_access' flag.
define zing void @_copyMemory(ptr addrspace(1) %this,
                              ptr addrspace(1) readonly %src, i64 %src_pos,
                              ptr addrspace(1) %dest, i64 %dest_pos,
                              i64 %length) "alwaysinline-top-level" {
load_vm_constants:
  %flag.offset = load i32, ptr @Thread.doing_unsafe_access_offset_bytes
  br label %entry

entry:
  ; The flag lives at JavaThread[*doing_unsafe_access_offset], addressed here
  ; as an offset from null in addrspace(256).  Raise it before copying.
  %flag.addr = getelementptr inbounds i8, ptr addrspace(256) null, i32 %flag.offset
  store atomic i8 1, ptr addrspace(256) %flag.addr release, align 1

  call void @copyMemoryHelper(
       ptr addrspace(1) %this,
       ptr addrspace(1) readonly %src, i64 %src_pos,
       ptr addrspace(1) %dest, i64 %dest_pos,
       i64 %length)

  ; Copy done: lower the flag again.
  store atomic i8 0, ptr addrspace(256) %flag.addr release, align 1
  ret void
}

; Stores %value into %length consecutive bytes with an element-wise (1 byte)
; unordered-atomic memset.  A non-null %dest selects the Java-heap variant
; (address resolved through azul.get_base_oop/azul.get_byte_offset); a null
; %dest means %dest_pos is a raw C-heap address.
; The memset here doesn't invoke a safepoint, but neither does
; Unsafe_SetMemory0.
define void @setMemoryHelper(ptr addrspace(1) writeonly %dest, i64 %dest_pos,
                             i64 %length, i8 %value) alwaysinline {
entry:
  %dest.in.java = icmp ne ptr addrspace(1) %dest, null
  br i1 %dest.in.java, label %java.dest, label %native.dest

java.dest:
  %to.base = call ptr addrspace(1) @azul.get_base_oop(ptr addrspace(1) %dest, i64 %dest_pos)
  %to.offset = call i64 @azul.get_byte_offset(ptr addrspace(1) %dest, i64 %dest_pos)
  %to.java = getelementptr i8, ptr addrspace(1) %to.base, i64 %to.offset
  call void @llvm.memset.element.unordered.atomic.p1.i64(
       ptr addrspace(1) align 1 elementtype(i8) %to.java,
       i8 %value, i64 %length, i32 1)
  ret void

native.dest:
  %to.native = inttoptr i64 %dest_pos to ptr
  call void @llvm.memset.element.unordered.atomic.p0.i64(
       ptr align 1 elementtype(i8) %to.native,
       i8 %value, i64 %length, i32 1)
  ret void
}

; Unsafe.setMemory.  The fill is done by @setMemoryHelper with the low 8 bits
; of %value, bracketed by setting and clearing the thread's
; '_doing_unsafe_access' flag.  %this is not used.
define zing void @_setMemory(ptr addrspace(1) %this,
                             ptr addrspace(1) writeonly %dest, i64 %dest_pos,
                             i64 %length, i32 %value) "alwaysinline-top-level" {
load_vm_constants:
  %flag.offset = load i32, ptr @Thread.doing_unsafe_access_offset_bytes
  br label %entry

entry:
  ; Only the low byte of the i32 fill value is used.
  %fill.byte = trunc i32 %value to i8

  ; The flag lives at JavaThread[*doing_unsafe_access_offset], addressed here
  ; as an offset from null in addrspace(256).  Raise it before filling.
  %flag.addr = getelementptr inbounds i8, ptr addrspace(256) null, i32 %flag.offset
  store atomic i8 1, ptr addrspace(256) %flag.addr release, align 1

  call void @setMemoryHelper(
       ptr addrspace(1) %dest, i64 %dest_pos,
       i64 %length, i8 %fill.byte)

  ; Fill done: lower the flag again.
  store atomic i8 0, ptr addrspace(256) %flag.addr release, align 1
  ret void
}

; Reads the 32-bit init state stored in %oop at the byte offset given by
; @klassOopDesc.init_state_offset_in_bytes (unordered atomic load).
define i32 @azul.load_klass_init_state(ptr addrspace(1) %oop)
  alwaysinline nounwind "vmstate-idempotent"="true" "azul-late-inline"="0" willreturn "gc-leaf-function" {
  %field.offset = load i32, ptr @klassOopDesc.init_state_offset_in_bytes
  %field.addr = getelementptr inbounds i8, ptr addrspace(1) %oop, i32 %field.offset
  %current.state = load atomic i32, ptr addrspace(1) %field.addr unordered, align 4, !tbaa !27
  ret i32 %current.state
}

; Writes %state into the 32-bit init state slot of %oop, located at the byte
; offset given by @klassOopDesc.init_state_offset_in_bytes (unordered atomic
; store).
define void @azul.set_klass_init_state(ptr addrspace(1) %oop, i32 %state)
  alwaysinline nounwind "vmstate-idempotent"="true" "azul-late-inline"="0" willreturn "gc-leaf-function" {
  %field.offset = load i32, ptr @klassOopDesc.init_state_offset_in_bytes
  %field.addr = getelementptr inbounds i8, ptr addrspace(1) %oop, i32 %field.offset
  store atomic i32 %state, ptr addrspace(1) %field.addr unordered, align 4, !tbaa !27
  ret void
}

; Runtime stub used on the slow path of @_allocateInstance.  It is called
; there with the current thread and the klass id widened to i64.
declare zing_stub_default noalias nonnull noundef ptr addrspace(1) @"StubRoutines::dolphin_new_instance()"(i64, i64)
    "vmstate-idempotent"="true" nounwind  "consumes-replay-vmstate"  "allocation-site" "azul-allow-gcptrs-in-regs"

; Unsafe.allocateInstance
; Paths taken, in the order they are checked:
;   - %mirror is null                       -> azul.throw_npe_and_deoptimize
;   - mirror has no klass (primitive type)  -> instantiation exception
;   - klass is an interface or abstract     -> instantiation exception
;   - klass is fully initialized            -> azul.new_instance
;   - otherwise                             -> dolphin_new_instance() stub
; %this is not used.
define zing noalias ptr addrspace(1) @_allocateInstance(ptr addrspace(1) %this, ptr addrspace(1) %mirror)
  alwaysinline "vmstate-idempotent"="true" "consumes-replay-vmstate" "azul-deopt-on-throw" {

  %thread = call i64 @azul.get_current_thread()

  ; Throw NPE in case of a null mirror, see ZVM-8223.  (A primitive mirror is
  ; not an NPE: it is rejected in check_klass with an instantiation exception.)
  %mirror_null = icmp eq ptr addrspace(1) %mirror, null
  br i1 %mirror_null, label %throw_npe, label %check_klass

 check_klass:
  %oop = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %mirror)
  
  ; Throw in case of class not suitable for instantiation:
  ; interface, abstract, primitive etc.
  ; If oop is null then we have primitive type.
  %oop_null = icmp eq ptr addrspace(1) %oop, null
  br i1 %oop_null, label %instantiation_error, label %check_flags

 check_flags:
  ; Reject the klass if either the interface or the abstract access flag is set.
  %JVM_ACC_INTERFACE = load i32, ptr @AccessFlags_JVM_ACC_INTERFACE_Mask
  %JVM_ACC_ABSTRACT = load i32, ptr @AccessFlags_JVM_ACC_ABSTRACT_Mask
  %mask = or i32 %JVM_ACC_INTERFACE, %JVM_ACC_ABSTRACT
  %access_flags = call i32 @azul.get_access_flags_klass(ptr addrspace(1) %oop)
  %masked_flags = and i32 %access_flags, %mask
  %cannot_instantiate = icmp ne i32 %masked_flags, 0
  br i1 %cannot_instantiate, label %instantiation_error, label %check_init_state

 check_init_state:
  ; kid is used later for new_instance call
  %kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %oop)

  ; check if class needs initialization
  %fully_initialized = load i32, ptr @instanceKlass.fully_initialized
  %init_state = call i32 @azul.load_klass_init_state(ptr addrspace(1) %oop)
  %is_fully_initialized = icmp eq i32 %init_state, %fully_initialized
  br i1 %is_fully_initialized, label %create_instance, label %create_instance_slow

 create_instance:
  ; Fast path: the class is already initialized, allocate inline.
  %instance = call noalias ptr addrspace(1) @azul.new_instance(i64 %thread, i32 %kid) [ "deopt"() ]
  ret ptr addrspace(1) %instance

 create_instance_slow:
  ; The stub takes the klass id as i64.
  %kid.i64 = zext i32 %kid to i64
  ; StubRoutines::dolphin_new_instance() handles class initialization
  %instance_slow = call zing_stub_default noalias ptr addrspace(1) @"StubRoutines::dolphin_new_instance()"(i64 %thread, i64 %kid.i64) [ "deopt"() ]
  ret ptr addrspace(1) %instance_slow

 instantiation_error:
  ; Reached for primitive, interface and abstract types.
  call void @azul.throw_instantiation_exception_and_deoptimize(i64 %thread) [ "deopt"() ]
  unreachable

 throw_npe:
  ; Reached for a null mirror only.
  call void @azul.throw_npe_and_deoptimize(i64 %thread) [ "deopt"() ]
  unreachable
}

; External stub; it receives the three raw data addresses computed by the
; wrapper below (source bytes, destination bytes, key words).
declare void @"StubRoutines::aescrypt_encryptBlock()"(
    ptr addrspace(1) %src, ptr addrspace(1) %dest, ptr addrspace(1) %subK)
  argmemonly willreturn "gc-leaf-function" nounwind

; private void com.sun.crypto.provider.AESCrypt.implEncryptBlock(
;     byte[] in, int inOffset, byte[] out, int outOffset, int[] subK)
; Converts the array references into data addresses (array header skipped,
; byte offsets applied to in/out) and hands them to the stub.
define zing void @_aescrypt_encryptBlock(
    ptr addrspace(1) %this,
    ptr addrspace(1) %in, i32 %inOffset,
    ptr addrspace(1) %out, i32 %outOffset,
    ptr addrspace(1) %subK) alwaysinline argmemonly {
 entry:
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  ; No range checks here: in/out were already checked in
  ; com.sun.crypto.provider.AESCrypt.cryptBlockCheck.
  %src.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %src.ptr = getelementptr i8, ptr addrspace(1) %src.data, i32 %inOffset
  %dst.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %dst.ptr = getelementptr i8, ptr addrspace(1) %dst.data, i32 %outOffset
  %key.ptr = getelementptr i8, ptr addrspace(1) %subK, i32 %int.hdr

  call void @"StubRoutines::aescrypt_encryptBlock()"(
      ptr addrspace(1) %src.ptr, ptr addrspace(1) %dst.ptr, ptr addrspace(1) %key.ptr)
  ret void
}

; External stub; it receives the three raw data addresses computed by the
; wrapper below (source bytes, destination bytes, key words).
declare void @"StubRoutines::aescrypt_decryptBlock()"(
    ptr addrspace(1) %src, ptr addrspace(1) %dest, ptr addrspace(1) %subK)
  argmemonly willreturn "gc-leaf-function" nounwind

; private void com.sun.crypto.provider.AESCrypt.implDecryptBlock(
;     byte[] in, int inOffset, byte[] out, int outOffset, int[] subK)
; Converts the array references into data addresses (array header skipped,
; byte offsets applied to in/out) and hands them to the stub.
define zing void @_aescrypt_decryptBlock(
    ptr addrspace(1) %this,
    ptr addrspace(1) %in, i32 %inOffset,
    ptr addrspace(1) %out, i32 %outOffset,
    ptr addrspace(1) %subK) alwaysinline argmemonly {
 entry:
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  ; No range checks here: in/out were already checked in
  ; com.sun.crypto.provider.AESCrypt.cryptBlockCheck.
  %src.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %src.ptr = getelementptr i8, ptr addrspace(1) %src.data, i32 %inOffset
  %dst.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %dst.ptr = getelementptr i8, ptr addrspace(1) %dst.data, i32 %outOffset
  %key.ptr = getelementptr i8, ptr addrspace(1) %subK, i32 %int.hdr

  call void @"StubRoutines::aescrypt_decryptBlock()"(
      ptr addrspace(1) %src.ptr, ptr addrspace(1) %dst.ptr, ptr addrspace(1) %key.ptr)
  ret void
}

; External stub; takes source and destination data addresses, the key data
; address and a 64-bit length, and returns an i32.
declare i32 @"StubRoutines::electronicCodeBook_encryptAESCrypt()"(
    ptr addrspace(1) nonnull %src,
    ptr addrspace(1) nonnull %dest,
    ptr addrspace(1) %k,
    i64 %len
) argmemonly willreturn "gc-leaf-function" nounwind

; Wrapper around the ECB encrypt stub.  %in/%out are addressed as byte arrays
; (byte-array header skipped, then %inOff/%outOff applied), %k as an int array
; (int-array header skipped).  %len is zero-extended to i64.  The stub's i32
; result is returned unchanged.  %this is not used.
define zing i32 @_electronicCodeBook_encryptAESCrypt(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %in,
    i32 %inOff,
    i32 %len,
    ptr addrspace(1) nonnull %out,
    i32 %outOff,
    ptr addrspace(1) %k
) alwaysinline argmemonly willreturn "gc-leaf-function" nounwind "azul-late-inline"="0" {
 entry:
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %len.i64 = zext i32 %len to i64

  %src.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %src.ptr = getelementptr i8, ptr addrspace(1) %src.data, i32 %inOff
  %dst.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %dst.ptr = getelementptr i8, ptr addrspace(1) %dst.data, i32 %outOff
  %key.ptr = getelementptr i8, ptr addrspace(1) %k, i32 %int.hdr

  %processed = call i32 @"StubRoutines::electronicCodeBook_encryptAESCrypt()"(
      ptr addrspace(1) %src.ptr, ptr addrspace(1) %dst.ptr,
      ptr addrspace(1) %key.ptr, i64 %len.i64)
  ret i32 %processed
}

; External stub; takes source and destination data addresses, the key data
; address and a 64-bit length, and returns an i32.
declare i32 @"StubRoutines::electronicCodeBook_decryptAESCrypt()"(
    ptr addrspace(1) nonnull %src,
    ptr addrspace(1) nonnull %dest,
    ptr addrspace(1) %k,
    i64 %len
) argmemonly willreturn "gc-leaf-function" nounwind

; Wrapper around the ECB decrypt stub.  %in/%out are addressed as byte arrays
; (byte-array header skipped, then %inOff/%outOff applied), %k as an int array
; (int-array header skipped).  %len is zero-extended to i64.  The stub's i32
; result is returned unchanged.  %this is not used.
define zing i32 @_electronicCodeBook_decryptAESCrypt(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %in,
    i32 %inOff,
    i32 %len,
    ptr addrspace(1) nonnull %out,
    i32 %outOff,
    ptr addrspace(1) %k
) alwaysinline argmemonly willreturn "gc-leaf-function" nounwind "azul-late-inline"="0" {
 entry:
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %len.i64 = zext i32 %len to i64

  %src.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %src.ptr = getelementptr i8, ptr addrspace(1) %src.data, i32 %inOff
  %dst.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %dst.ptr = getelementptr i8, ptr addrspace(1) %dst.data, i32 %outOff
  %key.ptr = getelementptr i8, ptr addrspace(1) %k, i32 %int.hdr

  %processed = call i32 @"StubRoutines::electronicCodeBook_decryptAESCrypt()"(
      ptr addrspace(1) %src.ptr, ptr addrspace(1) %dst.ptr,
      ptr addrspace(1) %key.ptr, i64 %len.i64)
  ret i32 %processed
}

; External stub.  Note its argument order: in, len, ct, out, k, state,
; subkeyHtbl, counter.  This differs from the Java parameter order of the
; wrapper below.
declare i32 @"StubRoutines::galoisCounterMode_AESCrypt()"(
    ptr addrspace(1) nonnull %in_start,
    i64 %len,
    ptr addrspace(1) nonnull %ct_start,
    ptr addrspace(1) nonnull %out_start,
    ptr addrspace(1) nonnull %k_start,
    ptr addrspace(1) nonnull %state_start,
    ptr addrspace(1) nonnull %subkeyHtbl_start,
    ptr addrspace(1) nonnull %cnt_start
) argmemonly willreturn "gc-leaf-function" nounwind

; private static int implGCMCryptAzul0(byte[] in, int inOfs, int inLen,
;        byte[] ct, int ctOfs, byte[] out, int outOfs, GCTR gctr, GHASH ghash,
;        int[] k, byte[] counter, long[] subkeyHtbl, long[] state)
;
; Computes the data address of every array (header size chosen by element
; type, byte offsets applied to in/ct/out), zero-extends %inLen and calls the
; stub.  The gctr and ghash references are not used.
define zing i32 @_galoisCounterMode_AESCrypt(
    ptr addrspace(1) nonnull %in, i32 %inOff, i32 %inLen,
    ptr addrspace(1) nonnull %ct, i32 %ctOff,
    ptr addrspace(1) nonnull %out, i32 %outOff,
    ptr addrspace(1) %gctr.unused, ptr addrspace(1) %ghash.unused,
    ptr addrspace(1) nonnull %k,
    ptr addrspace(1) nonnull %counter,
    ptr addrspace(1) nonnull %subkeyHtbl,
    ptr addrspace(1) nonnull %state)
  alwaysinline argmemonly willreturn "gc-leaf-function" nounwind "azul-late-inline"="0"
{
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %long.hdr = load i32, ptr @arrayOopDesc.longArrayHeaderSize

  %len.i64 = zext i32 %inLen to i64

  ; byte[] operands that carry an explicit offset
  %in.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %in.ptr = getelementptr i8, ptr addrspace(1) %in.data, i32 %inOff
  %ct.data = getelementptr i8, ptr addrspace(1) %ct, i32 %byte.hdr
  %ct.ptr = getelementptr i8, ptr addrspace(1) %ct.data, i32 %ctOff
  %out.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %out.ptr = getelementptr i8, ptr addrspace(1) %out.data, i32 %outOff

  ; operands passed from their first element: int[] k, byte[] counter,
  ; long[] subkeyHtbl, long[] state
  %key.ptr = getelementptr i8, ptr addrspace(1) %k, i32 %int.hdr
  %counter.ptr = getelementptr i8, ptr addrspace(1) %counter, i32 %byte.hdr
  %htbl.ptr = getelementptr i8, ptr addrspace(1) %subkeyHtbl, i32 %long.hdr
  %state.ptr = getelementptr i8, ptr addrspace(1) %state, i32 %long.hdr

  %processed = call i32 @"StubRoutines::galoisCounterMode_AESCrypt()"(
    ptr addrspace(1) nonnull %in.ptr, i64 %len.i64,
    ptr addrspace(1) nonnull %ct.ptr,
    ptr addrspace(1) nonnull %out.ptr,
    ptr addrspace(1) nonnull %key.ptr,
    ptr addrspace(1) nonnull %state.ptr,
    ptr addrspace(1) nonnull %htbl.ptr,
    ptr addrspace(1) nonnull %counter.ptr)
  ret i32 %processed
}

; External stub; takes source, destination, key and %r data addresses plus a
; 64-bit length, and returns an i32.
declare i32 @"StubRoutines::cipherBlockChaining_encryptAESCrypt()"(
    ptr addrspace(1) nonnull %src, ptr addrspace(1) nonnull %dest, ptr addrspace(1) %k,
    ptr addrspace(1) %r, i64 %len) argmemonly willreturn "gc-leaf-function" nounwind

; Wrapper around the CBC encrypt stub.  %in, %out and %r are addressed with
; the byte-array header size, %k with the int-array header size; the byte
; offsets apply to %in and %out only.  Returns the stub's i32 result.
; %this is not used.
define zing i32 @azul.CBC_encryptAESCrypt(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %in, i32 %in.ofs.bytes,
    i32 %len,
    ptr addrspace(1) nonnull %out, i32 %out.ofs.bytes,
    ptr addrspace(1) %k,
    ptr addrspace(1) %r)
  alwaysinline argmemonly willreturn "gc-leaf-function" nounwind "azul-late-inline"="0" {
 entry:
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  ; No range checks here: in, out and len were already checked in
  ; com.sun.crypto.provider.CipherBlockChaining.encrypt.
  %src.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %src.ptr = getelementptr i8, ptr addrspace(1) %src.data, i32 %in.ofs.bytes
  %dst.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %dst.ptr = getelementptr i8, ptr addrspace(1) %dst.data, i32 %out.ofs.bytes
  %key.ptr = getelementptr i8, ptr addrspace(1) %k, i32 %int.hdr
  %r.ptr = getelementptr i8, ptr addrspace(1) %r, i32 %byte.hdr

  ; len is a jint on the C side; it is widened by zero extension.
  %len.i64 = zext i32 %len to i64

  %processed = call i32 @"StubRoutines::cipherBlockChaining_encryptAESCrypt()"(
      ptr addrspace(1) %src.ptr, ptr addrspace(1) %dst.ptr,
      ptr addrspace(1) %key.ptr, ptr addrspace(1) %r.ptr, i64 %len.i64)
  ret i32 %processed
}

; External stub; takes source, destination, key and %r data addresses plus a
; 64-bit length, and returns an i32.
declare i32 @"StubRoutines::cipherBlockChaining_decryptAESCrypt()"(
    ptr addrspace(1) nonnull %src, ptr addrspace(1) nonnull %dest, ptr addrspace(1) %k,
    ptr addrspace(1) %r, i64 %len) argmemonly willreturn "gc-leaf-function" nounwind

; Wrapper around the CBC decrypt stub.  %in, %out and %r are addressed with
; the byte-array header size, %k with the int-array header size; the byte
; offsets apply to %in and %out only.  Returns the stub's i32 result.
; %this is not used.
define zing i32 @azul.CBC_decryptAESCrypt(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %in, i32 %in.ofs.bytes,
    i32 %len,
    ptr addrspace(1) nonnull %out, i32 %out.ofs.bytes,
    ptr addrspace(1) %k,
    ptr addrspace(1) %r)
  alwaysinline argmemonly willreturn "gc-leaf-function" nounwind "azul-late-inline"="0" {
 entry:
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  ; No range checks here: in, out and len were already checked in
  ; com.sun.crypto.provider.CipherBlockChaining.decrypt.
  %src.data = getelementptr i8, ptr addrspace(1) %in, i32 %byte.hdr
  %src.ptr = getelementptr i8, ptr addrspace(1) %src.data, i32 %in.ofs.bytes
  %dst.data = getelementptr i8, ptr addrspace(1) %out, i32 %byte.hdr
  %dst.ptr = getelementptr i8, ptr addrspace(1) %dst.data, i32 %out.ofs.bytes
  %key.ptr = getelementptr i8, ptr addrspace(1) %k, i32 %int.hdr
  %r.ptr = getelementptr i8, ptr addrspace(1) %r, i32 %byte.hdr

  ; len is a jint on the C side; it is widened by zero extension.
  %len.i64 = zext i32 %len to i64

  %processed = call i32 @"StubRoutines::cipherBlockChaining_decryptAESCrypt()"(
      ptr addrspace(1) %src.ptr, ptr addrspace(1) %dst.ptr,
      ptr addrspace(1) %key.ptr, ptr addrspace(1) %r.ptr, i64 %len.i64)
  ret i32 %processed
}

; External stub.  Arguments: source data, destination data, key data,
; counter data, 64-bit length, encrypted-counter data, and %u -- the address
; of a field inside the calling object (see %used.adr in the wrapper below).
declare i32 @"StubRoutines::counterMode_AESCrypt()"(
    ptr addrspace(1) %src, ptr addrspace(1) %dest,
    ptr addrspace(1) %k, ptr addrspace(1) %cnt,
    i64 %len, ptr addrspace(1) %ecnt,
    ptr addrspace(1) %u)
  argmemonly nounwind willreturn "gc-leaf-function"

; Wrapper around the counter-mode stub.  %in, %out, %cnt and %ecnt are
; addressed with the byte-array header size, %k with the int-array header
; size; the byte offsets apply to %in and %out only.  %used.ofs.bytes is the
; byte offset, inside %this, of the field whose address is passed as the
; stub's last argument.  Returns the stub's i32 result.
;
; %this deliberately carries no 'readonly' attribute: a pointer derived from
; it is handed to the stub, and the stub's %u parameter gives no guarantee
; that nothing is stored through it (the address, not the value, is passed --
; presumably so the stub can update the field).  Writing through a pointer
; based on a 'readonly' argument is undefined behaviour.
define zing i32 @azul.counterMode_AESCrypt(
    ptr addrspace(1) %this, ptr addrspace(1) %in, i32 %in.ofs.bytes,
    i32 %len, ptr addrspace(1) %out, i32 %out.ofs.bytes, ptr addrspace(1) %k,
    ptr addrspace(1) %cnt, ptr addrspace(1) %ecnt, i32 %used.ofs.bytes)
  nounwind alwaysinline "azul-late-inline"="0" willreturn "gc-leaf-function"
{
 entry:
  %byteArrayHeaderSize = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %intArrayHeaderSize = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; in,out,len have already been range checked in com.sun.crypto.provider.CounterMode.crypt
  %in.base = getelementptr i8, ptr addrspace(1) %in, i32 %byteArrayHeaderSize
  %in.start = getelementptr i8, ptr addrspace(1) %in.base, i32 %in.ofs.bytes
  %out.base = getelementptr i8, ptr addrspace(1) %out, i32 %byteArrayHeaderSize
  %out.start = getelementptr i8, ptr addrspace(1) %out.base, i32 %out.ofs.bytes
  %len.zext = zext i32 %len to i64 ; len is jint in C

  %k.base = getelementptr i8, ptr addrspace(1) %k, i32 %intArrayHeaderSize
  %cnt.base = getelementptr i8, ptr addrspace(1) %cnt, i32 %byteArrayHeaderSize
  %ecnt.base = getelementptr i8, ptr addrspace(1) %ecnt, i32 %byteArrayHeaderSize
  ; Address of the field at %used.ofs.bytes inside the receiver object.
  %used.adr = getelementptr i8, ptr addrspace(1) %this, i32 %used.ofs.bytes

  %res = call i32 @"StubRoutines::counterMode_AESCrypt()"(
      ptr addrspace(1) %in.start, ptr addrspace(1) %out.start,
      ptr addrspace(1) %k.base, ptr addrspace(1) %cnt.base,
      i64 %len.zext, ptr addrspace(1) %ecnt.base,
      ptr addrspace(1) %used.adr)
  ret i32 %res
}

; External stub; takes the data address inside the input buffer and the data
; address of the state array.
declare void @"StubRoutines::md5_implCompress()"(
    ptr addrspace(1) %buf, ptr addrspace(1) %state)
  argmemonly willreturn "gc-leaf-function" nounwind

; private void sun.security.provider.MD5.implCompress0(byte[] buf, int ofs, int[] st)
; Passes &buf[ofs] (byte-array header skipped) and the first element of st
; (int-array header skipped) to the stub.  %this is not used.
define zing void @_md5_implCompress(
    ptr addrspace(1) readonly %this,
    ptr addrspace(1) %buf,
    i32 %ofs,
    ptr addrspace(1) %st) alwaysinline argmemonly {
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  %state.ptr = getelementptr i8, ptr addrspace(1) %st, i32 %int.hdr
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %byte.hdr
  %buf.ptr = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  call void @"StubRoutines::md5_implCompress()"(
      ptr addrspace(1) %buf.ptr, ptr addrspace(1) %state.ptr)
  ret void
}

; External stub; takes the data address inside the input buffer, the data
; address of the state array, and offset and limit as 64-bit values.
; Returns an i32.
declare i32 @"StubRoutines::md5_implCompressMB()"(
    ptr addrspace(1) nonnull %buf, ptr addrspace(1) nonnull %state, i64 %ofs, i64 %limit)
  argmemonly nounwind willreturn "gc-leaf-function"

; private int sun.security.provider.DigestBase.implCompressMultiBlockMD5(byte[] buf, int ofs, int limit, int[] st)
; Passes &buf[ofs], the first element of st, and the zero-extended ofs and
; limit to the stub; the stub's i32 result is returned.  %this is not used.
define zing i32 @_md5_digestBase_implCompressMB(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %buf,
    i32 %ofs,
    i32 %lim,
    ptr addrspace(1) nonnull %st) alwaysinline argmemonly nounwind {
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; ofs and limit are jint on the C side; both are widened by zero extension.
  %ofs.i64 = zext i32 %ofs to i64
  %lim.i64 = zext i32 %lim to i64

  ; No range checks here: buf, ofs and lim were already checked in
  ; sun.security.provider.DigestBase.implCompressMultiBlockCheck.
  %state.ptr = getelementptr i8, ptr addrspace(1) %st, i32 %int.hdr
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %byte.hdr
  %buf.ptr = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  %new.ofs = call i32 @"StubRoutines::md5_implCompressMB()"(
      ptr addrspace(1) %buf.ptr, ptr addrspace(1) %state.ptr,
      i64 %ofs.i64, i64 %lim.i64)
  ret i32 %new.ofs
}

; External stub; takes the data address inside the input buffer and the data
; address of the state array.
declare void @"StubRoutines::sha1_implCompress()"(
    ptr addrspace(1) %buf, ptr addrspace(1) %state)
  argmemonly willreturn "gc-leaf-function" nounwind

; private void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs, int[] st)
; Passes &buf[ofs] (byte-array header skipped) and the first element of st
; (int-array header skipped) to the stub.  %this is not used.
define zing void @_sha_implCompress(
    ptr addrspace(1) readonly %this,
    ptr addrspace(1) %buf,
    i32 %ofs,
    ptr addrspace(1) %st) alwaysinline argmemonly {
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  %state.ptr = getelementptr i8, ptr addrspace(1) %st, i32 %int.hdr
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %byte.hdr
  %buf.ptr = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  call void @"StubRoutines::sha1_implCompress()"(
      ptr addrspace(1) %buf.ptr, ptr addrspace(1) %state.ptr)
  ret void
}

; External stub; takes the data address inside the input buffer, the data
; address of the state array, and offset and limit as 64-bit values.
; Returns an i32.
declare i32 @"StubRoutines::sha1_implCompressMB()"(
    ptr addrspace(1) nonnull %buf, ptr addrspace(1) nonnull %state, i64 %ofs, i64 %limit)
  argmemonly nounwind willreturn "gc-leaf-function"

; private int sun.security.provider.DigestBase.implCompressMultiBlockSHA1(byte[] buf, int ofs, int limit, int[] st)
; Passes &buf[ofs], the first element of st, and the zero-extended ofs and
; limit to the stub; the stub's i32 result is returned.  %this is not used.
define zing i32 @_sha_digestBase_implCompressMB(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %buf,
    i32 %ofs,
    i32 %lim,
    ptr addrspace(1) nonnull %st) alwaysinline argmemonly nounwind {
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; ofs and limit are jint on the C side; both are widened by zero extension.
  %ofs.i64 = zext i32 %ofs to i64
  %lim.i64 = zext i32 %lim to i64

  ; No range checks here: buf, ofs and lim were already checked in
  ; sun.security.provider.DigestBase.implCompressMultiBlockCheck.
  %state.ptr = getelementptr i8, ptr addrspace(1) %st, i32 %int.hdr
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %byte.hdr
  %buf.ptr = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  %new.ofs = call i32 @"StubRoutines::sha1_implCompressMB()"(
      ptr addrspace(1) %buf.ptr, ptr addrspace(1) %state.ptr,
      i64 %ofs.i64, i64 %lim.i64)
  ret i32 %new.ofs
}

; External stub; takes the data address inside the input buffer and the data
; address of the state array.
declare void @"StubRoutines::sha256_implCompress()"(
    ptr addrspace(1) %buf, ptr addrspace(1) %state)
  argmemonly willreturn "gc-leaf-function" nounwind

; private void sun.security.provider.SHA2.implCompress0(byte[] buf, int ofs, int[] st)
; Passes &buf[ofs] (byte-array header skipped) and the first element of st
; (int-array header skipped) to the stub.  %this is not used.
define zing void @_sha2_implCompress(
    ptr addrspace(1) readonly %this,
    ptr addrspace(1) %buf,
    i32 %ofs,
    ptr addrspace(1) %st) alwaysinline argmemonly {
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  %state.ptr = getelementptr i8, ptr addrspace(1) %st, i32 %int.hdr
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %byte.hdr
  %buf.ptr = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  call void @"StubRoutines::sha256_implCompress()"(
      ptr addrspace(1) %buf.ptr, ptr addrspace(1) %state.ptr)
  ret void
}

; External stub; takes the data address inside the input buffer, the data
; address of the state array, and offset and limit as 64-bit values.
; Returns an i32.
declare i32 @"StubRoutines::sha256_implCompressMB()"(
    ptr addrspace(1) nonnull %buf, ptr addrspace(1) nonnull %state, i64 %ofs, i64 %limit)
  argmemonly nounwind willreturn "gc-leaf-function"

; private int sun.security.provider.DigestBase.implCompressMultiBlockSHA2(byte[] buf, int ofs, int limit, int[] st)
; Passes &buf[ofs], the first element of st, and the zero-extended ofs and
; limit to the stub; the stub's i32 result is returned.  %this is not used.
define zing i32 @_sha2_digestBase_implCompressMB(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %buf,
    i32 %ofs,
    i32 %lim,
    ptr addrspace(1) nonnull %st) alwaysinline argmemonly nounwind {
 entry:
  %byte.hdr = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %int.hdr = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; ofs and limit are jint on the C side; both are widened by zero extension.
  %ofs.i64 = zext i32 %ofs to i64
  %lim.i64 = zext i32 %lim to i64

  ; No range checks here: buf, ofs and lim were already checked in
  ; sun.security.provider.DigestBase.implCompressMultiBlockCheck.
  %state.ptr = getelementptr i8, ptr addrspace(1) %st, i32 %int.hdr
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %byte.hdr
  %buf.ptr = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  %new.ofs = call i32 @"StubRoutines::sha256_implCompressMB()"(
      ptr addrspace(1) %buf.ptr, ptr addrspace(1) %state.ptr,
      i64 %ofs.i64, i64 %lim.i64)
  ret i32 %new.ofs
}

; Raw stub behind @_sha5_implCompress.  Both pointers are passed with the
; array headers already skipped.
declare void @"StubRoutines::sha512_implCompress()"(
    ptr addrspace(1) nonnull %buf,
    ptr addrspace(1) nonnull %state)
  argmemonly nounwind willreturn "gc-leaf-function"

; private void sun.security.provider.SHA5.implCompress0(byte[] buf, int ofs, long[] st)
;
; Skips the byte-array header of buf and the long-array header of st, advances
; the buf pointer by ofs bytes and calls the stub.  %this is not used.
define zing void @_sha5_implCompress(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %buf,
    i32 %ofs,
    ptr addrspace(1) nonnull %st) alwaysinline argmemonly nounwind {
 entry:
  %hdr.long = load i32, ptr @arrayOopDesc.longArrayHeaderSize
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize

  ; No range checks are needed here: buf and ofs were already validated in
  ; sun.security.provider.SHA5.implCompressCheck.
  %st.data = getelementptr i8, ptr addrspace(1) %st, i32 %hdr.long
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %hdr.byte
  %buf.at.ofs = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  call void @"StubRoutines::sha512_implCompress()"(
      ptr addrspace(1) %buf.at.ofs,
      ptr addrspace(1) %st.data)
  ret void
}

; Raw stub behind @_sha5_digestBase_implCompressMB.  Takes header-adjusted
; pointers plus the offset and limit widened to i64; returns an i32.
declare i32 @"StubRoutines::sha512_implCompressMB()"(
    ptr addrspace(1) nonnull %buf,
    ptr addrspace(1) nonnull %state,
    i64 %ofs,
    i64 %limit)
  argmemonly nounwind willreturn "gc-leaf-function"

; private int sun.security.provider.SHA5.implCompressMultiBlockSHA5(byte[] buf, int ofs, int limit, long[] st)
;
; Forwards to the stub and returns the stub's result unchanged.  %this is not
; used.
define zing i32 @_sha5_digestBase_implCompressMB(
    ptr addrspace(1) readnone %this,
    ptr addrspace(1) nonnull %buf,
    i32 %ofs,
    i32 %lim,
    ptr addrspace(1) nonnull %st) alwaysinline argmemonly nounwind {
 entry:
  %hdr.long = load i32, ptr @arrayOopDesc.longArrayHeaderSize
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize

  ; The stub's integer parameters are i64; ofs and limit are jint in C, so
  ; they are zero-extended.
  %ofs.wide = zext i32 %ofs to i64
  %lim.wide = zext i32 %lim to i64

  ; No range checks are needed here: buf, ofs and lim were already validated in
  ; sun.security.provider.DigestBase.implCompressMultiBlockCheck.
  %st.data = getelementptr i8, ptr addrspace(1) %st, i32 %hdr.long
  %buf.data = getelementptr i8, ptr addrspace(1) %buf, i32 %hdr.byte
  %buf.at.ofs = getelementptr i8, ptr addrspace(1) %buf.data, i32 %ofs

  %stub.ret = call i32 @"StubRoutines::sha512_implCompressMB()"(
      ptr addrspace(1) %buf.at.ofs,
      ptr addrspace(1) %st.data,
      i64 %ofs.wide,
      i64 %lim.wide)
  ret i32 %stub.ret
}

; Raw stub behind @_sha3_implCompress.  Receives header-adjusted pointers, the
; block size read from the receiver object, and the buffer offset as i64.
declare void @"StubRoutines::sha3_implCompress()"(
  ptr addrspace(1) nonnull %buf,
  ptr addrspace(1) nonnull %state,
  i64 %blockSize,
  i64 %offset) argmemonly nounwind willreturn "gc-leaf-function"

; Wrapper that prepares the stub arguments from the Java-level ones:
;   %this   - receiver; its blockSize field is read at the offset given by
;             @sun_security_provider_DigestBase.blockSize_offset_in_bytes
;   %buffer - byte array; the stub gets the address of element %offset
;   %offset - also passed on to the stub, zero-extended
;   %state  - state array; the stub gets the address past its header
define void @_sha3_implCompress(
  ptr addrspace(1) nonnull %this,
  ptr addrspace(1) nonnull %buffer,
  i32 %offset,
  ptr addrspace(1) nonnull %state
) alwaysinline argmemonly nounwind {
entry:
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %ofs.wide = zext i32 %offset to i64

  ; &buffer[offset]
  %buf.data = getelementptr i8, ptr addrspace(1) %buffer, i32 %hdr.byte
  %buf.at.ofs = getelementptr i8, ptr addrspace(1) %buf.data, i32 %offset

  ; NOTE(review): the state array header is skipped using the *byte* array
  ; header size - confirm this matches the element type of the Java state
  ; array on every supported JDK.
  %state.data = getelementptr i8, ptr addrspace(1) %state, i32 %hdr.byte

  ; Read the blockSize field (i32) out of the receiver and widen it.
  %bs.field.ofs = load i32, ptr @sun_security_provider_DigestBase.blockSize_offset_in_bytes
  %bs.field = getelementptr inbounds i8, ptr addrspace(1) %this, i32 %bs.field.ofs
  %bs = load i32, ptr addrspace(1) %bs.field
  %bs.wide = zext i32 %bs to i64

  call void @"StubRoutines::sha3_implCompress()"(
      ptr addrspace(1) %buf.at.ofs,
      ptr addrspace(1) %state.data,
      i64 %bs.wide,
      i64 %ofs.wide)
  ret void
}

; Raw stub behind @_sha3_digestBase_implCompressMB.  Receives header-adjusted
; pointers, the block size read from the receiver object, and the offset and
; limit as i64; returns an i32.
declare i32 @"StubRoutines::sha3_implCompressMB()"(
  ptr addrspace(1) nonnull %buf,
  ptr addrspace(1) nonnull %state,
  i64 %blockSize,
  i64 %offset,
  i64 %lim) argmemonly nounwind willreturn "gc-leaf-function"

; Wrapper that prepares the stub arguments from the Java-level ones and
; returns the stub's result unchanged:
;   %this   - receiver; its blockSize field is read at the offset given by
;             @sun_security_provider_DigestBase.blockSize_offset_in_bytes
;   %buffer - byte array; the stub gets the address of element %offset
;   %offset, %limit - also passed on to the stub, zero-extended
;   %state  - state array; the stub gets the address past its header
define i32 @_sha3_digestBase_implCompressMB(
  ptr addrspace(1) nonnull %this,
  ptr addrspace(1) nonnull %buffer,
  i32 %offset,
  i32 %limit,
  ptr addrspace(1) nonnull %state
) alwaysinline argmemonly nounwind {
entry:
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %ofs.wide = zext i32 %offset to i64
  %lim.wide = zext i32 %limit to i64

  ; &buffer[offset]
  %buf.data = getelementptr i8, ptr addrspace(1) %buffer, i32 %hdr.byte
  %buf.at.ofs = getelementptr i8, ptr addrspace(1) %buf.data, i32 %offset

  ; NOTE(review): the state array header is skipped using the *byte* array
  ; header size - confirm this matches the element type of the Java state
  ; array on every supported JDK.
  %state.data = getelementptr i8, ptr addrspace(1) %state, i32 %hdr.byte

  ; Read the blockSize field (i32) out of the receiver and widen it.
  %bs.field.ofs = load i32, ptr @sun_security_provider_DigestBase.blockSize_offset_in_bytes
  %bs.field = getelementptr inbounds i8, ptr addrspace(1) %this, i32 %bs.field.ofs
  %bs = load i32, ptr addrspace(1) %bs.field
  %bs.wide = zext i32 %bs to i64

  %stub.ret = call i32 @"StubRoutines::sha3_implCompressMB()"(
      ptr addrspace(1) %buf.at.ofs,
      ptr addrspace(1) %state.data,
      i64 %bs.wide,
      i64 %ofs.wide,
      i64 %lim.wide)
  ret i32 %stub.ret
}


; Raw stub behind @_multiplyToLen.  All pointers are header-adjusted and all
; lengths are passed as i64.
declare void @"StubRoutines::multiplyToLen()"(
    ptr addrspace(1) readonly %xstart,
    i64 %xlen,
    ptr addrspace(1) readonly %ystart,
    i64 %ylen,
    ptr addrspace(1) %zstart,
    i64 %zlen) argmemonly nounwind willreturn "gc-leaf-function"

; private static int[] implMultiplyToLen(int[] x, int xlen, int[] y, int ylen, int[] z)
;
; Passes the three arrays (past their int-array headers), the two supplied
; lengths and the actual length of z to the stub, then returns z itself.
define zing ptr addrspace(1) @_multiplyToLen(
  ptr addrspace(1) readonly %x, i32 %xlen,
  ptr addrspace(1) readonly %y, i32 %ylen,
  ptr addrspace(1) %z)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.int = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; The length of z is not a Java argument; query it from the array.
  %z.len = call i32 @azul.array_length(ptr addrspace(1) %z)

  %x.data = getelementptr i8, ptr addrspace(1) %x, i32 %hdr.int
  %y.data = getelementptr i8, ptr addrspace(1) %y, i32 %hdr.int
  %z.data = getelementptr i8, ptr addrspace(1) %z, i32 %hdr.int

  %xlen.wide = zext i32 %xlen to i64
  %ylen.wide = zext i32 %ylen to i64
  %zlen.wide = zext i32 %z.len to i64

  call void @"StubRoutines::multiplyToLen()"(
      ptr addrspace(1) %x.data,
      i64 %xlen.wide,
      ptr addrspace(1) %y.data,
      i64 %ylen.wide,
      ptr addrspace(1) %z.data,
      i64 %zlen.wide)
  ret ptr addrspace(1) %z
}

; Raw stub behind @_squareToLen.  Both pointers are header-adjusted and both
; lengths are passed as i64.
declare void @"StubRoutines::squareToLen()"(
    ptr addrspace(1) readonly %xstart,
    i64 %xlen,
    ptr addrspace(1) %zstart,
    i64 %zlen) argmemonly willreturn "gc-leaf-function" nounwind

; private static final int[] implSquareToLen(int[] x, int len, int[] z, int zlen)
;
; Passes both arrays (past their int-array headers) and both lengths to the
; stub, then returns z itself.
define zing ptr addrspace(1) @_squareToLen(
  ptr addrspace(1) readonly %x, i32 %xlen,
  ptr addrspace(1) %z, i32 %zlen)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.int = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  %xlen.wide = zext i32 %xlen to i64
  %zlen.wide = zext i32 %zlen to i64
  %x.data = getelementptr i8, ptr addrspace(1) %x, i32 %hdr.int
  %z.data = getelementptr i8, ptr addrspace(1) %z, i32 %hdr.int

  call void @"StubRoutines::squareToLen()"(
      ptr addrspace(1) %x.data,
      i64 %xlen.wide,
      ptr addrspace(1) %z.data,
      i64 %zlen.wide)
  ret ptr addrspace(1) %z
}

; Raw stub behind @_mulAdd.  The %offset it expects is NOT the Java-level
; offset: it must be (out.length - real_offset).
declare i32 @"StubRoutines::mulAdd()"(
    ptr addrspace(1) %out_start,
    ptr addrspace(1) readonly %in_start,
    i64 %offset, i64 %len, i64 %k)
        argmemonly nounwind willreturn "gc-leaf-function"

; private static int implMulAdd(int[] out, int[] in, int offset, int len, int k)
;
; Converts the Java offset into the form the stub expects, skips the int-array
; headers of both arrays and returns the stub's i32 result.
define zing i32 @_mulAdd(
  ptr addrspace(1) %out, ptr addrspace(1) readonly %in,
  i32 %offset, i32 %len, i32 %k)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.int = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  %in.data = getelementptr i8, ptr addrspace(1) %in, i32 %hdr.int
  %out.data = getelementptr i8, ptr addrspace(1) %out, i32 %hdr.int

  ; stub offset = out.length - offset; the nsw/nuw flags assert that the
  ; subtraction cannot wrap.
  %out.len = call i32 @azul.array_length(ptr addrspace(1) %out)
  %stub.offset = sub nsw nuw i32 %out.len, %offset

  %stub.offset.wide = zext i32 %stub.offset to i64
  %len.wide = zext i32 %len to i64
  %k.wide = zext i32 %k to i64

  %stub.ret = call i32 @"StubRoutines::mulAdd()"(
      ptr addrspace(1) %out.data,
      ptr addrspace(1) %in.data,
      i64 %stub.offset.wide, i64 %len.wide,
      i64 %k.wide)

  ret i32 %stub.ret
}

; Raw stub behind @_montgomeryMultiply.  All pointers are header-adjusted; len
; is widened to i64 and inv is passed through as-is.
declare void @"StubRoutines::montgomeryMultiply()"(
  ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %n,
  i64 %len, i64 %inv, ptr addrspace(1) %m) argmemonly willreturn "gc-leaf-function" nounwind

; private static int[] implMontgomeryMultiply(int[] a, int[] b, int[] n, int len, long inv, int[] product)
;
; Skips the int-array header of each of the four arrays, calls the stub and
; returns the product array %m itself.
define zing ptr addrspace(1) @_montgomeryMultiply(
  ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %n,
  i32 %len, i64 %inv, ptr addrspace(1) %m)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.int = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; len is jint in C, so it is zero-extended for the stub.
  %len.wide = zext i32 %len to i64

  ; No range checks are needed here: a, b, n and m were already validated in
  ; java.math.BigInteger.implMontgomeryMultiplyChecks.
  %a.data = getelementptr i8, ptr addrspace(1) %a, i32 %hdr.int
  %b.data = getelementptr i8, ptr addrspace(1) %b, i32 %hdr.int
  %n.data = getelementptr i8, ptr addrspace(1) %n, i32 %hdr.int
  %m.data = getelementptr i8, ptr addrspace(1) %m, i32 %hdr.int

  call void @"StubRoutines::montgomeryMultiply()"(
      ptr addrspace(1) %a.data,
      ptr addrspace(1) %b.data,
      ptr addrspace(1) %n.data,
      i64 %len.wide,
      i64 %inv,
      ptr addrspace(1) %m.data)
  ret ptr addrspace(1) %m
}

; Raw stub behind @_montgomerySquare.  All pointers are header-adjusted; len
; is widened to i64 and inv is passed through as-is.
declare void @"StubRoutines::montgomerySquare()"(
  ptr addrspace(1) %a, ptr addrspace(1) %n,
  i64 %len, i64 %inv, ptr addrspace(1) %m) argmemonly willreturn "gc-leaf-function" nounwind

; private static int[] implMontgomerySquare(int[] a, int[] n, int len, long inv, int[] product)
;
; Skips the int-array header of each of the three arrays, calls the stub and
; returns the product array %m itself.
define zing ptr addrspace(1) @_montgomerySquare(
  ptr addrspace(1) %a, ptr addrspace(1) %n, i32 %len, i64 %inv,
  ptr addrspace(1) %m)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.int = load i32, ptr @arrayOopDesc.intArrayHeaderSize

  ; len is jint in C, so it is zero-extended for the stub.
  %len.wide = zext i32 %len to i64

  ; No range checks are needed here: a, n and m were already validated in
  ; java.math.BigInteger.implMontgomeryMultiplyChecks.
  %a.data = getelementptr i8, ptr addrspace(1) %a, i32 %hdr.int
  %n.data = getelementptr i8, ptr addrspace(1) %n, i32 %hdr.int
  %m.data = getelementptr i8, ptr addrspace(1) %m, i32 %hdr.int

  call void @"StubRoutines::montgomerySquare()"(
      ptr addrspace(1) %a.data,
      ptr addrspace(1) %n.data,
      i64 %len.wide,
      i64 %inv,
      ptr addrspace(1) %m.data)

  ret ptr addrspace(1) %m
}

; Out-of-line stub used by @_vectorizedMismatch whenever the inline path below
; is not taken.  It receives the already offset-adjusted addresses plus the
; zero-extended length and log2 element scale.
declare i32 @"StubRoutines::vectorizedMismatch()"(
  ptr addrspace(1) %a, ptr addrspace(1) %b,
  i64 %length, i64 %log_2_array_index_scale
) argmemonly willreturn "gc-leaf-function" nounwind
; public static int vectorizedMismatch(Object a, long a_offset,
;                                      Object b, long b_offset,
;                                      int length, int log_2_array_index_scale)
;
; If @FalconPartialInlineVectorizedMismatch is set and the compared range
; (length << log_2_array_index_scale) is shorter than 32 bytes, the comparison
; is done inline with one masked 32-byte load per operand.  In every other
; case the call is forwarded to the stub and the stub's result is returned.
;
; The inline path returns -1 when all compared bytes are equal; otherwise it
; returns the index of the first differing byte shifted right by
; log_2_array_index_scale.
define zing i32 @_vectorizedMismatch(
    ptr addrspace(1) %a, i64 %a_offset,
    ptr addrspace(1) %b, i64 %b_offset,
    i32 %length, i32 %log_2_array_index_scale
) "alwaysinline-top-level" argmemonly "min-legal-vector-width"="512" {
entry:
    ; Addresses of the first bytes to compare: object base + byte offset.
    %a.start = getelementptr i8, ptr addrspace(1) %a, i64 %a_offset
    %b.start = getelementptr i8, ptr addrspace(1) %b, i64 %b_offset
    %length.zext = zext i32 %length to i64
    %log_2_array_index_scale.zext = zext i32 %log_2_array_index_scale to i64
    %should_pi = load i1, ptr @FalconPartialInlineVectorizedMismatch

    br i1 %should_pi, label %len_check, label %stub

len_check:
    %byte_len = shl i64 %length.zext, %log_2_array_index_scale.zext ; i64 to avoid overflow
    %cmp = icmp ult i64 %byte_len, 32
    br i1 %cmp, label %partial, label %stub

partial:
    %byte_len.trunc = trunc i64 %byte_len to i32 ; byte_len < 32, so the following is correct
    ; %scalar_mask has its low byte_len bits set: one bit per byte to compare.
    %len_shifted = shl i32 1, %byte_len.trunc
    %scalar_mask = sub i32 %len_shifted, 1
    %vector_mask = bitcast i32 %scalar_mask to <32 x i1>
    ; Masked loads only touch the lanes selected by the mask; the remaining
    ; lanes take the poison pass-through value.
    %a_vec = call <32 x i8> @llvm.masked.load.v32i8.p1(ptr addrspace(1) %a.start, i32 1, <32 x i1> %vector_mask, <32 x i8> poison)
    %b_vec = call <32 x i8> @llvm.masked.load.v32i8.p1(ptr addrspace(1) %b.start, i32 1, <32 x i1> %vector_mask, <32 x i8> poison)
    %vcmp = icmp eq <32 x i8> %a_vec, %b_vec
    ; %vcmp has poison elements at places not included in the %vector_mask, so use 'select'
    ; instead of 'and' to get rid of them. Otherwise, the following bitcast would produce poison.
    %mcmp = select <32 x i1> %vector_mask, <32 x i1> %vcmp, <32 x i1> %vector_mask
    %cmp_sc = bitcast <32 x i1> %mcmp to i32 ; llvm IR and x86 are little-endian
    ; Every masked-in lane compared equal iff the result bits equal the mask.
    %compare_masks = icmp eq i32 %scalar_mask, %cmp_sc
    br i1 %compare_masks, label %all_equal, label %continue

continue:
    ; Lanes outside the mask are 0 in %cmp_sc and therefore 1 after inversion;
    ; inside the mask a 1 after inversion marks a mismatch.  At least one such
    ; bit exists on this path, so cttz yields the first mismatching byte index.
    %cmp_inv = xor i32 %cmp_sc, -1
    %result = call i32 @llvm.cttz.i32(i32 %cmp_inv, i1 0)
    ; Convert the byte index into an element index.
    %res_shifted = lshr i32 %result, %log_2_array_index_scale

    ret i32 %res_shifted

all_equal:
    ret i32 -1

stub:
    %res = call i32 @"StubRoutines::vectorizedMismatch()"(
        ptr addrspace(1) %a.start,
        ptr addrspace(1) %b.start,
        i64 %length.zext,
        i64 %log_2_array_index_scale.zext
    )
    ret i32 %res
}

; Raw stub behind @_base64_decodeBlock.  Both pointers are header-adjusted and
; all integer/boolean arguments are passed as i64.
declare i32 @"StubRoutines::base64_decodeBlock()"(
  ptr addrspace(1) %src, i64 %sp, i64 %sl,
  ptr addrspace(1) %dst, i64 %dp, i64 %isURL, i64 %isMIME)
argmemonly willreturn "gc-leaf-function" nounwind

; private int decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
;
; Skips the byte-array headers of src and dst, zero-extends every i32 argument
; to i64 and returns the stub's i32 result unchanged.  %this_obj is not used.
;
; Written with opaque pointers (ptr) like the rest of this file; the legacy
; typed-pointer spelling (i8 addrspace(1)*, i32*) denotes the same type under
; opaque pointers.
define zing i32 @_base64_decodeBlock(
  ptr addrspace(1) %this_obj,
  ptr addrspace(1) %src, i32 %sp, i32 %sl,
  ptr addrspace(1) %dst, i32 %dp, i32 %isURL, i32 %isMIME)
"alwaysinline-top-level" argmemonly {
entry:
  %byteArrayHeaderSize = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %src.start = getelementptr i8, ptr addrspace(1) %src, i32 %byteArrayHeaderSize
  %sp.i64 = zext i32 %sp to i64
  %sl.i64 = zext i32 %sl to i64
  %dst.start = getelementptr i8, ptr addrspace(1) %dst, i32 %byteArrayHeaderSize
  %dp.i64 = zext i32 %dp to i64
  %isURL.i64 = zext i32 %isURL to i64
  %isMIME.i64 = zext i32 %isMIME to i64

  %ret_val = call i32 @"StubRoutines::base64_decodeBlock()"(
                                    ptr addrspace(1) %src.start,
                                    i64 %sp.i64,
                                    i64 %sl.i64,
                                    ptr addrspace(1) %dst.start,
                                    i64 %dp.i64,
                                    i64 %isURL.i64,
                                    i64 %isMIME.i64)
  ret i32 %ret_val
}

; Raw stub behind @_base64_encodeBlock.  Both pointers are header-adjusted and
; all integer/boolean arguments are passed as i64.
declare void @"StubRoutines::base64_encodeBlock()"(
  ptr addrspace(1) %src, i64 %sp, i64 %sl, ptr addrspace(1) %dst, i64 %dp, i64 %isURL)
argmemonly willreturn "gc-leaf-function" nounwind

; private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
;
; Skips the byte-array headers of src and dst, zero-extends every i32 argument
; to i64 and calls the stub.  %this_obj is not used.
define zing void @_base64_encodeBlock(
  ptr addrspace(1) %this_obj,
  ptr addrspace(1) %src, i32 %sp, i32 %sl,
  ptr addrspace(1) %dst, i32 %dp, i32 %isURL)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize

  ; Array pointers past their headers.
  %src.data = getelementptr i8, ptr addrspace(1) %src, i32 %hdr.byte
  %dst.data = getelementptr i8, ptr addrspace(1) %dst, i32 %hdr.byte

  ; Scalars widened for the stub.
  %sp.wide = zext i32 %sp to i64
  %sl.wide = zext i32 %sl to i64
  %dp.wide = zext i32 %dp to i64
  %isURL.wide = zext i32 %isURL to i64

  call void @"StubRoutines::base64_encodeBlock()"(
      ptr addrspace(1) %src.data,
      i64 %sp.wide,
      i64 %sl.wide,
      ptr addrspace(1) %dst.data,
      i64 %dp.wide,
      i64 %isURL.wide)
  ret void
}

; Raw stub behind @_ghash_processBlocks.  Note the argument order differs from
; the Java method: state, subH, data, then the block count as i64.
declare void @"StubRoutines::ghash_processBlocks()"(
  ptr addrspace(1) %st, ptr addrspace(1) %subH, ptr addrspace(1) %data, i64 %len)
argmemonly willreturn "gc-leaf-function" nounwind

; private static void processBlocks(byte[] data, int inOfs, int blocks, long[] st, long[] subH)
;
; Skips the array headers, advances the data pointer by inOfs bytes and calls
; the stub with the arguments reordered to match its signature.
define zing void @_ghash_processBlocks(
  ptr addrspace(1) %data, i32 %inOfs, i32 %blocks,
  ptr addrspace(1) %st, ptr addrspace(1) %subH)
"alwaysinline-top-level" argmemonly {
entry:
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %hdr.long = load i32, ptr @arrayOopDesc.longArrayHeaderSize

  ; len is jint in C, so the block count is zero-extended.
  %blocks.wide = zext i32 %blocks to i64

  ; No range checks are needed here: data, state and subkeyH were already
  ; validated in com.sun.crypto.provider.GHASH.ghashRangeCheck.
  %data.payload = getelementptr i8, ptr addrspace(1) %data, i32 %hdr.byte
  %data.at.ofs = getelementptr i8, ptr addrspace(1) %data.payload, i32 %inOfs
  %st.data = getelementptr i8, ptr addrspace(1) %st, i32 %hdr.long
  %subH.data = getelementptr i8, ptr addrspace(1) %subH, i32 %hdr.long

  call void @"StubRoutines::ghash_processBlocks()"(
      ptr addrspace(1) %st.data,
      ptr addrspace(1) %subH.data,
      ptr addrspace(1) %data.at.ofs,
      i64 %blocks.wide)
  ret void
}

; Raw stub behind @_chacha20Block.  Both pointers are header-adjusted; returns
; an i32.
declare i32 @"StubRoutines::chacha20Block()"(
  ptr addrspace(1) %initState,
  ptr addrspace(1) %result
) argmemonly willreturn "gc-leaf-function" nounwind

; private static int implChaCha20Block(int[] initState, byte[] result)
;
; Skips the int-array header of initState and the byte-array header of result,
; then returns the stub's i32 result unchanged.
define zing i32 @_chacha20Block(
  ptr addrspace(1) %initState,
  ptr addrspace(1) %result
) "alwaysinline-top-level" argmemonly {
entry:
  %hdr.int = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize

  %state.data = getelementptr inbounds i8, ptr addrspace(1) %initState, i32 %hdr.int
  %result.data = getelementptr i8, ptr addrspace(1) %result, i32 %hdr.byte

  %stub.ret = call i32 @"StubRoutines::chacha20Block()"(
    ptr addrspace(1) %state.data,
    ptr addrspace(1) %result.data)
  ret i32 %stub.ret
}

; Raw stub behind @_poly1305_processBlocks.  All pointers are header-adjusted
; (the input pointer also includes the caller's offset); length is i64.
declare void @"StubRoutines::poly1305_processBlocks()"(
    ptr addrspace(1) %input,
    i64 %length,
    ptr addrspace(1) %aLimbs,
    ptr addrspace(1) %rLimbs
) willreturn "gc-leaf-function" nounwind

; private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
;
; Skips the array headers, advances the input pointer by offset bytes and
; calls the stub.  %this_obj is not used.
define zing void @_poly1305_processBlocks(
    ptr addrspace(1) %this_obj,
    ptr addrspace(1) %input,
    i32 %offset,
    i32 %length,
    ptr addrspace(1) %aLimbs,
    ptr addrspace(1) %rLimbs
) "alwaysinline-top-level" argmemonly {
entry:
    %hdr.byte = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
    %hdr.long = load i32, ptr @arrayOopDesc.longArrayHeaderSize

    ; &input[offset]
    %input.data = getelementptr i8, ptr addrspace(1) %input, i32 %hdr.byte
    %input.at.ofs = getelementptr i8, ptr addrspace(1) %input.data, i32 %offset

    ; long[] arrays past their headers
    %aLimbs.data = getelementptr i8, ptr addrspace(1) %aLimbs, i32 %hdr.long
    %rLimbs.data = getelementptr i8, ptr addrspace(1) %rLimbs, i32 %hdr.long

    %length.wide = zext i32 %length to i64
    call void @"StubRoutines::poly1305_processBlocks()"(
        ptr addrspace(1) %input.at.ofs,
        i64 %length.wide,
        ptr addrspace(1) %aLimbs.data,
        ptr addrspace(1) %rLimbs.data
    )
    ret void
}

; Raw stub behind @_intpoly_assign.  Both pointers are header-adjusted; set
; and length are passed as i64.
declare void @"StubRoutines::intpoly_assign()"(
    i64 %set,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    i64 %length
) willreturn "gc-leaf-function" nounwind

; protected static void conditionalAssign(int set, long[] a, long[] b)
;
; Skips the long-array headers of a and b and calls the stub.  The length
; handed to the stub is the array length of %a.
define zing void @_intpoly_assign(
    i32 %set,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b
) "alwaysinline-top-level" argmemonly {
entry:
    %hdr.long = load i32, ptr @arrayOopDesc.longArrayHeaderSize
    %set.wide = zext i32 %set to i64
    %a.data = getelementptr i8, ptr addrspace(1) %a, i32 %hdr.long
    %b.data = getelementptr i8, ptr addrspace(1) %b, i32 %hdr.long

    ; The length is not a Java argument; query it from a.
    %a.len = call i32 @azul.array_length(ptr addrspace(1) %a)
    %a.len.wide = zext i32 %a.len to i64

    call void @"StubRoutines::intpoly_assign()"(
        i64 %set.wide,
        ptr addrspace(1) %a.data,
        ptr addrspace(1) %b.data,
        i64 %a.len.wide
    )
    ret void
}

; Raw stub behind @_intpoly_montgomeryMult_P256.  All three pointers are
; header-adjusted.
declare void @"StubRoutines::intpoly_montgomeryMult_P256()"(
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %r
) willreturn "gc-leaf-function" nounwind

; private void multImpl(long[] a, long[] b, long[] r)
;
; Skips the long-array header of each array and calls the stub.  %this_obj is
; not used.
define zing void @_intpoly_montgomeryMult_P256(
    ptr addrspace(1) %this_obj,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %r
) "alwaysinline-top-level" argmemonly {
entry:
    %hdr.long = load i32, ptr @arrayOopDesc.longArrayHeaderSize

    %a.data = getelementptr i8, ptr addrspace(1) %a, i32 %hdr.long
    %b.data = getelementptr i8, ptr addrspace(1) %b, i32 %hdr.long
    %r.data = getelementptr i8, ptr addrspace(1) %r, i32 %hdr.long

    call void @"StubRoutines::intpoly_montgomeryMult_P256()"(
        ptr addrspace(1) %a.data,
        ptr addrspace(1) %b.data,
        ptr addrspace(1) %r.data
    )
    ret void
}

; int java.util.zip.CRC32.update(int crc, int b)
;
; Single-byte table-driven CRC update, equivalent to:
;   int c = ~crc;
;   b = CRCTable[(b ^ c) & 0xFF];
;   b = b ^ (c >>> 8);
;   return ~b;
define zing i32 @_updateCRC32(i32 %crc, i32 %b)
  "alwaysinline-top-level" readnone {
entry:
  ; c = ~crc
  %crc.inv = xor i32 %crc, -1

  ; table index = (b ^ c) & 0xFF
  %mix = xor i32 %b, %crc.inv
  %tbl.idx = and i32 %mix, 255

  ; CRCTable[index], read from the 256-entry i32 table
  %tbl.slot = getelementptr inbounds [256 x i32], ptr @StubRoutines.CRCTable, i32 0, i32 %tbl.idx
  %tbl.val = load atomic i32, ptr %tbl.slot unordered, align 4

  ; ~(table value ^ (c >>> 8))
  %crc.inv.hi = lshr i32 %crc.inv, 8
  %folded = xor i32 %crc.inv.hi, %tbl.val
  %crc.out = xor i32 %folded, -1
  ret i32 %crc.out
}

; Returns the i32 stored at byte offset @Thread.reversible_tid_offset_bytes
; from a null addrspace(256) base.
define i32 @azul.get_current_tid()
     "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind {
 entry:
  %tid.ofs = load i32, ptr @Thread.reversible_tid_offset_bytes
  %tid.slot = getelementptr i8, ptr addrspace(256) null, i32 %tid.ofs
  %tid = load i32, ptr addrspace(256) %tid.slot, !tbaa !13
  ret i32 %tid
}

; Returns the i64 stored at byte offset @Thread.self_offset_bytes from a null
; addrspace(256) base.  Whether that load is tagged !invariant.load depends on
; @JDK_VERSION_MAJOR.
define i64 @azul.get_current_thread()
     "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind {
 entry:
  %self.ofs = load i32, ptr @Thread.self_offset_bytes
  %self.slot = getelementptr i8, ptr addrspace(256) null, i32 %self.ofs
  %jdk.major = load i32, ptr @JDK_VERSION_MAJOR
  %has.vthreads = icmp uge i32 %jdk.major, 21
  br i1 %has.vthreads, label %support_virtual_threads, label %invariant_thread_ptr

 invariant_thread_ptr:
  ; Before JDK 21: tag the load as invariant so repeated loads fold together
  ; once this function is inlined.  Strictly speaking the value is only
  ; invariant per executing thread, not globally, which may bend the
  ; !invariant.load semantics; that is unlikely to ever matter in practice.
  %self.invariant = load i64, ptr addrspace(256) %self.slot, !invariant.load !{}
  ret i64 %self.invariant

 support_virtual_threads:
  ; JDK 21 and above: with virtual threads the current thread can change
  ; across certain calls, so this must remain an ordinary load.
  %self.current = load i64, ptr addrspace(256) %self.slot
  ret i64 %self.current
}

; Returns the address (not the contents) located @Thread.tlab_top_offset_bytes
; bytes from a null addrspace(256) base.
define ptr addrspace(256) @azul.get_current_tlab_top_addr()
     "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind {
 entry:
  %top.ofs = load i32, ptr @Thread.tlab_top_offset_bytes
  %top.slot = getelementptr i8, ptr addrspace(256) null, i32 %top.ofs
  ret ptr addrspace(256) %top.slot
}

; Returns the address (not the contents) located
; @Thread.tlab_zend_offset_bytes bytes from a null addrspace(256) base.
define ptr addrspace(256) @azul.get_current_tlab_zend_addr()
     "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind {
 entry:
  %zend.ofs = load i32, ptr @Thread.tlab_zend_offset_bytes
  %zend.slot = getelementptr i8, ptr addrspace(256) null, i32 %zend.ofs
  ret ptr addrspace(256) %zend.slot
}

; Returns the pointer value stored @Thread.tlab_end_offset_bytes bytes from a
; null addrspace(256) base.  Unlike the *_addr helpers, this one loads the
; slot's contents.
define ptr @azul.get_current_tlab_end()
     "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind {
 entry:
  %end.ofs = load i32, ptr @Thread.tlab_end_offset_bytes
  %end.slot = getelementptr i8, ptr addrspace(256) null, i32 %end.ofs
  %tlab.end = load ptr, ptr addrspace(256) %end.slot
  ret ptr %tlab.end
}

;; Add one to JavaThread::_held_monitor_count, but only when
;; @SupportHeldMonitorCount is set; otherwise this is a no-op.
define void @azul.inc_held_monitor_count()
     nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  %counting.enabled = load i1, ptr @SupportHeldMonitorCount
  br i1 %counting.enabled, label %bump, label %done

bump:
  ;; Plain (non-atomic) read-modify-write of the i64 counter located at
  ;; @Thread.held_monitor_count_offset_bytes from a null addrspace(256) base.
  %count.ofs = load i32, ptr @Thread.held_monitor_count_offset_bytes
  %count.slot = getelementptr inbounds i8, ptr addrspace(256) null, i32 %count.ofs
  %count.old = load i64, ptr addrspace(256) %count.slot
  %count.new = add i64 %count.old, 1
  store i64 %count.new, ptr addrspace(256) %count.slot
  br label %done

done:
  ret void
}

;; Subtract one from JavaThread::_held_monitor_count, but only when
;; @SupportHeldMonitorCount is set; otherwise this is a no-op.
define void @azul.dec_held_monitor_count()
     nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  %counting.enabled = load i1, ptr @SupportHeldMonitorCount
  br i1 %counting.enabled, label %drop, label %done

drop:
  ;; Plain (non-atomic) read-modify-write of the i64 counter located at
  ;; @Thread.held_monitor_count_offset_bytes from a null addrspace(256) base.
  %count.ofs = load i32, ptr @Thread.held_monitor_count_offset_bytes
  %count.slot = getelementptr inbounds i8, ptr addrspace(256) null, i32 %count.ofs
  %count.old = load i64, ptr addrspace(256) %count.slot
  %count.new = sub i64 %count.old, 1
  store i64 %count.new, ptr addrspace(256) %count.slot
  br label %done

done:
  ret void
}

;; Try to lock %oop "quickly".  Return true on success.
;;
;; Returns false (without modifying the lock word, except for a failed
;; cmpxchg attempt) whenever the caller has to take the slow path: the object
;; is fat locked, thin locked by another thread, the recursion count is
;; saturated, or the CAS on an unlocked object lost a race.
;;
;; IMPORTANT: This function needs to be atomic with respect to
;; safepoints (i.e. there can be no safepoints within the function
;; body, before or after inlining).  The reason is that locks may be
;; inflated at a checkpoint (requested by a thread contending for the
;; lock) and a mark word loaded before a taken checkpoint is rendered
;; invalid after it.  This is why this function is marked
;; "azul-late-inline"="3".
define i1 @azul.monitorenter.fastpath(ptr addrspace(1) %oop) 
    noinline "azul-late-inline"="3" nounwind willreturn "gc-leaf-function" {

 ;; For an overview of our locking scheme, see markWord.hpp; and
 ;; lock_fast_path in assembler_x86.cpp
 load_vm_constants:
  ;; All values defined in this basic block should be constants
  %lockInfoOffset = load i32, ptr @oopDesc.lock_info_offset_in_bytes
  %recursionCountMaskInPlace = load i32, ptr @markWord.rec_mask_in_place
  %lockInfoRecursionShift = load i32, ptr @markWord.rec_shift
  %dontCareBitPoisonMask = load i32, ptr @markWord.dont_care_bit_poison_xor
  %threadIDMaskInPlace = load i32, ptr @markWord.tid_mask_in_place

  ;; A recursion count of exactly one, positioned inside the lock word.
  %oneRecursionCount = shl i32 1, %lockInfoRecursionShift
  %inverseThreadIDMaskInPlace = xor i32 -1, %threadIDMaskInPlace
  br label %entry

 entry:
  %threadID = call i32 @azul.get_current_tid()
  %lockInfoAddr = getelementptr i8, ptr addrspace(1) %oop, i32 %lockInfoOffset

  ;; lockInfo is the lower 32 bits of the mark word
  %lockInfo = load atomic i32, ptr addrspace(1) %lockInfoAddr unordered, align 4, !tbaa !14
  ;; A zero recursion count means nobody holds the lock.
  %recursionCount = and i32 %lockInfo, %recursionCountMaskInPlace
  %isUnlocked = icmp eq i32 %recursionCount, 0
  ;; Java makes heavy use of synchronized methods and classes that are never
  ;; shared across threads.  We want the fast path for unlocked objects placed 
  ;; in-line with the non-fat lock fast paths placed out of line.  We (mis)use
  ;; profiling metadata to achieve this effect.
  br i1 %isUnlocked, label %lock_unlocked, label %lock_not_unlocked, !prof !6

 lock_unlocked:
  ;; transition: { unlocked } -> { locked by current thread }
  %lockInfo.tid.cleared = and i32 %lockInfo, %inverseThreadIDMaskInPlace
  ;; assertion: the result of %lockInfo.tid.cleared should be either zero
  ;; or have only the high bit set.  Since that bit is specifically don't
  ;; care, we can pretend it's exactly zero.  By doing so, we can replace
  ;; andl;orl;orl; instruction sequence with a single orl instruction.
  ;; This saves about 1ns on each lock attempt.  If this bit ever becomes
  ;; something other than don't care, remove the following line.
  %lockInfo.tid.cleared.assume = and i32 %lockInfo.tid.cleared, 0
  %lockInfo.tid.mask = or i32 %oneRecursionCount, %threadID
  %newLockInfo = or i32 %lockInfo.tid.cleared.assume, %lockInfo.tid.mask
  ;; If we succeed, we want to establish a synchronizes-with edge with
  ;; the last unlock operation.  If we fail we don't really care about
  ;; ordering.
  %cmpxchg_result = cmpxchg ptr addrspace(1) %lockInfoAddr, i32 %lockInfo, i32 %newLockInfo acquire monotonic
  %success = extractvalue { i32, i1 } %cmpxchg_result, 1
  br i1 %success, label %lock_fastpath_success, label %lock_fastpath_bailout

 lock_not_unlocked:
  ;; Check if we're fat locked, and bail out to the slow path if we
  ;; are.

  ;; If we are fat locked, the LSB of the lock info is 0.  If we are
  ;; thin locked, the low 28 bits contain the reversible thread ID
  ;; (TID), and the TID always has 1 as its LSB.
  %fatLockedBit = and i32 %lockInfo, 1
  %isThinLocked = icmp ne i32 %fatLockedBit, 0
  br i1 %isThinLocked, label %thin_locked, label %lock_fastpath_bailout, !prof !6 ;; likely(%isThinLocked)

 thin_locked:
  ;; We're not fat locked.  This means we are thin locked.  Check if
  ;; we are thin locked by ourselves and bail out to the slow path if
  ;; not.
  %currentlyOwningThread = and i32 %lockInfo, %threadIDMaskInPlace
  %isThinLockedBySelf = icmp eq i32 %currentlyOwningThread, %threadID
  br i1 %isThinLockedBySelf, label %thin_locked_by_self, label %lock_fastpath_bailout, !prof !6 ;; likely(%isThinLockedBySelf)

 thin_locked_by_self:
  ;; We are thin locked by ourselves.  If adding 1 to the recursion
  ;; count will not overflow, we are good to go.

  ;; OR-ing in the full recursion mask changes the word unless every
  ;; recursion-count bit is already set, i.e. unless the count is saturated.
  %lockInfoWithFullRecCount = or i32 %lockInfo, %recursionCountMaskInPlace
  %recCountWontOverflow = icmp ne i32 %lockInfoWithFullRecCount, %lockInfo
  br i1 %recCountWontOverflow, label %lock_thin_recursable, label %lock_fastpath_bailout ,!prof !6 ;; likely(%recCountWontOverflow)

 lock_thin_recursable:
  ;; transition: { locked-by-self, recursion-count = N } -> { locked-by-self, recursion-count = N + 1 }
  ;; we know (N + 1) will not overflow
  %newLockInfo.recursable = add i32 %lockInfo, %oneRecursionCount

  ;; Since the entire lock half-word (32 bits) belongs to the locking
  ;; mechanism, we can get away with a plain store here.  The highest
  ;; bit in %lockInfo is a "don't-care" bit that is currently unused,
  ;; and if the VM starts using it for some other reason we will
  ;; probably have to change the store into a CAS.  To make such a
  ;; situation easier to detect, we artificially flip the don't-care
  ;; bit before storing it back.
  ;;
  ;; NB: at least currently, the xor is free because LLVM folds it
  ;; into the computation of %newLockInfo.recursable above.

  %newLockInfo.recursable.poisoned = xor i32 %newLockInfo.recursable, %dontCareBitPoisonMask
  store atomic i32 %newLockInfo.recursable.poisoned, ptr addrspace(1) %lockInfoAddr unordered, align 4, !tbaa !14
  br label %lock_fastpath_success

 lock_fastpath_success:
  ;; Reached from both the fresh-lock and the recursive-lock paths; the held
  ;; monitor count is bumped here (a no-op unless @SupportHeldMonitorCount
  ;; is set).
  call void @azul.inc_held_monitor_count()
  ret i1 true

 lock_fastpath_bailout:
  ret i1 false
}

;; This version of the monitorenter fastpath is only valid on a provably
;; thread local object.  Since it doesn't have to worry about concurrent
;; interaction, it can be much simpler: the lock word is updated with a plain
;; unordered load/store pair rather than a cmpxchg.  Note that there is still
;; one case where the fastpath might fail: the recursion count is already at
;; its maximum.
;;
;; Returns true if the lock word was updated here (the held monitor count is
;; incremented in that case); returns false, leaving the lock word untouched,
;; if the caller has to take another path.
define i1 @azul.monitorenter.thread_local.fastpath(ptr addrspace(1) %oop) 
    alwaysinline "azul-late-inline"="1" nounwind willreturn "gc-leaf-function" {

 ;; For an overview of our locking scheme, see markWord.hpp; and
 ;; lock_fast_path in assembler_x86.cpp
 load_vm_constants:
  ;; All values defined in this basic block should be constants
  %lockInfoOffset = load i32, ptr @oopDesc.lock_info_offset_in_bytes
  %recursionCountMaskInPlace = load i32, ptr @markWord.rec_mask_in_place
  %lockInfoRecursionShift = load i32, ptr @markWord.rec_shift
  %dontCareBitPoisonMask = load i32, ptr @markWord.dont_care_bit_poison_xor
  %threadIDMaskInPlace = load i32, ptr @markWord.tid_mask_in_place

  ;; A recursion count of exactly one, positioned in the recursion-count field.
  %oneRecursionCount = shl i32 1, %lockInfoRecursionShift
  ;; Mask selecting every bit that is NOT part of the thread ID field.
  %inverseThreadIDMaskInPlace = xor i32 -1, %threadIDMaskInPlace
  br label %entry

 entry:
  %threadID = call i32 @azul.get_current_tid()
  %lockInfoAddr = getelementptr i8, ptr addrspace(1) %oop, i32 %lockInfoOffset

  ;; lockInfo is the lower 32 bits of the mark word
  %lockInfo = load atomic i32, ptr addrspace(1) %lockInfoAddr unordered, align 4, !tbaa !14

  ;; Assume: We are locking this object uncontended.  It is either unlocked,
  ;; or currently thin locked by this thread.  We need to ensure our
  ;; tid is in place and increment the recursion count.  If we overflow, we
  ;; have to bail back to the interpreter.  (In theory, we could hit a deopt
  ;; recompile cycle here, but that would require an 8 deep lock on a provably
  ;; thread local object.  That's rare enough we're going to ignore it for
  ;; now.)
  ;;
  ;; The new lock word is computed speculatively here; it is only stored once
  ;; the overflow check below has passed.
  %lockInfo.tid.cleared = and i32 %lockInfo, %inverseThreadIDMaskInPlace
  %lockInfo.tid = or i32 %lockInfo.tid.cleared, %threadID
  %newLockInfo.recursable = add i32 %lockInfo.tid, %oneRecursionCount

  ;; If adding 1 to the recursion count will not overflow, we are good to go.
  ;; OR-ing in the full recursion-count mask leaves the word unchanged exactly
  ;; when every recursion-count bit is already set, i.e. the count is saturated.
  %lockInfoWithFullRecCount = or i32 %lockInfo, %recursionCountMaskInPlace
  %recCountWontOverflow = icmp ne i32 %lockInfoWithFullRecCount, %lockInfo
  br i1 %recCountWontOverflow, label %lock_thin_recursable, label %lock_fastpath_bailout ,!prof !6 ;; likely(%recCountWontOverflow)

 lock_thin_recursable:
  ;; transition: { unlocked or locked-by-self, recursion-count = N }
  ;;          -> { locked-by-self, recursion-count = N + 1 }
  ;; we know (N + 1) will not overflow
  ;;
  ;; The value is xor-ed with the VM supplied dont_care_bit_poison_xor mask
  ;; before being written back with a plain (non-CAS) store.
  %newLockInfo.recursable.poisoned = xor i32 %newLockInfo.recursable, %dontCareBitPoisonMask
  store atomic i32 %newLockInfo.recursable.poisoned, ptr addrspace(1) %lockInfoAddr unordered, align 4,!tbaa !14
  call void @azul.inc_held_monitor_count()
  ret i1 true

 lock_fastpath_bailout:
  ;; Recursion count saturated: nothing was written, caller must handle it.
  ret i1 false
}




;; Inlined fast path for monitorexit.
;;
;; IMPORTANT: This function has to be atomic with respect to
;; safepoints, for the same reason as monitorenter.fastpath.
;;
;; Returns true when the lock word was found thin locked and one recursion
;; level was released here (the held monitor count is decremented as well).
;; Returns false, without touching the lock word, when the caller has to
;; fall back to another path.
define i1 @azul.monitorexit.fastpath(ptr addrspace(1) %oop)
      noinline "azul-late-inline"="3" nounwind willreturn "gc-leaf-function" {
 ;; The locking scheme is described in markWord.hpp; the assembler
 ;; counterpart is unlock_fast_path in assembler_x86.cpp
 load_vm_constants:
  ;; Everything computed in this block is expected to be a constant.
  %poison.mask = load i32, ptr @markWord.dont_care_bit_poison_xor
  %rec.shift = load i32, ptr @markWord.rec_shift
  %lock.word.offset = load i32, ptr @oopDesc.lock_info_offset_in_bytes

  ;; One unit of recursion count, positioned inside the lock word.
  %rec.unit = shl i32 1, %rec.shift
  br label %entry

 entry:
  ;; The lock word is the lower 32 bits of the mark word.
  %lock.word.addr = getelementptr i8, ptr addrspace(1) %oop, i32 %lock.word.offset
  %lock.word = load atomic i32, ptr addrspace(1) %lock.word.addr unordered, align 4, !tbaa !14

  ;; Bit 0 set => take the thin-lock release path; bit 0 clear => bail out.
  %low.bit = and i32 %lock.word, 1
  %low.bit.set = icmp ne i32 %low.bit, 0
  br i1 %low.bit.set, label %release_thin, label %give_up, !prof !6

 release_thin:
  ;; Drop one recursion level, apply the don't-care-bit xor mask, and
  ;; publish the result with release ordering.
  %lock.word.dec = sub i32 %lock.word, %rec.unit
  %lock.word.out = xor i32 %lock.word.dec, %poison.mask
  store atomic i32 %lock.word.out, ptr addrspace(1) %lock.word.addr release, align 4, !tbaa !14
  call void @azul.dec_held_monitor_count()
  ret i1 true

 give_up:
  ret i1 false
}

; monitorexit for a provably thread local object.  No other thread can be
; accessing the object, so the safepoint concern of the normal fastpath does
; not arise.  We also assume that thin-locking *always* succeeds on a thread
; local object, which is why there is no bail-out path and no return value.
define void @azul.monitorexit.thread_local(ptr addrspace(1) "removable-allocation-use" %oop)
      "azul-late-inline"="1" nounwind willreturn "gc-leaf-function" "has-latent-use" {
 ;; Same shape as monitorexit.fastpath with the thin-lock test dropped and
 ;; the final store relaxed from release to unordered.
 load_vm_constants:
  ;; Everything computed in this block is expected to be a constant.
  %poison.mask = load i32, ptr @markWord.dont_care_bit_poison_xor
  %rec.shift = load i32, ptr @markWord.rec_shift
  %lock.word.offset = load i32, ptr @oopDesc.lock_info_offset_in_bytes

  ;; One unit of recursion count, positioned inside the lock word.
  %rec.unit = shl i32 1, %rec.shift
  br label %entry

 entry:
  ;; The lock word is the lower 32 bits of the mark word.
  %lock.word.addr = getelementptr i8, ptr addrspace(1) %oop, i32 %lock.word.offset
  %lock.word = load atomic i32, ptr addrspace(1) %lock.word.addr unordered, align 4, !tbaa !14

  ;; Release one recursion level and apply the don't-care-bit xor mask.
  %lock.word.dec = sub i32 %lock.word, %rec.unit
  %lock.word.out = xor i32 %lock.word.dec, %poison.mask
  store atomic i32 %lock.word.out, ptr addrspace(1) %lock.word.addr unordered, align 4, !tbaa !14
  call void @azul.dec_held_monitor_count()
  ret void
}

;; Branch weights attached via !prof to mark the first successor of a
;; conditional branch as the likely one (weight 1024 versus 1).
!6 = !{ !"branch_weights", i32 1024, i32 1 }


;; These locking stubs support RTM
;; Each takes the oop to lock / unlock as its only argument.  They are the
;; slow paths used by @azul.monitorenter and @azul.monitorexit; the lock stub
;; is called with a "deopt" operand bundle, the unlock stub is declared as a
;; gc-leaf function.
declare void @"StubRoutines::c2_lock()"  (ptr addrspace(1)) "consumes-replay-vmstate" nounwind "azul-allow-gcptrs-in-regs"
declare void @"StubRoutines::c2_unlock()"(ptr addrspace(1)) willreturn "gc-leaf-function" nounwind

;; monitorenter abstraction: try the inlined fast path when it is enabled by
;; MonitorEnterFastPath.flag, otherwise (or when the fast path reports
;; failure) call the c2_lock stub.
define void @azul.monitorenter(ptr addrspace(1) %oop)
   "azul-late-inline"="1" noinline nounwind "consumes-replay-vmstate" {
 entry:
  %fastpath.enabled = load i1, ptr @MonitorEnterFastPath.flag
  br i1 %fastpath.enabled, label %try_fastpath, label %call_stub

 try_fastpath:
  %acquired = call i1 @azul.monitorenter.fastpath(ptr addrspace(1) %oop)
  br i1 %acquired, label %done, label %call_stub, !prof !9

 call_stub:
  ;; Reached both when the fast path is disabled and when it bailed out.
  call void @"StubRoutines::c2_lock()"(ptr addrspace(1) %oop) [ "deopt"() ]
  br label %done

 done:
  ret void
}

;; Variadic deoptimize intrinsic declarations, one per return type:
;; void, i32, i64 and ptr addrspace(1).  All use the zing_uncommon_trap
;; calling convention.
declare zing_uncommon_trap void @llvm.experimental.deoptimize.isVoid(...)
declare zing_uncommon_trap i32 @llvm.experimental.deoptimize.isI32(...)
declare zing_uncommon_trap i64 @llvm.experimental.deoptimize.isI64(...)
declare zing_uncommon_trap ptr addrspace(1) @llvm.experimental.deoptimize.isPtr(...)

;; Version of monitorenter which is only valid on a provably thread
;; local object.  Note that this will unconditionally deoptimize on
;; deeply nested locks on the same object, i.e. whenever the thread local
;; fast path reports failure.  The same lock-then-deoptimize slow path is
;; also taken when MonitorEnterFastPath.flag is false.
define void @azul.monitorenter.thread_local(ptr addrspace(1) "removable-allocation-use" %oop)
   "azul-late-inline"="1" nounwind "consumes-replay-vmstate" "has-latent-use" {
 entry:
  %useMonitorEnterFastPath = load i1, ptr @MonitorEnterFastPath.flag
  br i1 %useMonitorEnterFastPath, label %fastpath, label %slowpath

 fastpath:
  %fastpath.succeeded = call i1 @azul.monitorenter.thread_local.fastpath(ptr addrspace(1) %oop)
  br i1 %fastpath.succeeded, label %locked, label %slowpath, !prof !9

 slowpath:
  ;; The deoptimization state associated with calls to @azul.monitorenter (and hence
  ;; @azul.monitorenter.thread_local) restart execution at abstract state //after// the monitor has
  ;; been acquired (for monitorenter bytecodes this is the next bci, while for monitorenter
  ;; operations implied as part of a synchronized method, this is bci 0).  This means we cannot just
  ;; uncommon_trap at the deopt state given to us -- if we did, we'd be in the interpreter saying
  ;; we've acquired a monitor that we haven't.  So, instead, we explicitly acquire the lock here and
  ;; then take the side exit.
  ;;
  ;; The deopt state associated with the (blocking) call below is *also* the post-monitorenter
  ;; state.  This is fine (and required!) since @"StubRoutines::c2_lock()" is not idempotent, and
  ;; we don't want to re-execute the monitorenter even if we deoptimize while blocked on this call
  ;; to @"StubRoutines::c2_lock()"
  ;;
  ;; The c2_lock argument is marked nocapture under the belief that the
  ;; locking code does not escape the oop in any way which is visible to compiled
  ;; code (and that even if it did, we're about to exit the codeblob so there's
  ;; a minimal chance we'll exploit the problem.)  This is a short term hack to
  ;; get monte_carlo performance up, the general topic of monitors and nocapture
  ;; is tracked in Zilla 6406.
  ;;
  call void @"StubRoutines::c2_lock()"(ptr addrspace(1) nocapture %oop) [ "deopt"() ]
  ;; Lock is now held: leave compiled code with reason Reason_unhandled.
  %reasonUnhandled = load i32, ptr @DeoptReasons.Reason_unhandled
  call zing_uncommon_trap void(...) @llvm.experimental.deoptimize.isVoid(i32 %reasonUnhandled) "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret void

 locked:
  ;; Fast path succeeded; nothing else to do.
  ret void
}

;; monitorexit abstraction: try the inlined fast path when it is enabled by
;; MonitorExitFastPath.flag, otherwise (or when the fast path reports
;; failure) call the c2_unlock stub.
define void @azul.monitorexit(ptr addrspace(1) %oop)
   willreturn "gc-leaf-function" "azul-late-inline"="1" noinline nounwind {
 entry:
  %fastpath.enabled = load i1, ptr @MonitorExitFastPath.flag
  br i1 %fastpath.enabled, label %try_fastpath, label %call_stub

 try_fastpath:
  %released = call i1 @azul.monitorexit.fastpath(ptr addrspace(1) %oop)
  br i1 %released, label %done, label %call_stub, !prof !9

 call_stub:
  ;; Reached both when the fast path is disabled and when it bailed out.
  call void @"StubRoutines::c2_unlock()"(ptr addrspace(1) %oop)
  br label %done

 done:
  ret void
}

;; Forward declarations.  dolphinAbstractions fills in the bodies and the
;; attributes.
;;
;; As called from @azul.new_array the arguments are:
;;   newarray.slow (javaThread, array_kid, basictype,   length)
;;   anewarray.slow(javaThread, array_kid, element_kid, length)
declare noalias nonnull ptr addrspace(1) @azul.newarray.slow(i64, i32, i32, i32) "consumes-replay-vmstate" nounwind

declare noalias nonnull ptr addrspace(1) @azul.anewarray.slow(i64, i32, i32, i32) "consumes-replay-vmstate" nounwind

;; Allocation barrier intrinsics.  Both return a ptr addrspace(1); the .p0
;; variant takes a plain ptr (used on the freshly initialized TLAB pointer in
;; @azul.new_array_instance_fast), the .p1 variant takes a ptr addrspace(1).
declare ptr addrspace(1) @llvm.orca.alloc.barrier.p1.p0(ptr)
  argmemonly nounwind willreturn "allocation-site"
declare ptr addrspace(1) @llvm.orca.alloc.barrier.p1.p1(ptr addrspace(1))
  argmemonly nounwind willreturn "allocation-site"
  
;; Array allocation fastpath, args
;; (i64 %klassID, i64 %size, i64 %len_ekid, i64 %zero_from_offset).
;; *size* computation:
;; size = round_to_heap(array_header + (length << elem_shift));
;; where round_to_heap(x) == round_to(x, MinObjAlignmentInBytes)
;; and round_to(x, y) == ((x + y-1) & ~(y-1))
;;
;; len_ekid = EKID<<32 | length
;;
;; The fourth argument is the %zero_from_offset value forwarded by
;; @azul.new_array_instance_fast.
;;
;; The stub does not safepoint (i.e. is a leaf function). Returns non-null on
;; success, and null on TLAB-full.
declare zing_stub_new_fast noalias ptr addrspace(1) @"StubRoutines::new_fast_array_orca()"(i64, i64, i64, i64)
    willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind "allocation-site"
    
;; Rounds %array_size.i64 up to a multiple of MinObjAlignmentInBytes using
;; (x + (y - 1)) & ~(y - 1).
define i64 @azul.round_to_heap(i64 %array_size.i64) 
    "azul-late-inline"="0" alwaysinline "vmstate-idempotent"="true" willreturn "gc-leaf-function" readonly nounwind {
 entry:
  %alignment = load i32, ptr @MinObjAlignmentInBytes
  ;; y - 1, computed in 32 bits and then zero-extended.
  %low_bits = sub i32 %alignment, 1
  %low_bits.wide = zext i32 %low_bits to i64
  ;; ~(y - 1)
  %keep_bits = xor i64 %low_bits.wide, -1
  ;; (x + (y - 1)) & ~(y - 1)
  %bumped = add i64 %array_size.i64, %low_bits.wide
  %rounded = and i64 %bumped, %keep_bits
  ret i64 %rounded
}

;; Initialize an array object at %top: write the mark word (kid shifted into
;; place) and the combined length / element-kid word.  When
;; FalconNewInstanceVersion is 4, additionally zero everything after the
;; header via @azul.tlab_zero.  Always returns %top.
define nonnull ptr @azul.init_array(
  ptr %top, i32 %kid, i64 %len_ekid, i64 %header_size.i64, i64 %size_in_bytes.i64)
     alwaysinline nounwind willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  ;; mark word := kid << kid_shift
  %mark.kid_shift = load i64, ptr @markWord.kid_shift
  %kid.wide = zext i32 %kid to i64
  %mark.word = shl i64 %kid.wide, %mark.kid_shift
  store i64 %mark.word, ptr %top
  ;; array length and element kid, stored as one 64-bit word
  %length.offset = load i32, ptr @arrayOopDesc.length_offset_in_bytes
  %length.slot = getelementptr i8, ptr %top, i32 %length.offset
  store i64 %len_ekid, ptr %length.slot

  ;; Only version 4 zeroes the payload here.
  %alloc.version = load i32, ptr @FalconNewInstanceVersion
  %is.version4 = icmp eq i32 %alloc.version, 4
  br i1 %is.version4, label %zero_payload, label %done

zero_payload:
  ; Leave the header (mark word + len_ekid) alone, zero the rest.
  %payload.start = getelementptr i8, ptr %top, i64 %header_size.i64
  %payload.size = sub i64 %size_in_bytes.i64, %header_size.i64
  call void @azul.tlab_zero(ptr nonnull %payload.start, i64 %payload.size)
  br label %done

done:
  ret ptr %top
}

;; Fast-path array allocation.  Dispatches on FalconNewInstanceVersion:
;;   2, 3    - bump the TLAB top inline if the new top passes the respective
;;             zero-region check, otherwise fall back to the stub;
;;   4       - bump the TLAB top inline if the new top is inside the TLAB,
;;             otherwise return null;
;;   others  - call StubRoutines::new_fast_array_orca().
;; Returns the new array, or null when the allocation could not be satisfied
;; here (the caller, @azul.new_array, then takes its slow path).
define noalias ptr addrspace(1) @azul.new_array_instance_fast(
    i32 orca_alloc_java_type_kid %kid, i64 %header_size.i64,
    i64 %size_in_bytes.i64, i64 %length_ekid.i64, i64 %zero_from_offset.i64)
     noinline nounwind willreturn "gc-leaf-function" "azul-late-inline"="3" "allocation-site"
{
entry:
  ;; The stub takes the kid as an i64.
  %kid.i64 = zext i32 %kid to i64
  ;; Candidate allocation: [%top, %new_top) in the current TLAB.
  %top_addr = call ptr addrspace(256) @azul.get_current_tlab_top_addr()
  %top = load ptr, ptr addrspace(256) %top_addr
  %new_top = getelementptr i8, ptr %top, i64 %size_in_bytes.i64

  %new_instance_ver = load i32, ptr @FalconNewInstanceVersion
  switch i32 %new_instance_ver, label %fast_path.default
                        [ i32 2, label %fast_path.ver2
                          i32 3, label %fast_path.ver3
                          i32 4, label %fast_path.ver4 ]

fast_path.ver2:
  %is_inside_zero_region.ver2 = call i1 @azul.is_inside_zero_region_ver2(ptr %new_top) 
  br i1 %is_inside_zero_region.ver2, label %init_and_return, label %fast_path.default, !prof !10

fast_path.ver3:
  %is_inside_zero_region.ver3 = call i1 @azul.is_inside_zero_region_ver3(ptr %top, ptr %new_top) 
  br i1 %is_inside_zero_region.ver3, label %init_and_return, label %fast_path.default, !prof !10

fast_path.ver4:
  ;; Unlike versions 2 and 3 there is no stub fallback here: failure is
  ;; reported to the caller as null.
  %is_inside_tlab = call i1 @azul.is_inside_tlab(ptr %new_top)
  br i1 %is_inside_tlab, label %init_and_return, label %fail, !prof !9

init_and_return:
  ;; Commit the bump, initialize the header (and, for version 4, zero the
  ;; payload inside @azul.init_array), then pass the raw pointer through the
  ;; allocation barrier to obtain the addrspace(1) result.
  store ptr %new_top, ptr addrspace(256) %top_addr
  %newObject.fast = call nonnull ptr @azul.init_array(
    ptr %top, i32 %kid, i64 %length_ekid.i64, i64 %header_size.i64, i64 %size_in_bytes.i64)
  %newObject.release = call nonnull ptr addrspace(1) @llvm.orca.alloc.barrier.p1.p0(ptr %newObject.fast)
  ret ptr addrspace(1) %newObject.release

fast_path.default:
  ;; This is the main fast allocation path for NewInstanceVersion == 0 or
  ;; NewInstanceVersion == 1 (any version without its own case above).  It is
  ;; also the fallback for versions 2 and 3 when the zero-region check fails.
  %newArray = call zing_stub_new_fast noalias ptr addrspace(1) @"StubRoutines::new_fast_array_orca()"(
    i64 orca_alloc_java_type_kid %kid.i64, i64 %size_in_bytes.i64, i64 %length_ekid.i64,
    i64 %zero_from_offset.i64)
  ret ptr addrspace(1) %newArray
 
fail:
  ret ptr addrspace(1) null
}

;; Heap size in bytes of an array:
;;   round_to_heap(header_size + (length << element_shift))
define i64 @azul.compute_array_heap_size(
    i64 %length.i64, i64 %element_shift.i64, i64 %header_size.i64)
    "azul-late-inline"="0" alwaysinline nounwind willreturn "gc-leaf-function" {
  ;; Bytes occupied by the elements.
  %payload_bytes = shl i64 %length.i64, %element_shift.i64
  ;; Header plus elements, not yet aligned.
  %unrounded = add i64 %header_size.i64, %payload_bytes
  %rounded = call i64 @azul.round_to_heap(i64 %unrounded)
  ret i64 %rounded
}

; Array allocation abstraction: tries @azul.new_array_instance_fast when the
; length is non-negative and NewArrayFastPath.flag is set, and otherwise (or
; when the fast path returns null) calls @azul.anewarray.slow for a non-zero
; %element_kid or @azul.newarray.slow for a zero %element_kid.
;
; The parameter %zero_from_offset is used to optimize array allocation zeroing
; by reducing the zeroing length. If this parameter is non-zero then this function
; fills with zeroes only the tail of the allocated region starting from the specified
; zero_from_offset argument. This is possible if the beginning of the allocated region
; is filled with content of another array by an immediately following memcpy instruction.
;
; Optimization:
;   %dest = @azul.new_array(..., i64 0)
;   ... no uses of dest
;   memcpy(to: %dest, from: %src, length: %length)
; =>
;   %dest = @azul.new_array(..., zero_from_offset: %length)
;   ... no uses of dest
;   memcpy(to: %dest, from: %src, length: %length)
;
define noalias nonnull ptr addrspace(1) @azul.new_array(
    i64 %javaThread, i32 orca_alloc_java_type_kid %array_kid, i32 %element_kid,
    i32 %basictype, i32 %length, i32 %header_size, i32 %element_shift, i64 %zero_from_offset)
    "azul-late-inline"="1" noinline "vmstate-idempotent"="true" "consumes-replay-vmstate"
    nounwind "has-latent-use" {
entry:
;; This could change a negative %length to a positive %length.i64. However, new_fast_array()
;; will do a sign check on the lower 4 bytes of the i64 length we pass in. It won't affect
;; the correctness.  In addition, the explicit sign check at the end of this block sends
;; negative lengths straight to the slow path, which uses the original i32 %length.
  %length.i64 = zext i32 %length to i64
  %element_shift.i64 = zext i32 %element_shift to i64
  %header_size.i64 = zext i32 %header_size to i64
  %array_size_rounded.i64 = call i64 @azul.compute_array_heap_size(
    i64 %length.i64, i64 %element_shift.i64, i64 %header_size.i64)
  ;; len_ekid = element_kid << 32 | length
  %element_kid.i64 = zext i32 %element_kid to i64
  %element_kid_shifted.i64 = shl i64 %element_kid.i64, 32
  %length_ekid.i64 = or i64 %element_kid_shifted.i64, %length.i64

  %is_negative_length = icmp slt i32 %length, 0
  br i1 %is_negative_length, label %slow_path, label %fast_path.is_allowed

fast_path.is_allowed:
  %useNewArrayFastPath = load i1, ptr @NewArrayFastPath.flag
  br i1 %useNewArrayFastPath, label %fast_path, label %slow_path

fast_path:
  ;; A null result means the fast path could not allocate; fall back.
  %newArray.fast = call noalias ptr addrspace(1) @azul.new_array_instance_fast(
    i32 orca_alloc_java_type_kid %array_kid, i64 %header_size.i64,
    i64 %array_size_rounded.i64, i64 %length_ekid.i64, i64 %zero_from_offset)
  %success.fast = icmp ne ptr addrspace(1) %newArray.fast, null
  br i1 %success.fast, label %fast_path.done, label %slow_path, !prof !9

fast_path.done:
  ret ptr addrspace(1) %newArray.fast

slow_path:
  ;; A non-zero element kid selects the anewarray slow call, zero selects
  ;; the newarray slow call (which takes %basictype instead).
  %has_ekid.icmp = icmp ne i32 %element_kid, 0
  br i1 %has_ekid.icmp, label %slow_path.anewarray, label %slow_path.newarray
 
slow_path.anewarray:
  %anewArray.slow = call noalias ptr addrspace(1) @azul.anewarray.slow(i64 %javaThread,
    i32 orca_alloc_java_type_kid %array_kid, i32 %element_kid, i32 %length) [ "deopt"() ]
  ret ptr addrspace(1) %anewArray.slow
 
slow_path.newarray:
  %newArray.slow = call noalias ptr addrspace(1) @azul.newarray.slow(i64 %javaThread,
    i32 orca_alloc_java_type_kid %array_kid, i32 %basictype, i32 %length) [ "deopt"() ]
  ret ptr addrspace(1) %newArray.slow
}

; Object jdk.internal.misc.Unsafe::allocateUninitializedArray0(Class<?> componentType, int length);
;
; Looks up the array klass for %java.lang.class, returns null if there is
; none, and otherwise allocates through @azul.new_array with an element kid
; of zero and %zero_from_offset set to the byte size of the element payload
; (length << element_shift).  A final publication barrier is issued on the
; result before it is returned.  %this is not used by the body.
define noalias ptr addrspace(1) @_allocateUninitializedArray(
    ptr addrspace(1) %this, ptr addrspace(1) %java.lang.class, i32 %length)
    "azul-late-inline"="1" "vmstate-idempotent"="true" "consumes-replay-vmstate" nounwind {
  %current_thread = call i64 @azul.get_current_thread()
  %arrayKlassOop = call ptr addrspace(1) @azul.get_array_klass_oop(ptr addrspace(1) %java.lang.class)
  %arrayKlassOop.is.null = icmp eq ptr addrspace(1) %arrayKlassOop, null
  br i1 %arrayKlassOop.is.null, label %ret_null, label %next

ret_null:
  ; Java code checks that this intrinsic can never be called if componentType.isPrimitive()
  ; returns false. But void.class.isPrimitive() returns true and Unsafe.allocateUninitializedArray0()
  ; must return null in this case. The other possible component types here are basic types that are
  ; guaranteed to have their array klass set. See java_lang_Class::create_basic_type_mirror() where
  ; set_array_klass() is called.
  ret ptr addrspace(1) null

next:
  %array.kid = call i32 @azul.get_kid_from_klass_oop(ptr addrspace(1) %arrayKlassOop)
  ; Read the layout helper word from the array klass and decode the element
  ; shift, header size and element type from it.
  %layout.helper_offset_bytes = load i32, ptr @klassOopDesc.layout_helper_offset_in_bytes
  %layout.helper_address = getelementptr i8, ptr addrspace(1) %arrayKlassOop, i32 %layout.helper_offset_bytes
  %layout.helper = load atomic i32, ptr addrspace(1) %layout.helper_address unordered, align 4, !tbaa !26
  %element.shift = call i32 @azul.layout_helper_log2_element_size(i32 %layout.helper)
  %header.size = call i32 @azul.layout_helper_header_size(i32 %layout.helper)
  %basictype = call i32 @azul.layout_helper_element_type(i32 %layout.helper)
  ; Byte size of the elements; passed as %zero_from_offset below.
  %length.z64 = zext i32 %length to i64
  %element_shift.i64 = zext i32 %element.shift to i64
  %uninitialized_size = shl i64 %length.z64, %element_shift.i64
  %result = call noalias noundef nonnull align 8 ptr addrspace(1) @azul.new_array(
      i64 %current_thread, i32 %array.kid, i32 0, ; element_kid: zero means a primitive type
      i32 %basictype, i32 %length, i32 %header.size, i32 %element.shift, i64 %uninitialized_size)  [ "deopt"() ]
  call void @azul.final_publication_barrier(ptr addrspace(1) %result)
  ret ptr addrspace(1) %result
}

; Write a filler object header at %addr.
; See fillerObjKlass.hpp for details.
;
; The stored mark word is
;   (fillerObjKlass.kid << markWord.kid_shift)
;     | %has_preheader_bits
;     | (%size_in_words & fillerObjKlass.size_bits_mask)
define void @azul.initialize_filler_obj(ptr addrspace(1) %addr,
    i64 %size_in_words, i64 %has_preheader_bits)
    alwaysinline "azul-late-inline"="0" willreturn "gc-leaf-function" nounwind {
  %size.mask = load i64, ptr @fillerObjKlass.size_bits_mask
  %filler.kid = load i32, ptr @fillerObjKlass.kid
  %mark.kid_shift = load i64, ptr @markWord.kid_shift

  ; Size field, restricted to the bits reserved for it.
  %size.field = and i64 %size_in_words, %size.mask

  ; Kid field, shifted into position.
  %filler.kid.wide = zext i32 %filler.kid to i64
  %kid.field = shl i64 %filler.kid.wide, %mark.kid_shift

  ; Assemble and publish the mark word.
  %kid.with.preheader = or i64 %kid.field, %has_preheader_bits
  %mark.word = or i64 %kid.with.preheader, %size.field
  store atomic i64 %mark.word, ptr addrspace(1) %addr unordered, align 8
  ret void
}

; Intrinsics used by @azul.patch_array_for_realloc: memset zeroes the tail of
; the patched array, umin clamps the end of the region to be zeroed.
declare void @llvm.memset.p1.i64(ptr addrspace(1), i8, i64, i1)
declare i64 @llvm.umin.i64(i64, i64)

; Patches the markword and the content of the old array to become the newly
; relocated array.
;
; This abstraction manipulates the array header and breaks the abstract object
; model. This can happen only after RS4GC, thus "azul-late-inline"="3".
;
; Parameters:
;   %src                     array that is rewritten in place
;   %array_kid               kid written into the new markword
;   %element_kid             not used by the body (the element kid word is
;                            assumed to stay unchanged)
;   %length.i64              new length; truncated to i32 and stored at the
;                            array length offset
;   %header_size.i64         header size in bytes
;   %memcpy.length.in.bytes  number of payload bytes that are kept / copied
;   %dest_end_offset         byte offset from %src of the end of the new array
;   %src_size                byte size of the source array
;   %needs_end_fill          when the array does not grow, whether a filler
;                            object has to be written at the new end
;
; Returns the patched array (passed through the allocation barrier), or null
; when the in-place rewrite is not possible:
;   - the tail to be zeroed is larger than MemoryOpsChunkSizeInBytes,
;   - the array has to grow but %src does not end at the current TLAB top or
;     FalconNewInstanceVersion is not 2,
;   - the array has to grow but the new end is not inside the zero region.
define ptr addrspace(1) @azul.patch_array_for_realloc(ptr addrspace(1) %src,
    i32 %array_kid, i32 %element_kid, i64 %length.i64, i64 %header_size.i64,
    i64 %memcpy.length.in.bytes, i64 %dest_end_offset, i64 %src_size, i1 %needs_end_fill)
 noinline "azul-late-inline"="3" willreturn "gc-leaf-function" nounwind {
  ; It's possible that we don't copy the whole length of the source array into
  ; the dest array. In this case we need to zero the tail of the array as if it
  ; was zero initialized and we copied %memcpy.length bytes from the source into
  ; the dest array:
  ;
  ; source array:
  ; |------------------------------|
  ; %src                           +%src_size
  ; dest array:
  ; |-------------|--to be zeroed--|--already zeroed from tlab--|
  ; %result       +%memcpy.length  +%src_size                   +%dest_end_offset
  ;
  %zero_start_offset = add nsw nuw i64 %header_size.i64, %memcpy.length.in.bytes
  ; We don't need to zero past the %src_end because the space we get from TLAB
  ; is prezeroed.
  %zero_end_offset = call i64 @llvm.umin.i64(i64 %dest_end_offset, i64 %src_size)
  %zero_length = sub nsw nuw i64 %zero_end_offset, %zero_start_offset

  ; Check if we can zero it without a safepoint, otherwise bail out.
  %MemoryOpsChunkSizeInBytes.i64 = load i64, ptr @MemoryOpsChunkSizeInBytes
  %over_threshold = icmp ugt i64 %zero_length, %MemoryOpsChunkSizeInBytes.i64
  br i1 %over_threshold, label %fail, label %check.preheader

check.preheader:
  %preheader_mask = load i64, ptr @markWord.preheader_mask_in_place

  ; This abstraction needs to be atomic with respect to safepoints.
  ; %has_preheader_bits can change at a safepoint. If we have a safepoint in
  ; between the markword load and markword update we can have stale
  ; %has_preheader_bits (we can get a safepoint between the load and the update
  ; if we reorder either of these operations with a safepoint).
  ;
  ; The load is unordered atomic, matching the unordered atomic store of the
  ; patched markword below and the other heap word accesses in this file: a
  ; plain load that races with another writer of the markword would yield
  ; undef, and a plain load may be re-executed by the optimizer.
  %old_markword = load atomic i64, ptr addrspace(1) %src unordered, align 8, !tbaa !14
  %has_preheader_bits = and i64 %old_markword, %preheader_mask
  br label %check.needs.expand

check.needs.expand:
  ; The array grows when its new end lies past the end of the source array.
  %dest_end = getelementptr i8, ptr addrspace(1) %src, i64 %dest_end_offset
  %needs_expand = icmp ugt i64 %dest_end_offset, %src_size
  br i1 %needs_expand, label %try.expand, label %fill.end

try.expand:
  %top_addr = call ptr addrspace(256) @azul.get_current_tlab_top_addr()
  %top = load ptr, ptr addrspace(256) %top_addr
  %top.p1 = addrspacecast ptr %top to ptr addrspace(1), !verifier_exception !1
  %src_end = getelementptr inbounds i8, ptr addrspace(1) %src, i64 %src_size
  %new_instance_ver = load i32, ptr @FalconNewInstanceVersion
  %new_instance_ver_2 = icmp eq i32 %new_instance_ver, 2

  ; Check that the end of src object is the top of the current TLAB.
  ; This excludes cases where src object is not in a TLAB (like in a shared
  ; allocation page, relocated page) as well as cases where the source was in
  ; a TLAB that already got closed.
  %src_end_is_tlab_top = icmp eq ptr addrspace(1) %src_end, %top.p1

  %dest_end.p0 = addrspacecast ptr addrspace(1) %dest_end to ptr, !verifier_exception !1

  ; Don't support any other new instance version for now.
  %can_expand = and i1 %src_end_is_tlab_top, %new_instance_ver_2
  br i1 %can_expand, label %check.zero.region, label %fail

check.zero.region:
  ; This greatly restricts scope and reliability of the optimization since
  ; current maximum available space is just 512bytes (size of zero region).
  ; TODO: we should check for the end of TLAB and zero the memory ourselves.
  %is_inside_zero_region = call i1 @azul.is_inside_zero_region_ver2(ptr %dest_end.p0)
  br i1 %is_inside_zero_region, label %update.top, label %fail

update.top:
  ; Grow in place: the new end of the array becomes the new TLAB top.
  store ptr %dest_end.p0, ptr addrspace(256) %top_addr
  br label %patch.markword

fill.end:
  br i1 %needs_end_fill, label %do.fill.end, label %patch.markword

do.fill.end:
  ; The array shrinks: cover [%dest_end, %src_end) with a filler object.
  %end_fill_in_bytes = sub nsw nuw i64 %src_size, %dest_end_offset

  ; Both sizes are rounded to heap, so they must be multiples of %bytes_in_a_word.
  ; The diff is also a multiple of %bytes_in_a_word.
  %bytes_in_a_word = load i32, ptr @azul.BytesInAWord
  %bytes_in_a_word.i64 = zext i32 %bytes_in_a_word to i64
  %end_fill_in_words = udiv exact i64 %end_fill_in_bytes, %bytes_in_a_word.i64
  call void @azul.initialize_filler_obj(
    ptr addrspace(1) %dest_end, i64 %end_fill_in_words, i64 0)
  br label %patch.markword

patch.markword:
  ; The original array might have some bits in the header set, e.g. a hashcode
  ; might have been computed for it. Simply update the newly allocated array
  ; header to make sure that we don't get stale bits. The only exception is
  ; we preserve has_preheader_bits. This doesn't have any application visible
  ; effect but eliminates the need to fill the preheader space.

  %kid_shift = load i64, ptr @markWord.kid_shift
  %len_offset_in_bytes = load i32, ptr @arrayOopDesc.length_offset_in_bytes

  ; Initialize mark_word
  %array_kid.i64 = zext i32 %array_kid to i64
  %kid_shifted = shl i64 %array_kid.i64, %kid_shift
  %new_markword = or i64 %kid_shifted, %has_preheader_bits
  store atomic i64 %new_markword, ptr addrspace(1) %src unordered, align 8

  ; Initialize the new array length
  ; Note that we assume that the element kid stays unchanged.
  %length.i32 = trunc i64 %length.i64 to i32
  %len_addr = getelementptr i8, ptr addrspace(1) %src, i32 %len_offset_in_bytes
  store atomic i32 %length.i32, ptr addrspace(1) %len_addr unordered, align 4

  %result = call nonnull ptr addrspace(1) @llvm.orca.alloc.barrier.p1.p1(ptr addrspace(1) %src)
  br label %zero.tail

zero.tail:
  ; Have a fast path check for the case when we copy the whole %src array and
  ; don't need to fill the end of the new array with zeroes.
  %zero_length_is_zero = icmp eq i64 %zero_length, 0
  br i1 %zero_length_is_zero, label %return.result, label %do.zero.tail

do.zero.tail:
  ; Zero [%zero_start_offset, %zero_end_offset) relative to the patched array.
  %new_zero_start = getelementptr inbounds i8, ptr addrspace(1) %result, i64 %zero_start_offset
  call void @llvm.memset.p1.i64(ptr addrspace(1) %new_zero_start, i8 0, i64 %zero_length, i1 false)
  br label %return.result

return.result:
  ret ptr addrspace(1) %result

fail:
  ret ptr addrspace(1) null
}

; Fast path for @azul.new_array_realloc: decides whether the storage of %src
; can be reused for the new array and, if so, hands the actual heap rewrite to
; @azul.patch_array_for_realloc (which is "azul-late-inline"="3"). Because no
; rewrite happens here, this helper can safely be "azul-late-inline"="0".
;
; The whole function is deliberately not late inline 3: it uses some higher
; level abstractions and we want to give the optimizer a chance to optimize
; them.
;
; Returns the value produced by @azul.patch_array_for_realloc, or null when
; one of the checks below rejects the in-place realloc.
define ptr addrspace(1) @azul.new_array_realloc_fast(i32 %array_kid,
    i32 %element_kid, i32 %basictype, i32 %length, i32 %header_size,
    i32 %element_shift, ptr addrspace(1) %src, i64 %offset, i64 %memcpy.length.in.bytes)
  "azul-late-inline"="0" alwaysinline nounwind willreturn "gc-leaf-function" {
entry:
  ; Widen the i32 arguments once; all size arithmetic below is done in i64.
  %hdr.bytes = zext i32 %header_size to i64
  %elem.shift = zext i32 %element_shift to i64
  %new.len = zext i32 %length to i64

  ; Heap footprint of the existing (source) array.
  %old.len.i32 = call i32 @azul.array_length(ptr addrspace(1) %src)
  %old.len = zext i32 %old.len.i32 to i64
  %old.bytes = call i64 @azul.compute_array_heap_size(
    i64 %old.len, i64 %elem.shift, i64 %hdr.bytes)

  ; For now realloc with a non-zero offset is not supported.
  %offset.is.zero = icmp eq i64 %offset, 0
  br i1 %offset.is.zero, label %size.dest, label %bail

size.dest:
  ; Heap footprint the destination array would have.
  %new.bytes = call i64 @azul.compute_array_heap_size(
    i64 %new.len, i64 %elem.shift, i64 %hdr.bytes)

  ; When the dest is smaller than the source, the space left over at the end
  ; of the source has to be covered with a filler object.
  %shrinks = icmp ult i64 %new.bytes, %old.bytes
  br i1 %shrinks, label %check.fillable, label %check.space

check.fillable:
  ; The tail of the array must be filled, which is done by marking the space
  ; with filler objects. That is only allowed when the underlying array is a
  ; primitive type array. Otherwise we would be writing garbage to reference
  ; fields which might be scanned by the GC in the meantime.
  ;
  ; Specifically, there can be a problem with the fixup pass
  ; (+GPGCDoFixupPasses). TLABs are not given up during the relocation
  ; safepoint and the top is recorded at the safepoint to mark the end of the
  ; parseable region. Shrinking here can coincide with the concurrent fixup
  ; following the safepoint.
  ;
  ; Only the tag bits really need testing, but since all object arrays share
  ; the same bits for the other subfields, a plain 32 bit comparison is enough.
  ; This avoids the need to shift and mask.
  %kid.lh = call i32 @azul.load_layout_helper(i32 %array_kid)
  %objarray.lh = load i32, ptr @Klass.object_array_layout_helper
  %is.objarray = icmp eq i32 %kid.lh, %objarray.lh
  br i1 %is.objarray, label %bail, label %check.space

check.space:
  ; Refuse the realloc if either the source or the dest array reaches the mid
  ; space minimum object size: such a realloc would change the GC space the
  ; object belongs to.
  %mid.min.words = load i64, ptr @GPGC_Layout.mid_space_min_object_size_words
  %word.bytes.i32 = load i32, ptr @azul.BytesInAWord
  %word.bytes = zext i32 %word.bytes.i32 to i64
  %mid.min.bytes = mul i64 %mid.min.words, %word.bytes
  %old.too.big = icmp uge i64 %old.bytes, %mid.min.bytes
  %new.too.big = icmp uge i64 %new.bytes, %mid.min.bytes
  %any.too.big = or i1 %old.too.big, %new.too.big
  br i1 %any.too.big, label %bail, label %patch

patch:
  ; All checks passed: delegate the in-heap rewrite.
  %patched = call ptr addrspace(1) @azul.patch_array_for_realloc(
    ptr addrspace(1) %src, i32 %array_kid, i32 %element_kid, i64 %new.len,
    i64 %hdr.bytes, i64 %memcpy.length.in.bytes, i64 %new.bytes,
    i64 %old.bytes, i1 %shrinks)
  ret ptr addrspace(1) %patched

bail:
  ; Null tells the caller to fall back to its slow path.
  ret ptr addrspace(1) null
}

; The @azul.new_array_realloc abstraction is used by the optimizer to rewrite
; the realloc-like pattern:
;
;   %src = @azul.new_array(...)
;   ...
;   %dest = @azul.new_array(...)
;   ...
;   memcpy(to: %dest, from: %src + %offset, length: %memcpy.length.in.bytes)
; =>
;   %src = @azul.new_array(...)
;   ...
;   %dest = @azul.new_array_realloc(..., %src, %offset, %memcpy.length.in.bytes)
;
; This kind of rewrite is done if %src is an unescaped allocation and it is not
; used after the memcpy.
;
; Note that safepoints are allowed between allocations of the source and the dest
; arrays.
;
; Implementation of the realloc abstraction can take advantage of the fact that
; the source array is not used after the realloc call and reuse the storage of
; the source array for the new allocation.
;
; Note that realloc can be done with a non-zero offset, effectively trimming
; first elements of the source array. Also %memcpy.length.in.bytes is not 
; necessarily the full length of the source array.
define ptr addrspace(1) @azul.new_array_realloc(
    i64 %javaThread, i32 orca_alloc_java_type_kid %array_kid, i32 %element_kid,
    i32 %basictype, i32 %length, i32 %header_size, i32 %element_shift,
    ptr addrspace(1) %src, i64 %offset, i64 %memcpy.length.in.bytes)
  "azul-late-inline"="1" noinline "vmstate-idempotent"="true" "consumes-replay-vmstate"
  nounwind "has-latent-use" {
entry:
  ; The in-place variant is attempted only when the flag is set.
  %inplace.allowed = load i1, ptr @FalconUseOptimizedReallocInPlace
  br i1 %inplace.allowed, label %attempt.inplace, label %alloc.and.copy

attempt.inplace:
  ; A null result means the fast path declined; fall back to allocate + copy.
  %reused = call ptr addrspace(1) @azul.new_array_realloc_fast(i32 %array_kid, i32 %element_kid,
    i32 %basictype, i32 %length, i32 %header_size, i32 %element_shift,
    ptr addrspace(1) %src, i64 %offset, i64 %memcpy.length.in.bytes)
  %reuse.failed = icmp eq ptr addrspace(1) %reused, null
  br i1 %reuse.failed, label %alloc.and.copy, label %inplace.done

inplace.done:
  ret ptr addrspace(1) %reused

alloc.and.copy:
  ; Slow path fallback: allocate a brand new array and copy the requested
  ; part of the source array into it.
  %fresh = call ptr addrspace(1) @azul.new_array(i64 %javaThread,
          i32 %array_kid, i32 %element_kid, i32 %basictype,
          i32 %length, i32 %header_size, i32 %element_shift, i64 0) [ "deopt"() ]

  %hdr.bytes = zext i32 %header_size to i64

  ; copy.to = fresh + header_size
  %copy.to = getelementptr inbounds i8, ptr addrspace(1) %fresh, i64 %hdr.bytes

  ; copy.from = src + header_size + offset
  %copy.from.off = add i64 %hdr.bytes, %offset
  %copy.from = getelementptr inbounds i8, ptr addrspace(1) %src, i64 %copy.from.off

  ; Atomicity is not a concern here (note element_size = 1) because both
  ; %src and the new array are unescaped.
  call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(
       ptr addrspace(1) align 1 elementtype(i8) %copy.to,
       ptr addrspace(1) align 1 elementtype(i8) %copy.from,
       i64 %memcpy.length.in.bytes,
       i32 1) [ "deopt"() ]

  ret ptr addrspace(1) %fresh
}

;; Allocation fastpath, args (i64 %klassID, i64 %size).
;; Does not safepoint (i.e. is a leaf function).  Returns non-null on
;; success, and null on TLAB-full.
;;
;; Declared with the zing_stub_new_fast calling convention; the returned
;; reference is marked noalias and the declaration carries the
;; "allocation-site" attribute.
declare zing_stub_new_fast noalias ptr addrspace(1) @"StubRoutines::new_fast_orca()"(i64, i64)
    willreturn "gc-leaf-function" "vmstate-idempotent"="true" nounwind "allocation-site"

; Returns the 32-bit layout helper of the klass identified by %kid: resolves
; the klass via @azul.load_klass and reads the i32 stored at
; @klassOopDesc.layout_helper_offset_in_bytes bytes into it.
define i32 @azul.load_layout_helper(i32 %kid)
     willreturn "gc-leaf-function" "vmstate-idempotent"="true" alwaysinline "azul-late-inline"="0" nounwind {
entry:
  %lh.offset = load i32, ptr @klassOopDesc.layout_helper_offset_in_bytes

  %klass.ref = call ptr addrspace(1) @azul.load_klass(i32 %kid)
  %lh.addr = getelementptr i8, ptr addrspace(1) %klass.ref, i32 %lh.offset
  ; Unordered atomic, 4-byte aligned read of the layout helper word.
  %lh = load atomic i32, ptr addrspace(1) %lh.addr unordered, align 4, !tbaa !26
  ret i32 %lh
}

; Extracts the log2-element-size subfield of layout helper %lh:
;   (%lh >> log2_element_size_shift) & log2_element_size_mask
; with shift and mask read from the corresponding @Klass.* globals.
define i32 @azul.layout_helper_log2_element_size(i32 %lh)
    nounwind alwaysinline readnone "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  %field.shift = load i32, ptr @Klass.layout_helper_log2_element_size_shift
  %field.mask = load i32, ptr @Klass.layout_helper_log2_element_size_mask

  %lh.shifted = lshr i32 %lh, %field.shift
  %log2.esize = and i32 %lh.shifted, %field.mask
  ret i32 %log2.esize
}

; Extracts the header-size subfield of layout helper %lh:
;   (%lh >> header_size_shift) & header_size_mask
; with shift and mask read from the corresponding @Klass.* globals.
define i32 @azul.layout_helper_header_size(i32 %lh)
    nounwind alwaysinline readnone "azul-late-inline"="0" willreturn "gc-leaf-function" {
entry:
  %field.shift = load i32, ptr @Klass.layout_helper_header_size_shift
  %field.mask = load i32, ptr @Klass.layout_helper_header_size_mask

  %lh.shifted = lshr i32 %lh, %field.shift
  %hdr.size = and i32 %lh.shifted, %field.mask
  ret i32 %hdr.size
}

;; JBA that allocates new *objects* (specifically, not arrays).  It first
;; tries the fastpath (@azul.new_instance_fast) and falls back to the
;; slowpath stub when the fastpath is disabled, not permitted for this klass,
;; or fails.

define noalias nonnull noundef ptr addrspace(1) @azul.new_instance(
    i64 %javaThread, i32 orca_alloc_java_type_kid %kid)
    "vmstate-idempotent"="true" nounwind "azul-late-inline"="1" noinline "consumes-replay-vmstate" {
entry:
  %lh = call i32 @azul.load_layout_helper(i32 %kid)
  %slow.bit = load i32, ptr @Klass.layout_helper_needs_slow_path_bit
  %size.mask = load i32, ptr @Klass.layout_helper_size_in_bytes_mask
  %hdr.i32 = load i32, ptr @oopDesc.headerSize
  %hdr.bytes = zext i32 %hdr.i32 to i64
  ; Instance size in bytes is encoded in the layout helper itself.
  %obj.bytes.i32 = and i32 %size.mask, %lh
  %obj.bytes = zext i32 %obj.bytes.i32 to i64

  %fast.enabled = load i1, ptr @NewInstanceFastPath.flag
  br i1 %fast.enabled, label %check.klass, label %call.runtime

check.klass:
  ; A set "needs slow path" bit in the layout helper forces the slow path.
  %slow.bits = and i32 %slow.bit, %lh
  %must.go.slow = icmp ne i32 %slow.bits, 0
  br i1 %must.go.slow, label %call.runtime, label %try.tlab, !prof !0

try.tlab:
  ; A null result from the fast allocator means it could not allocate.
  %obj.fast = call noalias ptr addrspace(1) @azul.new_instance_fast(
    i32 orca_alloc_java_type_kid %kid, i64 %hdr.bytes, i64 %obj.bytes)
  %got.obj = icmp ne ptr addrspace(1) %obj.fast, null
  br i1 %got.obj, label %return.fast, label %call.runtime, !prof !9

return.fast:
  ret ptr addrspace(1) %obj.fast

call.runtime:
  ; The slow path stub takes the klass id widened to i64.
  %kid.wide = zext i32 %kid to i64
  %obj.slow = call zing_stub_default noalias ptr addrspace(1) @"StubRoutines::dolphin_new_instance()"(
            i64 %javaThread,
            i64 orca_alloc_java_type_kid %kid.wide) [ "deopt"() ]
  ret ptr addrspace(1) %obj.slow
}

; memset intrinsic for default address space pointers with an i64 length;
; used by @azul.tlab_zero for chunked zeroing.
declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)

; Issues prefetches covering %bytes_to_prefetch bytes, beginning at
; "%start+@FalconTLABPrefetchDistance" and advancing 64 bytes per iteration.
; The loop body runs before the bound is tested, so at least one prefetch is
; always issued.
define void @azul.tlab_prefetch(ptr nonnull %start, i64 %bytes_to_prefetch)
    alwaysinline nounwind willreturn "gc-leaf-function" "azul-late-inline"="0" {
entry:
  ; Skip ahead by the configured prefetch distance.
  %distance = load i32, ptr @FalconTLABPrefetchDistance
  %base = getelementptr i8, ptr %start, i32 %distance
  br label %next.line

next.line:
  %done.bytes = phi i64 [ 0, %entry ], [ %done.bytes.next, %next.line ]
  %line.addr = getelementptr i8, ptr %base, i64 %done.bytes
  ; rw = 1 (write), locality = 3, cache type = 1 (data).
  call void @llvm.prefetch.p0(ptr %line.addr, i32 1, i32 3, i32 1)
  ; Advance by one cache line. TODO: shouldn't be hardcoded
  %done.bytes.next = add i64 %done.bytes, 64
  %more = icmp ult i64 %done.bytes.next, %bytes_to_prefetch
  br i1 %more, label %next.line, label %exit

exit:
  ret void
}

; Zeroes %bytes_to_zero bytes starting at %zero.addr.init.
;
; Two strategies are implemented:
;   * one-piece zeroing with 'rep stosb', chosen when @FalconTLABUseFastStosb
;     is set and %bytes_to_zero >= @FalconTLABMinSizeForFastStosb;
;   * chunked zeroing otherwise: whole chunks of
;     @Thread.tlab_zregion_size_bytes bytes, then the remainder, each chunk
;     preceded by a call to @azul.tlab_prefetch.
define void @azul.tlab_zero(
  ptr nonnull %zero.addr.init, i64 %bytes_to_zero)
    alwaysinline nounwind willreturn "gc-leaf-function" "azul-late-inline"="0" {
entry:
  %zregion_size = load i32, ptr @Thread.tlab_zregion_size_bytes
  %zregion_size.i64 = zext i32 %zregion_size to i64
  %use_stosb = load i1, ptr @FalconTLABUseFastStosb
  %min_stosb_size = load i64, ptr @FalconTLABMinSizeForFastStosb
  ; One-piece zeroing needs both the flag and a large enough request.
  %is_large_object = icmp uge i64 %bytes_to_zero, %min_stosb_size
  %do_one_piece_zeroing = and i1 %use_stosb, %is_large_object
  br i1 %do_one_piece_zeroing, label %zero_in_one_piece, label %zero_chunked

zero_chunked:
  ; Loop header: current address and number of bytes still to be zeroed.
  %zero.addr = phi ptr [ %zero.addr.init, %entry ], [ %zero.addr.next, %zero_region ]
  %rem.bytes = phi i64 [ %bytes_to_zero, %entry ], [ %rem.bytes.next, %zero_region ]
  %has_enough_bytes = icmp uge i64 %rem.bytes, %zregion_size.i64
  br i1 %has_enough_bytes, label %zero_region, label %zero_remainder

zero_region:
  ; Zero one full chunk and advance.
  call void @azul.tlab_prefetch(ptr %zero.addr, i64 %zregion_size.i64)
  call void @llvm.memset.p0.i64(ptr align 8 %zero.addr, i8 0, i64 %zregion_size.i64, i1 false)
  %rem.bytes.next = sub i64 %rem.bytes, %zregion_size.i64
  %zero.addr.next = getelementptr i8, ptr %zero.addr, i64 %zregion_size.i64
  br label %zero_chunked

zero_remainder:
  ; Fewer bytes than one chunk are left (possibly zero).
  call void @azul.tlab_prefetch(ptr %zero.addr, i64 %rem.bytes)
  call void @llvm.memset.p0.i64(ptr align 8 %zero.addr, i8 0, i64 %rem.bytes, i1 false)
  ret void

zero_in_one_piece:
  ;call void @llvm.memset.p0.i64(ptr align 8 %zero.addr.init, i8 0, i64 %bytes_to_zero, i1 false)
  ; As measured on allocation intensive workloads 'rep stosb' is beneficial for
  ; large allocations on modern x86 CPUs. Unfortunately, we can't (don't want
  ; to) rely on the 'memset' provided by the system as there are known cases of
  ; sub-optimal implementation ('rep stosb' is not used).
  ; Alternative (better?) approach would be to ship optimized memset with Zing.
  ;
  ; Constraints: outputs are rdi and rcx; the address and the byte count are
  ; tied to those outputs ("0", "1"); the fill value 0 is passed in rax.
  ; NOTE(review): this path is x86-specific; presumably
  ; @FalconTLABUseFastStosb is never set on other targets - confirm.
  call { ptr, i64 } asm sideeffect "rep stosb", "={rdi},={rcx},0,1,{rax}"(ptr align 8 %zero.addr.init, i64 %bytes_to_zero, i64 0) nounwind willreturn  "gc-leaf-function"
  ret void
}

; Initializes a freshly carved out object at %top and returns %top.
;
; The mark word (the first i64 of the object) is always set to
; %kid << @markWord.kid_shift. When @FalconNewInstanceVersion is 4, the bytes
; after the header (%size_in_bytes.i64 - %header_size.i64 of them) are
; additionally zeroed via @azul.tlab_zero; for every other version nothing
; beyond the mark word is written here.
define nonnull ptr @azul.init_object(
  ptr nonnull %top, i32 %kid, i64 %header_size.i64, i64 %size_in_bytes.i64)
     alwaysinline nounwind willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  ;; Initialize mark_word
  %kid.wide = zext i32 %kid to i64
  %mark.kid.shift = load i64, ptr @markWord.kid_shift
  %mark = shl i64 %kid.wide, %mark.kid.shift
  store i64 %mark, ptr %top

  %version = load i32, ptr @FalconNewInstanceVersion
  %is.ver4 = icmp eq i32 %version, 4
  br i1 %is.ver4, label %zero.body, label %done

zero.body:
  ; Leave the object's header alone; only the body gets zeroed.
  %body.bytes = sub i64 %size_in_bytes.i64, %header_size.i64
  %body.start = getelementptr i8, ptr %top, i64 %header_size.i64
  call void @azul.tlab_zero(ptr nonnull %body.start, i64 %body.bytes)
  br label %done

done:
  ret ptr %top
}

;; Returns true if %new_top is strictly below the end of the current zero
;; region (i.e. there is enough space in it), false otherwise.
;;
;; This version reads the cached end of the zero region through
;; @azul.get_current_tlab_zend_addr.
define i1 @azul.is_inside_zero_region_ver2(ptr %new_top)
     readonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  %zend.slot = call ptr addrspace(256) @azul.get_current_tlab_zend_addr()
  ; Zeroing is done by at least one cache line -> at least 64 byte alignment
  %zero.end = load ptr, ptr addrspace(256) %zend.slot, !align !{i64 64}
  %fits = icmp ult ptr %new_top, %zero.end
  ret i1 %fits
}

;; Returns true if there is enough space in the current zero region, false
;; otherwise. A null %top always yields false.
;;
;; This version uses an XOR based comparison: the check passes when
;; (%top xor %new_top) is unsigned-less-than @Thread.tlab_zregion_size_bytes.
;; NOTE(review): this equals "same size-aligned region" only if the region
;; size is a power of two - presumably it is; confirm.
;;
define i1 @azul.is_inside_zero_region_ver3(ptr %top, ptr %new_top)
     readonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  %top.bits = ptrtoint ptr %top to i64, !verifier_exception !1
  %tlab.live = icmp ne i64 %top.bits, 0
  br i1 %tlab.live, label %xor.check, label %reject, !prof !9

xor.check:
  %new_top.bits = ptrtoint ptr %new_top to i64
  %diff.bits = xor i64 %top.bits, %new_top.bits
  %region.i32 = load i32, ptr @Thread.tlab_zregion_size_bytes
  %region.bytes = zext i32 %region.i32 to i64
  %same.region = icmp ult i64 %diff.bits, %region.bytes
  ret i1 %same.region

reject:
  ret i1 false
}

;; Returns true if there is enough space in the TLAB, false otherwise:
;; %new_top must be strictly below the pointer returned by
;; @azul.get_current_tlab_end.
define i1 @azul.is_inside_tlab(ptr %new_top)
     readonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"
{
entry:
  %limit = call ptr @azul.get_current_tlab_end()
  %fits = icmp ult ptr %new_top, %limit
  ret i1 %fits
}

; Prefetch intrinsic for addrspace(1) pointers; operands are the address, a
; read/write selector, a locality hint and a cache type selector.
declare void @llvm.prefetch.p1(ptr addrspace(1) readonly %address, i32 %rw, i32 %locality, i32 %cache_type)

; Fast object allocation. Returns the new object, or null when the allocation
; could not be satisfied here.
;
; The strategy is selected by @FalconNewInstanceVersion:
;   2 - bump the TLAB top if %new_top passes @azul.is_inside_zero_region_ver2,
;       otherwise use the stub;
;   3 - same, using @azul.is_inside_zero_region_ver3;
;   4 - bump the TLAB top if %new_top passes @azul.is_inside_tlab, otherwise
;       return null;
;   any other value - call "StubRoutines::new_fast_orca()".
define noalias ptr addrspace(1) @azul.new_instance_fast(
    i32 orca_alloc_java_type_kid %kid, i64 %header_size.i64, i64 %size_in_bytes.i64)
     noinline readonly nounwind willreturn "gc-leaf-function" "azul-late-inline"="3" "allocation-site"
{
entry:
  %kid.i64 = zext i32 %kid to i64
  ; Current TLAB top and the top after carving out %size_in_bytes.i64 bytes.
  %top_addr = call ptr addrspace(256) @azul.get_current_tlab_top_addr()
  %top = load ptr, ptr addrspace(256) %top_addr, !nonnull !{}
  %new_top = getelementptr i8, ptr %top, i64 %size_in_bytes.i64

  %new_instance_ver = load i32, ptr @FalconNewInstanceVersion
  switch i32 %new_instance_ver, label %fast_path.default
                        [ i32 2, label %fast_path.ver2
                          i32 3, label %fast_path.ver3
                          i32 4, label %fast_path.ver4 ]

fast_path.ver2:
  %is_inside_zero_region.ver2 = call i1 @azul.is_inside_zero_region_ver2(ptr %new_top) 
  br i1 %is_inside_zero_region.ver2, label %init_and_return, label %fast_path.default, !prof !10

fast_path.ver3:
  %is_inside_zero_region.ver3 = call i1 @azul.is_inside_zero_region_ver3(ptr %top, ptr %new_top) 
  br i1 %is_inside_zero_region.ver3, label %init_and_return, label %fast_path.default, !prof !10

fast_path.ver4:
  %is_inside_tlab = call i1 @azul.is_inside_tlab(ptr %new_top)
  br i1 %is_inside_tlab, label %init_and_return, label %fail, !prof !9
  
init_and_return:
  ; Publish the bumped top first, then initialize the object at the old top
  ; and pass it through the allocation barrier, which yields the
  ; addrspace(1) reference that is returned.
  store ptr %new_top, ptr addrspace(256) %top_addr
  %newObject.fast = call nonnull ptr @azul.init_object(
    ptr %top, i32 %kid, i64 %header_size.i64, i64 %size_in_bytes.i64)
  %newObject.release = call nonnull ptr addrspace(1) @llvm.orca.alloc.barrier.p1.p0(ptr %newObject.fast)
  ret ptr addrspace(1) %newObject.release

fast_path.default:
  ;; This is the main fast allocation path for NewInstanceVersion == 0 or
  ;; NewInstanceVersion == 1. It is also the fallback for versions 2 and 3
  ;; when their zero region check fails.
  %newObject = call zing_stub_new_fast noalias ptr addrspace(1) @"StubRoutines::new_fast_orca()"(
    i64 orca_alloc_java_type_kid %kid.i64, i64 %size_in_bytes.i64)
  ret ptr addrspace(1) %newObject
  
fail:
  ret ptr addrspace(1) null
}

; Publication barrier abstraction. The (unnamed) object argument is not read;
; the body only emits a target specific barrier selected by
; @FalconAARCH64Build.
define void @azul.final_publication_barrier(ptr addrspace(1) nocapture readnone "removable-allocation-use")
    nounwind noinline "azul-late-inline"="1"
    "vmstate-idempotent"="true" willreturn "gc-leaf-function" "has-latent-use" {
entry:
  %on.aarch64 = load i1, ptr @FalconAARCH64Build
  br i1 %on.aarch64, label %arm.barrier, label %x86.barrier

arm.barrier:
  ; StoreStore equivalent on arm, weaker than fence release
  call void @llvm.aarch64.dmb(i32 10)
  ret void

x86.barrier:
  fence release
  ret void
}

; Branch weights 2:1 (first successor : second successor); referenced as
; !prof !10, e.g. by the zero region checks in @azul.new_instance_fast.
!10 = !{ !"branch_weights", i32 2, i32 1 }

; Maps to SharedRuntime::register_finalizer.
; Takes an i64 and a ptr addrspace(1) and returns a ptr addrspace(1).
; NOTE(review): presumably (java thread, object) -> object, by analogy with
; the i64 %javaThread arguments used elsewhere in this file - confirm.
declare ptr addrspace(1) @azul.register_finalizer(i64, ptr addrspace(1)) "consumes-replay-vmstate"

;; JBA for the 'f2i' bytecode: converts a float to an int according to the
;; java specification:
;; https://docs.oracle.com/javase/specs/jvms/se7/html/jvms-6.html#jvms-6.5.f2i
;;
;; NaN yields 0; values that do not fit are clamped to @JIntMin / @JIntMax.
define i32 @azul.f2i(float %arg)
  nounwind alwaysinline readnone
  "azul-late-inline"="0"
  "vmstate-idempotent"="true"
  willreturn "gc-leaf-function"  {
entry:
  %on.aarch64 = load i1, ptr @FalconAARCH64Build
  br i1 %on.aarch64, label %arm.convert, label %x86.convert

arm.convert:
  ; The saturating conversion is used as is on AArch64.
  %sat = call i32 @llvm.fptosi.sat.i32.f32(float %arg)
  ret i32 %sat

;; TODO: Check possibility to use llvm.fptosi.sat for x86_64.
x86.convert:
  %vec = insertelement <4 x float> undef, float %arg, i32 0
  %raw = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %vec)
  %int.min = load i32, ptr @JIntMin ; == 0x80000000
  ; cvttss2si returns 0x80000000 when the result does not fit into 32 bits
  ; or when the input was NaN, so that value needs a closer look.
  %is.sentinel = icmp eq i32 %raw, %int.min
  br i1 %is.sentinel, label %fixup, label %in.range, !prof !8

in.range:
  ret i32 %raw

fixup:
  %arg.is.nan = fcmp uno float %arg, 0.0
  br i1 %arg.is.nan, label %ret.zero, label %saturate

ret.zero:
  ret i32 0

saturate:
  %positive = fcmp ogt float %arg, 0.0
  %int.max = load i32, ptr @JIntMax
  ; For a non-positive input %raw already holds @JIntMin and is returned as is.
  %clamped = select i1 %positive, i32 %int.max, i32 %raw
  ret i32 %clamped
}
; Intrinsics used by @azul.f2i: the x86 SSE truncating float -> i32
; conversion and the saturating float -> i32 conversion.
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
declare i32 @llvm.fptosi.sat.i32.f32(float) nounwind readnone

;; JBA for the 'f2l' bytecode: converts a float to a long according to the
;; java specification:
;; https://docs.oracle.com/javase/specs/jvms/se7/html/jvms-6.html#jvms-6.5.f2l
;;
;; NaN yields 0; values that do not fit are clamped to @JLongMin / @JLongMax.
define i64 @azul.f2l(float %arg)
  nounwind alwaysinline readnone
  "azul-late-inline"="0"
  "vmstate-idempotent"="true"
  willreturn "gc-leaf-function"  {
entry:
  %on.aarch64 = load i1, ptr @FalconAARCH64Build
  br i1 %on.aarch64, label %arm.convert, label %x86.convert

arm.convert:
  ; The saturating conversion is used as is on AArch64.
  %sat = call i64 @llvm.fptosi.sat.i64.f32(float %arg)
  ret i64 %sat

;; TODO: Check possibility to use llvm.fptosi.sat for x86_64.
x86.convert:
  %vec = insertelement <4 x float> undef, float %arg, i32 0
  %raw = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %vec)
  %long.min = load i64, ptr @JLongMin ; == 0x8000000000000000
  ; cvttss2si64 returns 0x8000000000000000 when the result does not fit into
  ; 64 bits or when the input was NaN, so that value needs a closer look.
  %is.sentinel = icmp eq i64 %raw, %long.min
  br i1 %is.sentinel, label %fixup, label %in.range, !prof !8

in.range:
  ret i64 %raw

fixup:
  %arg.is.nan = fcmp uno float %arg, 0.0
  br i1 %arg.is.nan, label %ret.zero, label %saturate

ret.zero:
  ret i64 0

saturate:
  %positive = fcmp ogt float %arg, 0.0
  %long.max = load i64, ptr @JLongMax
  ; For a non-positive input %raw already holds @JLongMin and is returned as is.
  %clamped = select i1 %positive, i64 %long.max, i64 %raw
  ret i64 %clamped
}
; Intrinsics used by @azul.f2l: the x86 SSE truncating float -> i64
; conversion and the saturating float -> i64 conversion.
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
declare i64 @llvm.fptosi.sat.i64.f32(float) nounwind readnone

;; JBA for the 'd2i' bytecode: converts a double to an int according to the
;; java specification:
;; https://docs.oracle.com/javase/specs/jvms/se7/html/jvms-6.html#jvms-6.5.d2i
;;
;; NaN yields 0; values that do not fit are clamped to @JIntMin / @JIntMax.
define i32 @azul.d2i(double %arg)
  nounwind alwaysinline readnone
  "azul-late-inline"="0"
  "vmstate-idempotent"="true"
  willreturn "gc-leaf-function"  {
entry:
  %on.aarch64 = load i1, ptr @FalconAARCH64Build
  br i1 %on.aarch64, label %arm.convert, label %x86.convert

arm.convert:
  ; The saturating conversion is used as is on AArch64.
  %sat = call i32 @llvm.fptosi.sat.i32.f64(double %arg)
  ret i32 %sat

;; TODO: Check possibility to use llvm.fptosi.sat for x86_64.
x86.convert:
  %vec = insertelement <2 x double> undef, double %arg, i32 0
  %raw = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %vec)
  %int.min = load i32, ptr @JIntMin ; == 0x80000000
  ; cvttsd2si returns 0x80000000 when the result does not fit into 32 bits
  ; or when the input was NaN, so that value needs a closer look.
  %is.sentinel = icmp eq i32 %raw, %int.min
  br i1 %is.sentinel, label %fixup, label %in.range, !prof !8

in.range:
  ret i32 %raw

fixup:
  %arg.is.nan = fcmp uno double %arg, 0.0
  br i1 %arg.is.nan, label %ret.zero, label %saturate

ret.zero:
  ret i32 0

saturate:
  %positive = fcmp ogt double %arg, 0.0
  %int.max = load i32, ptr @JIntMax
  ; For a non-positive input %raw already holds @JIntMin and is returned as is.
  %clamped = select i1 %positive, i32 %int.max, i32 %raw
  ret i32 %clamped
}
; Intrinsics used by @azul.d2i: the x86 SSE2 truncating double -> i32
; conversion and the saturating double -> i32 conversion.
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
declare i32 @llvm.fptosi.sat.i32.f64(double) nounwind readnone

;; JBA for the 'd2l' bytecode: converts a double to a long according to the
;; java specification:
;; https://docs.oracle.com/javase/specs/jvms/se7/html/jvms-6.html#jvms-6.5.d2l
;;
;; NaN yields 0; values that do not fit are clamped to @JLongMin / @JLongMax.
define i64 @azul.d2l(double %arg)
  nounwind alwaysinline readnone
  "azul-late-inline"="0"
  "vmstate-idempotent"="true"
  willreturn "gc-leaf-function"  {
entry:
  %on.aarch64 = load i1, ptr @FalconAARCH64Build
  br i1 %on.aarch64, label %arm.convert, label %x86.convert

arm.convert:
  ; The saturating conversion is used as is on AArch64.
  %sat = call i64 @llvm.fptosi.sat.i64.f64(double %arg)
  ret i64 %sat

;; TODO: Check possibility to use llvm.fptosi.sat for x86_64.
x86.convert:
  %vec = insertelement <2 x double> undef, double %arg, i32 0
  %raw = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %vec)
  %long.min = load i64, ptr @JLongMin ; == 0x8000000000000000
  ; cvttsd2si64 returns 0x8000000000000000 when the result does not fit into
  ; 64 bits or when the input was NaN, so that value needs a closer look.
  %is.sentinel = icmp eq i64 %raw, %long.min
  br i1 %is.sentinel, label %fixup, label %in.range, !prof !8

in.range:
  ret i64 %raw

fixup:
  %arg.is.nan = fcmp uno double %arg, 0.0
  br i1 %arg.is.nan, label %ret.zero, label %saturate

ret.zero:
  ret i64 0

saturate:
  %positive = fcmp ogt double %arg, 0.0
  %long.max = load i64, ptr @JLongMax
  ; For a non-positive input %raw already holds @JLongMin and is returned as is.
  %clamped = select i1 %positive, i64 %long.max, i64 %raw
  ret i64 %clamped
}
; Intrinsics used by @azul.d2l: the x86 SSE2 truncating double -> i64
; conversion and the saturating double -> i64 conversion.
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
declare i64 @llvm.fptosi.sat.i64.f64(double) nounwind readnone

; Stub routines comparing two arrays, declared with the
; zing_prim_arrays_equal calling convention. Each takes two read-only array
; references and returns an i32.
; NOTE(review): the _1/_2/_4/_8 suffix presumably is the element size in
; bytes - confirm against the stub generator.
declare zing_prim_arrays_equal i32 @"StubRoutines::primArraysEqual_1()"(
          ptr addrspace(1) readonly %left,
          ptr addrspace(1) readonly %right)
        nounwind willreturn "gc-leaf-function" argmemonly readonly
declare zing_prim_arrays_equal i32 @"StubRoutines::primArraysEqual_2()"(
          ptr addrspace(1) readonly %left,
          ptr addrspace(1) readonly %right)
        nounwind willreturn "gc-leaf-function" argmemonly readonly
declare zing_prim_arrays_equal i32 @"StubRoutines::primArraysEqual_4()"(
          ptr addrspace(1) readonly %left,
          ptr addrspace(1) readonly %right)
        nounwind willreturn "gc-leaf-function" argmemonly readonly
declare zing_prim_arrays_equal i32 @"StubRoutines::primArraysEqual_8()"(
          ptr addrspace(1) readonly %left,
          ptr addrspace(1) readonly %right)
        nounwind willreturn "gc-leaf-function" argmemonly readonly

; Helper functions for the _equals* intrinsics. Only declared here; each
; takes the two array references and returns an i32 result.
; In this file _equalsB uses the _1 variant, _equalsC and _equalsS the _2
; variant, _equalsI the _4 variant and _arraysEqualsL the _8 variant;
; _stringEquals uses _1 and _2.
declare i32 @arrays_equal_impl_1(ptr addrspace(1) %left, ptr addrspace(1) %right)
  readonly argmemonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"

declare i32 @arrays_equal_impl_2(ptr addrspace(1) %left, ptr addrspace(1) %right)
  readonly argmemonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"

declare i32 @arrays_equal_impl_4(ptr addrspace(1) %left, ptr addrspace(1) %right)
  readonly argmemonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"

declare i32 @arrays_equal_impl_8(ptr addrspace(1) %left, ptr addrspace(1) %right)
  readonly argmemonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0"

; Top-level _equalsB entry point: forwards both array references to
; @arrays_equal_impl_1 and returns its result unchanged.
define zing i32 @_equalsB(ptr addrspace(1) %left, ptr addrspace(1) %right)
  argmemonly readonly nounwind "alwaysinline-top-level" {
start:
  %eq = call i32 @arrays_equal_impl_1(ptr addrspace(1) %left, ptr addrspace(1) %right)
  ret i32 %eq
}
; Top-level _equalsC entry point: forwards both array references to
; @arrays_equal_impl_2 and returns its result unchanged.
define zing i32 @_equalsC(ptr addrspace(1) %left, ptr addrspace(1) %right)
  argmemonly readonly nounwind "alwaysinline-top-level" {
start:
  %eq = call i32 @arrays_equal_impl_2(ptr addrspace(1) %left, ptr addrspace(1) %right)
  ret i32 %eq
}
; Top-level _equalsS entry point: forwards both array references to
; @arrays_equal_impl_2 and returns its result unchanged.
define zing i32 @_equalsS(ptr addrspace(1) %left, ptr addrspace(1) %right)
  argmemonly readonly nounwind "alwaysinline-top-level" {
start:
  %eq = call i32 @arrays_equal_impl_2(ptr addrspace(1) %left, ptr addrspace(1) %right)
  ret i32 %eq
}
; Top-level _equalsI entry point: forwards both array references to
; @arrays_equal_impl_4 and returns its result unchanged.
define zing i32 @_equalsI(ptr addrspace(1) %left, ptr addrspace(1) %right)
  argmemonly readonly nounwind "alwaysinline-top-level" {
start:
  %eq = call i32 @arrays_equal_impl_4(ptr addrspace(1) %left, ptr addrspace(1) %right)
  ret i32 %eq
}
; Top-level _arraysEqualsL entry point: forwards both array references to
; @arrays_equal_impl_8 and returns its result unchanged.
define zing i32 @_arraysEqualsL(ptr addrspace(1) %left, ptr addrspace(1) %right)
  argmemonly readonly nounwind "alwaysinline-top-level" {
start:
  %eq = call i32 @arrays_equal_impl_8(ptr addrspace(1) %left, ptr addrspace(1) %right)
  ret i32 %eq
}

; The optimizer may replace a call to this abstraction with a constant when
; it can prove that %obj is a known value. If it does not, the call is
; inlined with the body below, which conservatively answers false.
define i1 @azul.is_known_value(ptr addrspace(1) %obj)
  nounwind noinline readonly "azul-late-inline"="1" willreturn "gc-leaf-function" "vmstate-idempotent"="true" {
entry:
  ret i1 false
}

; Top-level _isCompileConstant entry point: returns the i1 result of
; @azul.is_known_value for %obj, zero-extended to i32 (0 or 1).
define zing i32 @_isCompileConstant(ptr addrspace(1) %obj)
  argmemonly readonly nounwind "alwaysinline-top-level" {
entry:
  %known = call i1 @azul.is_known_value(ptr addrspace(1) %obj)
  %known.i32 = zext i1 %known to i32
  ret i32 %known.i32
}

; Helper for the string intrinsics: loads the 'value' field of the String
; object %arg, i.e. the reference stored
; @java_lang_String.value_offset_in_bytes bytes into the object.
define ptr addrspace(1) @azul.get_string_value(ptr addrspace(1) %arg)
  readonly argmemonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0" {
entry:
  %value.off = load i32, ptr @java_lang_String.value_offset_in_bytes

  %value.slot = getelementptr inbounds i8, ptr addrspace(1) %arg, i32 %value.off
  ; The loaded reference is either null or dereferenceable for 12 bytes.
  %value.ref = call dereferenceable_or_null(12) ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %value.slot)
  ret ptr addrspace(1) %value.ref
}

; Helper for the string intrinsics: loads the i8 'coder' field of the String
; object %arg, located @java_lang_String.coder_offset_in_bytes bytes into the
; object.
define i8 @azul.get_string_coder(ptr addrspace(1) %arg)
  readonly argmemonly nounwind alwaysinline willreturn "gc-leaf-function" "azul-late-inline"="0" {
entry:
  %coder.off = load i32, ptr @java_lang_String.coder_offset_in_bytes
  %coder.addr = getelementptr inbounds i8, ptr addrspace(1) %arg, i32 %coder.off
  %coder.byte = load i8, ptr addrspace(1) %coder.addr
  ret i8 %coder.byte
}

; Top-level _stringEquals entry point. Returns 1 when %this and %arg are the
; same reference, 0 when %arg is null or fails the type check, and otherwise
; the result of comparing the two 'value' arrays (byte-wise or char-wise,
; depending on the string representation in use).
define zing i32 @_stringEquals(ptr addrspace(1) %this, ptr addrspace(1) %arg)
  argmemonly readonly nounwind "alwaysinline-top-level" {
entry:
  %same.ref = icmp eq ptr addrspace(1) %this, %arg
  br i1 %same.ref, label %ret.true, label %check.null

check.null:
  %arg.null = icmp eq ptr addrspace(1) %arg, null
  br i1 %arg.null, label %ret.false, label %check.type

check.type:
  ; %this is the receiver, so it is always of String klass. In most cases
  ; its klass id gets replaced with a constant due to the known type.
  %this.kid = call i32 @azul.get_klass_id(ptr addrspace(1) %this)
  %other.kid = call i32 @azul.get_klass_id(ptr addrspace(1) %arg)
  %other.is.string = call i1 @azul.is_subtype_of(i32 %this.kid, i32 %other.kid)
  br i1 %other.is.string, label %pick.layout, label %ret.false

pick.layout:
  %this.chars = call ptr addrspace(1) @azul.get_string_value(ptr addrspace(1) %this)
  %other.chars = call ptr addrspace(1) @azul.get_string_value(ptr addrspace(1) %arg)

  ; The java version and the CompactStrings flag determine whether the
  ; internal string representation is byte[] or char[].
  %jdk.major = load i32, ptr @JDK_VERSION_MAJOR
  %post.jdk8 = icmp ugt i32 %jdk.major, 8
  br i1 %post.jdk8, label %bytes.layout, label %check.compact

check.compact:
  %compact = load i1, ptr @CompactStrings
  br i1 %compact, label %bytes.layout, label %chars.layout

bytes.layout:
  ; Strings with different coders are never equal.
  %this.coder = call i8 @azul.get_string_coder(ptr addrspace(1) %this)
  %other.coder = call i8 @azul.get_string_coder(ptr addrspace(1) %arg)
  %same.coder = icmp eq i8 %this.coder, %other.coder
  br i1 %same.coder, label %compare.bytes, label %ret.false

compare.bytes:
  %eq.bytes = call i32 @arrays_equal_impl_1(ptr addrspace(1) %this.chars, ptr addrspace(1) %other.chars)
  ret i32 %eq.bytes

chars.layout:
  %eq.chars = call i32 @arrays_equal_impl_2(ptr addrspace(1) %this.chars, ptr addrspace(1) %other.chars)
  ret i32 %eq.chars

ret.true:
  ret i32 1

ret.false:
  ret i32 0
}

; Top-level _getCharStringU entry point: reads the 16-bit value at char index
; %1 of the array %0 and returns it zero-extended to i32.
;
; The byte offset of the value is  array header size + (index << 1). The
; header size is read from @arrayOopDesc.byteArrayHeaderSize rather than being
; hard-coded, so that this getter always addresses the same slot as
; @_putCharStringU does for the same index.
define zing noundef i32 @_getCharStringU(ptr addrspace(1) noundef %0, i32 noundef %1) {
  ; Two bytes per char.
  %index = shl i32 %1, 1
  %index.64 = zext i32 %index to i64
  %array_header_size = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %arr_gep = getelementptr inbounds i8, ptr addrspace(1) %0, i32 %array_header_size
  %val_gep = getelementptr inbounds i8, ptr addrspace(1) %arr_gep, i64 %index.64
  %value = load atomic i16, ptr addrspace(1) %val_gep unordered, align 2 ; %0 is at least align 2
  %value.32 = zext i16 %value to i32
  ret i32 %value.32
}

; Top-level _putCharStringU entry point: stores the low 16 bits of %value at
; char index %index of the array %arrayOop. The byte offset of the slot is
; @arrayOopDesc.byteArrayHeaderSize + (%index << 1).
define zing void @_putCharStringU(ptr addrspace(1) noundef %arrayOop, i32 noundef %index, i32 noundef %value) {
entry:
  %char.bits = trunc i32 %value to i16
  %hdr.bytes = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %data.start = getelementptr inbounds i8, ptr addrspace(1) %arrayOop, i32 %hdr.bytes
  ; Two bytes per char.
  %byte.index = shl i32 %index, 1
  %byte.index.wide = zext i32 %byte.index to i64
  %slot = getelementptr inbounds i8, ptr addrspace(1) %data.start, i64 %byte.index.wide
  ; %arrayOop is at least align 2
  store atomic i16 %char.bits, ptr addrspace(1) %slot unordered, align 2
  ret void
}

; Stub routine behind @_countPositives. Takes a pointer to the first byte to
; examine and an i64 length and returns an i32; declared with the
; zing_stub_default calling convention.
declare zing_stub_default i32 @"StubRoutines::countPositives()"(
  ptr addrspace(1) nonnull readonly %src_start, i64 %len)
  argmemonly willreturn "gc-leaf-function" nounwind

; Top-level _countPositives entry point: computes the address of element
; %ofs of the byte array %src (array header size + %ofs bytes into the
; object) and passes it, together with %len widened to i64, to the
; countPositives stub, returning the stub's result.
define zing i32 @_countPositives(ptr addrspace(1) nonnull %src, i32 %ofs, i32 %len)
  alwaysinline argmemonly willreturn "gc-leaf-function" nounwind {
entry:
  %hdr.bytes = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %data.start = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %hdr.bytes
  %ofs.wide = zext i32 %ofs to i64
  %first = getelementptr inbounds i8, ptr addrspace(1) %data.start, i64 %ofs.wide
  %len.wide = zext i32 %len to i64

  %count = call zing_stub_default i32 @"StubRoutines::countPositives()"(
            ptr addrspace(1) nonnull readonly %first, i64 %len.wide)
  ret i32 %count
}

; Runtime stubs used by @_vectorizedHashCode, one per element kind.  All five
; share one signature:
;   %array_start - pointer to the first element to hash (already past the
;                  array header)
;   %len         - length, passed as i64
;   %result      - initial hash value, passed as i64
; Each returns the hash as i32.
; NOTE(review): %len is presumably an element count rather than a byte
; count - confirm against the stub implementations.
declare zing_stub_default i32 @"StubRoutines::arraysHashCodeBoolean()"(
  ptr addrspace(1) nonnull readonly %array_start, i64 %len, i64 %result)
  argmemonly willreturn "gc-leaf-function" nounwind

declare zing_stub_default i32 @"StubRoutines::arraysHashCodeChar()"(
  ptr addrspace(1) nonnull readonly %array_start, i64 %len, i64 %result)
  argmemonly willreturn "gc-leaf-function" nounwind

declare zing_stub_default i32 @"StubRoutines::arraysHashCodeByte()"(
  ptr addrspace(1) nonnull readonly %array_start, i64 %len, i64 %result)
  argmemonly willreturn "gc-leaf-function" nounwind

declare zing_stub_default i32 @"StubRoutines::arraysHashCodeShort()"(
  ptr addrspace(1) nonnull readonly %array_start, i64 %len, i64 %result)
  argmemonly willreturn "gc-leaf-function" nounwind

declare zing_stub_default i32 @"StubRoutines::arraysHashCodeInt()"(
  ptr addrspace(1) nonnull readonly %array_start, i64 %len, i64 %result)
  argmemonly willreturn "gc-leaf-function" nounwind

; public static int vectorizedHashCode(Object array, int fromIndex, int length,
;                                      int initialValue, int basicType)
;
; Dispatches on %basic_type to the matching arraysHashCode stub and returns
; the stub's result.  The type codes tested are 4 (boolean), 5 (char),
; 8 (byte), 9 (short) and 10 (int); any other value ends up in
; @azul.throw_arrays_hashcode_illegal_argument_exception.
;
;   %array         - array object to hash
;   %from_index    - index of the first ELEMENT to hash
;   %length        - length forwarded to the stub (zero-extended)
;   %initial_value - initial hash value forwarded to the stub (zero-extended)
;   %basic_type    - element type code, see above
;
; %from_index counts elements, not bytes, so the start pointer given to the
; stub is computed per case with a GEP over the element type of that case
; (i8 for boolean/byte, i16 for char/short, i32 for int).  A single i8 GEP for
; all cases would point at the wrong element for every multi-byte element type
; as soon as %from_index is non-zero.
;
; NOTE(review): every case uses @arrayOopDesc.byteArrayHeaderSize as the
; payload offset, as the code always did - confirm the header size is the same
; for char/short/int arrays.
define zing i32 @_vectorizedHashCode(ptr addrspace(1) nonnull %array, i32 %from_index, i32 %length,
                                     i32 %initial_value, i32 %basic_type)
  "alwaysinline-top-level" {
entry:
  %byteArrayHeaderSize = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %array.start = getelementptr inbounds i8, ptr addrspace(1) %array, i32 %byteArrayHeaderSize
  %from_index.zext = zext i32 %from_index to i64
  %length.zext = zext i32 %length to i64
  %initial_value.zext = zext i32 %initial_value to i64

  %is_boolean_case = icmp eq i32 %basic_type, 4
  br i1 %is_boolean_case, label %boolean_case, label %check_char_case

boolean_case:
  ; 1-byte elements
  %array.gep.boolean = getelementptr inbounds i8, ptr addrspace(1) %array.start, i64 %from_index.zext
  %result_boolean = call zing_stub_default i32 @"StubRoutines::arraysHashCodeBoolean()"(
                    ptr addrspace(1) nonnull readonly %array.gep.boolean, i64 %length.zext, i64 %initial_value.zext)
  ret i32 %result_boolean

check_char_case:
  %is_char_case = icmp eq i32 %basic_type, 5
  br i1 %is_char_case, label %char_case, label %check_byte_case

char_case:
  ; 2-byte elements
  %array.gep.char = getelementptr inbounds i16, ptr addrspace(1) %array.start, i64 %from_index.zext
  %result_char = call zing_stub_default i32 @"StubRoutines::arraysHashCodeChar()"(
                 ptr addrspace(1) nonnull readonly %array.gep.char, i64 %length.zext, i64 %initial_value.zext)
  ret i32 %result_char

check_byte_case:
  %is_byte_case = icmp eq i32 %basic_type, 8
  br i1 %is_byte_case, label %byte_case, label %check_short_case

byte_case:
  ; 1-byte elements
  %array.gep.byte = getelementptr inbounds i8, ptr addrspace(1) %array.start, i64 %from_index.zext
  %byte_result = call zing_stub_default i32 @"StubRoutines::arraysHashCodeByte()"(
                 ptr addrspace(1) nonnull readonly %array.gep.byte, i64 %length.zext, i64 %initial_value.zext)
  ret i32 %byte_result

check_short_case:
  %is_short_case = icmp eq i32 %basic_type, 9
  br i1 %is_short_case, label %short_case, label %check_int_case

short_case:
  ; 2-byte elements
  %array.gep.short = getelementptr inbounds i16, ptr addrspace(1) %array.start, i64 %from_index.zext
  %short_result = call zing_stub_default i32 @"StubRoutines::arraysHashCodeShort()"(
                  ptr addrspace(1) nonnull readonly %array.gep.short, i64 %length.zext, i64 %initial_value.zext)
  ret i32 %short_result

check_int_case:
  %is_int_case = icmp eq i32 %basic_type, 10
  br i1 %is_int_case, label %int_case, label %exception_case

int_case:
  ; 4-byte elements
  %array.gep.int = getelementptr inbounds i32, ptr addrspace(1) %array.start, i64 %from_index.zext
  %int_result = call zing_stub_default i32 @"StubRoutines::arraysHashCodeInt()"(
                ptr addrspace(1) nonnull readonly %array.gep.int, i64 %length.zext, i64 %initial_value.zext)
  ret i32 %int_result

exception_case:
  ; Unsupported type code: hand the code to the runtime, which does not return here.
  %current.thread = call i64 @azul.get_current_thread()
  call void @azul.throw_arrays_hashcode_illegal_argument_exception(i64 %current.thread, i32 %basic_type) [ "deopt"() ]
  unreachable
}

; Runtime stub behind @_stringHashCode.  It receives a pointer to the first
; payload byte/char of the value array and a length as i64.
declare zing_stub_default i32 @"StubRoutines::stringHashCode()"(
  ptr addrspace(1) nonnull readonly %val, i64 %len)
  argmemonly willreturn "gc-leaf-function" nounwind

; Implementation of the java.lang.String.stringHashCode()
;   %val - value array object of the string
;   %len - length forwarded to the stub (zero-extended)
; Returns the stub's result unchanged.
define zing i32 @_stringHashCode(ptr addrspace(1) nonnull %val, i32 %len)
  alwaysinline argmemonly willreturn "gc-leaf-function" nounwind {
entry:
  %count.i64 = zext i32 %len to i64
  ; Step over the array header to reach the data.
  %header_bytes = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %payload = getelementptr i8, ptr addrspace(1) %val, i32 %header_bytes
  %hash = call zing_stub_default i32 @"StubRoutines::stringHashCode()"(
          ptr addrspace(1) nonnull readonly %payload, i64 %count.i64)
  ret i32 %hash
}

; Runtime stub used by @_compareTo for the cases it does not handle inline.
; Takes pointers to the first payload byte of both value arrays plus four
; i64 scalars, and returns the comparison result as i32.
; NOTE(review): the parameter names below list the coders before the lengths,
; but both call sites in @_compareTo pass the two lengths first and the two
; coders last.  Parameter names on a declaration carry no semantics, so one of
; the two is merely misleading - confirm the real order against the stub
; implementation and fix whichever side is wrong.
declare zing_stub_default i32 @"StubRoutines::stringCompare()"(
          ptr addrspace(1) readonly %left,
          ptr addrspace(1) readonly %right,
          i64 %coder1,
          i64 %coder2,
          i64 %length1,
          i64 %length2)
        mustprogress nofree nounwind argmemonly readonly willreturn norecurse nosync "gc-leaf-function" "azul-generatable"

; Runtime entry points that raise a Java exception on behalf of compiled code.
; All of them take an i64 as first argument; the callers in this file pass the
; result of @azul.get_current_thread() there, attach a "deopt" operand bundle
; to the call, and follow the call with `unreachable`.

; Set up pending NullPointerException and deoptimize caller
declare void @azul.throw_npe_and_deoptimize(i64) "consumes-caller-vmstate" "azul-deopt-on-throw"

; Set up pending InstantiationException and deoptimize caller
declare void @azul.throw_instantiation_exception_and_deoptimize(i64) "consumes-caller-vmstate" "azul-deopt-on-throw"

; Set up pending NegativeArraySizeException and deoptimize caller
declare void @azul.throw_nase_and_deoptimize(i64) "consumes-caller-vmstate" "azul-deopt-on-throw"

; Set up pending IllegalArgumentException with message for arrays_hashcode and deoptimize caller.
; The extra i32 is the offending basic-type code (see @_vectorizedHashCode).
declare void @azul.throw_arrays_hashcode_illegal_argument_exception(i64, i32) "consumes-caller-vmstate" "azul-deopt-on-throw"

; Implementation of the java.lang.String.compareTo()
;
;   %this - receiver string
;   %arg  - string to compare with; a null %arg raises NullPointerException
;           through @azul.throw_npe_and_deoptimize
;
; Outline of the paths below:
;   * %arg is the same object as %this, or both strings share the same value
;     array with equal length            -> 0
;   * @CompactStrings is false           -> StubRoutines::stringCompare()
;   * both coders are 0 (Latin1):
;       - shorter value array is empty   -> Len1 - Len2
;       - shorter value array <= 16 bytes, or the first 16 bytes differ
;                                        -> inline byte-by-byte loop
;       - otherwise                      -> StubRoutines::stringCompare()
;   * at least one coder is non-zero:
;       - shorter value array is empty   -> difference of the char lengths
;       - otherwise                      -> StubRoutines::stringCompare()
define zing i32 @_compareTo(ptr addrspace(1) %this, ptr addrspace(1) %arg)
  "alwaysinline-top-level" {
entry:
  %is_arg_null = icmp eq ptr addrspace(1) %arg, null
  br i1 %is_arg_null, label %null, label %check_same_obj

check_same_obj:
  %is_same_obj = icmp eq ptr addrspace(1) %this, %arg
  br i1 %is_same_obj, label %same_obj, label %load_data

same_obj:
  ret i32 0

load_data:
  ; Load both value arrays, their lengths (Len1/Len2 are array lengths, not
  ; char counts) and pointers to their first payload byte (a1/a2).
  %base_offset_in_bytes = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %value_offset_in_bytes = load i32, ptr @java_lang_String.value_offset_in_bytes
  %aStr1 = getelementptr inbounds i8, ptr addrspace(1)  %this, i32 %value_offset_in_bytes
  %aStr1.fixed  = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %aStr1)
  %aStr2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i32 %value_offset_in_bytes
  %aStr2.fixed  = call ptr addrspace(1) @azul.load_ref_p1(ptr addrspace(1) %aStr2)
  %Len1 = call i32 @azul.array_length(ptr addrspace(1) %aStr1.fixed)
  %Len2 = call i32 @azul.array_length(ptr addrspace(1) %aStr2.fixed)
  %a1 = getelementptr inbounds i8, ptr addrspace(1) %aStr1.fixed, i32 %base_offset_in_bytes
  %a2 = getelementptr inbounds i8, ptr addrspace(1) %aStr2.fixed, i32 %base_offset_in_bytes
  br label %check_same_sarray

check_same_sarray:
  ;check if strings have the same symbol arrays and length
  %is_same_ptr = icmp eq ptr addrspace(1) %a1, %a2
  %is_same_length = icmp eq i32 %Len1, %Len2
  %is_same_sarray = and i1 %is_same_ptr, %is_same_length 
  br i1 %is_same_sarray, label %same_array, label %check_strings_are_compact 

same_array:
  ret i32 0

check_strings_are_compact:
  %compactstrings  = load i1, ptr @CompactStrings
  br i1 %compactstrings, label %compact, label %non-compact 

compact:
  br label %check_coders

non-compact:
  ; Without compact strings the stub is called directly; the last two
  ; arguments (the coder slots used by call_stub below) are constant 0.
  %Len1.i64_nc = zext i32 %Len1 to i64
  %Len2.i64_nc = zext i32 %Len2 to i64
  %result_new_nc = call zing_stub_default i32 @"StubRoutines::stringCompare()"(ptr addrspace(1) readonly %a1,
          ptr addrspace(1) readonly %a2, i64 %Len1.i64_nc, i64 %Len2.i64_nc, i64 0, i64 0)
  ret i32 %result_new_nc

check_coders:
  %Coder1 = call i8 @azul.get_string_coder(ptr addrspace(1) %this)
  %Coder2 = call i8 @azul.get_string_coder(ptr addrspace(1) %arg)
  %minSlength = call i32 @llvm.umin.i32(i32 %Len1, i32 %Len2)
  %zero_min_length = icmp eq i32 %minSlength, 0
  ;use constants 0 for LATIN1 and 1 for UTF16 (according to String.java)
  ;use fast path for latin comparison, otherwise call stub
  ; %code is 0 only when both coders are 0, i.e. both strings are Latin1.
  %code = or i8 %Coder1, %Coder2
  %is_latin_case = icmp ne i8 %code, 1
  br i1 %is_latin_case, label %latin_case, label %no_latin_case

 no_latin_case:
  br i1 %zero_min_length, label %length_diff_label, label %call_stub

 latin_case:
  br i1 %zero_min_length, label %length_diff_label_latin, label %check_strings

 check_strings:
  ; for large strings go to stub (contains 16 byte hand made vectorization)
  ; otherwise make comparison in scalar loop
  %small_loop = icmp sle i32 %minSlength, 16
  br i1 %small_loop, label %small_loop_case, label %large_loop_case

 large_loop_case:
   ;check first 16 symbols of strings and go to scalar loop if they are not equal 
   ; (two 8-byte loads per string; reached only when both arrays hold more than 16 bytes)
   %E1 = load i64, ptr addrspace(1) %a1
   %E2 = load i64, ptr addrspace(1) %a2
   %equal_elements1 = icmp eq i64 %E1, %E2
   %a12 = getelementptr inbounds i8, ptr addrspace(1) %a1, i32 8
   %a22 = getelementptr inbounds i8, ptr addrspace(1) %a2, i32 8
   %E12 = load i64, ptr addrspace(1) %a12
   %E22 = load i64, ptr addrspace(1) %a22
   %equal_elements2 = icmp eq i64 %E12, %E22
   %cont_is_needed = and i1 %equal_elements1, %equal_elements2
   br i1 %cont_is_needed, label %call_stub, label %small_loop_case

 small_loop_case:
    %trip.count = zext i32 %minSlength to i64
    br label %for.cond 

 for.cond:
    ; Scalar loop over the first %minSlength bytes of both arrays; leaving it
    ; through the exit condition means the common prefix is identical.
    %index = phi i64 [ %index.next, %for.body ], [ 0, %small_loop_case ]
    %exitcond = icmp eq i64 %index, %trip.count
    br i1 %exitcond, label %length_diff_label_latin, label %for.body

 for.body:
    %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %a1, i64 %index
    %for.el1 = load i8, ptr addrspace(1)  %arrayidx, align 1
    %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %a2, i64 %index
    %for.el2 = load i8, ptr addrspace(1) %arrayidx2, align 1
    %symbols_are_equal = icmp eq i8 %for.el1, %for.el2
    %index.next = add nuw nsw i64 %index, 1
    br i1 %symbols_are_equal, label %for.cond, label %diff_chars

 diff_chars:
    ; First mismatch: the result is the difference of the two bytes, each
    ; treated as an unsigned value.
    %for.el1.i32 = zext i8 %for.el1 to i32
    %for.el2.i32 = zext i8 %for.el2 to i32
    %myres = sub i32 %for.el1.i32, %for.el2.i32
    ret i32 %myres

 ; Call of stub is expensive in performance sense. So stub is called for rare cases for large strings
 ; with coding.
 call_stub:
  %Coder1.i64 = zext i8 %Coder1 to i64
  %Coder2.i64 = zext i8 %Coder2 to i64
  %Len1.i64 = zext i32 %Len1 to i64
  %Len2.i64 = zext i32 %Len2 to i64
  %result_new = call zing_stub_default i32 @"StubRoutines::stringCompare()"(ptr addrspace(1) readonly %a1,
          ptr addrspace(1) readonly %a2, i64 %Len1.i64, i64 %Len2.i64, i64 %Coder1.i64, i64 %Coder2.i64) 
  ret i32 %result_new

 length_diff_label:
  ; One of the strings is empty and at least one has a non-zero coder:
  ; convert array lengths to char lengths (halved when the coder is 1)
  ; before subtracting.
  %Len1.utf = lshr i32 %Len1, 1
  %Len2.utf = lshr i32 %Len2, 1
  %utf1 = icmp eq i8 %Coder1, 1
  %utf2 = icmp eq i8 %Coder2, 1
  %Len1.str = select i1 %utf1, i32 %Len1.utf, i32 %Len1
  %Len2.str = select i1 %utf2, i32 %Len2.utf, i32 %Len2
  %length_diff = sub i32 %Len1.str, %Len2.str
  ret i32 %length_diff

 length_diff_label_latin:
  ; Both strings are Latin1, so array length equals char length.
  %length_diff_latin = sub i32 %Len1, %Len2
  ret i32 %length_diff_latin

null:
  %current_thread = call i64 @azul.get_current_thread()
  call void @azul.throw_npe_and_deoptimize(i64 %current_thread) [ "deopt"() ]
  unreachable
}

; Unsigned 32-bit minimum intrinsic (used by @_compareTo).
declare i32 @llvm.umin.i32(i32 %a, i32 %b)

; Substring search stub.  Takes pointers to the first payload byte of the
; searched array (%str1) and of the pattern array (%str2), their lengths,
; the start index and a coder-combination code, all as i64.
; See @indexOf_impl for the values passed as %coders.
declare zing_stub_default i32 @"StubRoutines::stringIndexOf()"(
          ptr addrspace(1) readonly %str1,
          ptr addrspace(1) readonly %str2,
          i64 %cnt1, i64 %cnt2, i64 %fromIndex, i64 %coders)
        nounwind argmemonly readonly willreturn "gc-leaf-function"

; Single-character search stub; @_indexOfU_char calls it with 16-bit data.
declare zing_stub_default i32 @"StubRoutines::stringIndexOfChar()"(
          ptr addrspace(1) readonly %str1,
          i64 %ch, i64 %fromIndex, i64 %cnt1)
        nounwind argmemonly readonly willreturn "gc-leaf-function"

; Single-character search stub; @_indexOfL_char and @indexOf_impl call it
; with 8-bit (Latin1) data.
declare zing_stub_default i32 @"StubRoutines::stringIndexOfLChar()"(
          ptr addrspace(1) readonly %str1,
          i64 %ch, i64 %fromIndex, i64 %cnt1)
        nounwind argmemonly readonly willreturn "gc-leaf-function"

; Helper function for _indexOf, _indexOfI intrinsics to avoid code duplication.
; Because of internal attribute this function is deleted from boilerplate module after inlining
;
;   %this      - receiver string
;   %substr    - string to search for; null raises NullPointerException via
;                @azul.throw_npe_and_deoptimize
;   %fromIndex - char index to start searching at; negative values are
;                treated as 0
;
; Returns the index found by the search stub, or
;   * the char length of %this when %fromIndex >= that length and %substr is
;     empty,
;   * %fromIndex (clamped to 0) when %substr is empty,
;   * -1 when no match is possible.
;
; Lengths named *_value_len are lengths of the backing value arrays; lengths
; named *_char_len are char counts (value length shifted right by the coder).
; %fromIndex is a char index, so it is validated against %this_char_len.
; Comparing it against %this_value_len would, for a UTF16 receiver, let an
; index in [char length, array length) through and return that index instead
; of the string length for an empty %substr.
;
; NOTE(review): @arrayOopDesc.charArrayHeaderSize is used as the payload
; offset even on the byte[] paths - confirm both header sizes are equal.
define internal i32 @indexOf_impl(ptr addrspace(1) %this, ptr addrspace(1) %substr, i32 %fromIndex)
  alwaysinline  {
entry:
  %charArrayHeaderSize = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %is_substr_null = icmp eq ptr addrspace(1) %substr, null
  br i1 %is_substr_null, label %null, label %not_null

not_null:
  ; First string value
  %this_value = call ptr addrspace(1) @azul.get_string_value(ptr addrspace(1) %this)
  %this_value_len = call i32 @azul.array_length(ptr addrspace(1) %this_value)

  %this_coder = call i8 @azul.get_string_coder(ptr addrspace(1) %this)
  ;coder is 0 for LATIN1 and 1 for UTF16 (according to String.java)
  %this_coder.i32 = zext i8 %this_coder to i32
  %this_char_len = lshr i32 %this_value_len, %this_coder.i32

  ; Second string value
  %substr_value = call ptr addrspace(1) @azul.get_string_value(ptr addrspace(1) %substr)
  %substr_value_len = call i32 @azul.array_length(ptr addrspace(1) %substr_value)
  %is_substr_empty = icmp eq i32 %substr_value_len, 0

  ; %fromIndex is in chars: compare with the char length, not the array length.
  %normal_fromIndex = icmp slt i32 %fromIndex, %this_char_len
  br i1 %normal_fromIndex, label %check_neg_index, label %large_fromIndex

check_neg_index:
  ; Clamp a negative start index to 0.
  %is_fromIndex_gte_0 = icmp sge i32 %fromIndex, 0
  %fromIndex.checked = select i1 %is_fromIndex_gte_0, i32 %fromIndex, i32 0
  br i1 %is_substr_empty, label %empty_substr, label %substr_not_empty

substr_not_empty:
  ; Cheap reject on array lengths before the coder of %substr is loaded.
  %is_substr_longer = icmp ugt i32 %substr_value_len, %this_value_len
  br i1 %is_substr_longer, label %failure, label %substr_shorter

substr_shorter:
  %substr_coder = call i8 @azul.get_string_coder(ptr addrspace(1) %substr)
  %substr_coder.i32 =  zext i8 %substr_coder to i32
  %substr_char_len = lshr i32 %substr_value_len, %substr_coder.i32

  ; Both addends are non-negative, so the sum cannot wrap as an unsigned
  ; value but may exceed INT_MAX; compare unsigned so that such a sum is not
  ; mistaken for a negative number.
  %from_index_substr_len = add i32 %substr_char_len, %fromIndex.checked
  %is_shorter_or_eq = icmp uge i32 %this_char_len, %from_index_substr_len
  br i1 %is_shorter_or_eq, label %runtime_query, label %failure

runtime_query:
  %this_value_len.zext = zext i32 %this_value_len to i64
  %substr_value_len.zext = zext i32 %substr_value_len to i64
  %fromIndex.zext = zext i32 %fromIndex.checked to i64

  ; Based on java version and CompactStrings flag we determine whether we have byte[]
  ; or char[] internal string representation.
  %jdk_version = load i32, ptr @JDK_VERSION_MAJOR
  %is_byte_string = icmp ugt i32 %jdk_version, 8
  br i1 %is_byte_string, label %byte_case, label %check_for_cs

check_for_cs:
  %compact_strings = load i1, ptr @CompactStrings
  br i1 %compact_strings, label %byte_case, label %char_case

char_case:
  ; char[] representation: the stub is called with coders == 0.
  %this_value_start_char = getelementptr i8, ptr addrspace(1) %this_value, i32 %charArrayHeaderSize
  %substr_value_start_char = getelementptr i8, ptr addrspace(1) %substr_value, i32 %charArrayHeaderSize
  %char_result = call zing_stub_default i32 @"StubRoutines::stringIndexOf()"(
           ptr addrspace(1) readonly %this_value_start_char,
           ptr addrspace(1) readonly %substr_value_start_char,
           i64 %this_value_len.zext, i64 %substr_value_len.zext, i64 %fromIndex.zext, i64 0)
  ret i32 %char_result

byte_case:
  %this_is_latin = icmp eq i8 %this_coder, 0
  br i1 %this_is_latin, label %this_latin, label %call_indexOf

this_latin:
  ; A Latin1 receiver is only searched for a Latin1 substring; a substring
  ; with a non-zero coder is reported as not found.
  %substr_is_latin = icmp eq i8 %substr_coder, 0
  br i1 %substr_is_latin, label %latin_case, label %failure

latin_case:
  ; A one-byte Latin1 pattern is handled by the single-character stub.
  %is_symbol_search = icmp eq i32 %substr_value_len, 1
  br i1 %is_symbol_search, label %call_indexOfLChar, label %call_indexOf

call_indexOf:
  ;coders: LL : 0, UL : 1, UU : 2
  ;LU case is impossible (label this_latin)
  %coders = add nsw i8 %this_coder, %substr_coder
  %coders.zext = zext i8 %coders to i64
  %this_value_start = getelementptr i8, ptr addrspace(1) %this_value, i32 %charArrayHeaderSize
  %substr_value_start = getelementptr i8, ptr addrspace(1) %substr_value, i32 %charArrayHeaderSize

  %result = call zing_stub_default i32 @"StubRoutines::stringIndexOf()"(
          ptr addrspace(1) readonly %this_value_start,
          ptr addrspace(1) readonly %substr_value_start,
          i64 %this_value_len.zext, i64 %substr_value_len.zext, i64 %fromIndex.zext,
          i64 %coders.zext)
  ret i32 %result

call_indexOfLChar:
  ; Load the single pattern byte and search for it directly.
  %this_array_latin = getelementptr i8, ptr addrspace(1) %this_value, i32 %charArrayHeaderSize
  %substr_array_latin = getelementptr i8, ptr addrspace(1) %substr_value, i32 %charArrayHeaderSize
  %ch_latin = load i8, ptr addrspace(1) %substr_array_latin
  %ch_latin.64 = zext i8 %ch_latin to i64
  %result_latinc = call zing_stub_default i32 @"StubRoutines::stringIndexOfLChar()"(
           ptr addrspace(1) readonly %this_array_latin,
           i64 %ch_latin.64, i64 %fromIndex.zext, i64 %this_value_len.zext)
  ret i32 %result_latinc

large_fromIndex:
  ; Start index at or beyond the end: an empty pattern matches at the end of
  ; the string, anything else is not found.
  %large_index_ret = select i1 %is_substr_empty, i32 %this_char_len, i32 -1
  ret i32 %large_index_ret

empty_substr:
  ret i32 %fromIndex.checked

failure:
  ret i32 -1

null:
  %current_thread = call i64 @azul.get_current_thread()
  call void @azul.throw_npe_and_deoptimize(i64 %current_thread) [ "deopt"() ]
  unreachable
}

; java.lang.String.indexOf(String)
; Forwards to @indexOf_impl with a start index of 0.
define zing i32 @_indexOf(ptr addrspace(1) %this, ptr addrspace(1) %substr)
  "alwaysinline-top-level" {
entry:
  %found_at = call i32 @indexOf_impl(ptr addrspace(1) %this, ptr addrspace(1) %substr, i32 0)
  ret i32 %found_at
}

; java.lang.String.indexOf(String, int fromIndex)
; Forwards all three arguments to @indexOf_impl unchanged.
define zing i32 @_indexOfI(ptr addrspace(1) %this, ptr addrspace(1) %substr, i32 %fromIndex)
  "alwaysinline-top-level" {
entry:
  %found_at = call i32 @indexOf_impl(ptr addrspace(1) %this, ptr addrspace(1) %substr, i32 %fromIndex)
  ret i32 %found_at
}

; Signed 32-bit addition with overflow detection.
; Returns %first + %second when the sum fits in i32.  On overflow it calls
; @llvm.experimental.deoptimize with the reason loaded from
; @DeoptReasons.Reason_unhandled and returns that call's result.
define zing i32 @_addExactI(
    i32 %first,
    i32 %second) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %checked = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %first, i32 %second)
  %overflowed = extractvalue {i32, i1} %checked, 1
  br i1 %overflowed, label %deopt, label %in_range

in_range:
  %value = extractvalue {i32, i1} %checked, 0
  ret i32 %value

deopt:
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %trap = call zing_uncommon_trap i32(...) @llvm.experimental.deoptimize.isI32(i32 %reason) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i32 %trap
}

declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone

; Signed 64-bit addition with overflow detection.
; Returns %first + %second when the sum fits in i64.  On overflow it calls
; @llvm.experimental.deoptimize with the reason loaded from
; @DeoptReasons.Reason_unhandled and returns that call's result.
define zing i64 @_addExactL(
    i64 %first,
    i64 %second) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %checked = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %first, i64 %second)
  %overflowed = extractvalue {i64, i1} %checked, 1
  br i1 %overflowed, label %deopt, label %in_range

in_range:
  %value = extractvalue {i64, i1} %checked, 0
  ret i64 %value

deopt:
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %trap = call zing_uncommon_trap i64(...) @llvm.experimental.deoptimize.isI64(i32 %reason) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i64 %trap
}

declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone

; Signed 32-bit subtraction with overflow detection.
; Returns %first - %second when the difference fits in i32.  On overflow it
; calls @llvm.experimental.deoptimize with the reason loaded from
; @DeoptReasons.Reason_unhandled and returns that call's result.
define zing i32 @_subtractExactI(
    i32 %first,
    i32 %second) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %checked = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %first, i32 %second)
  %overflowed = extractvalue {i32, i1} %checked, 1
  br i1 %overflowed, label %deopt, label %in_range

in_range:
  %value = extractvalue {i32, i1} %checked, 0
  ret i32 %value

deopt:
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %trap = call zing_uncommon_trap i32(...) @llvm.experimental.deoptimize.isI32(i32 %reason) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i32 %trap
}

declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone

; Signed 64-bit subtraction with overflow detection.
; Returns %first - %second when the difference fits in i64.  On overflow it
; calls @llvm.experimental.deoptimize with the reason loaded from
; @DeoptReasons.Reason_unhandled and returns that call's result.
define zing i64 @_subtractExactL(
    i64 %first,
    i64 %second) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %checked = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %first, i64 %second)
  %overflowed = extractvalue {i64, i1} %checked, 1
  br i1 %overflowed, label %deopt, label %in_range

in_range:
  %value = extractvalue {i64, i1} %checked, 0
  ret i64 %value

deopt:
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %trap = call zing_uncommon_trap i64(...) @llvm.experimental.deoptimize.isI64(i32 %reason) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i64 %trap
}

declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone

; Signed 32-bit multiplication with overflow detection.
; Returns %first * %second when the product fits in i32.  On overflow it
; calls @llvm.experimental.deoptimize with the reason loaded from
; @DeoptReasons.Reason_unhandled and returns that call's result.
define zing i32 @_multiplyExactI(
    i32 %first,
    i32 %second) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %checked = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %first, i32 %second)
  %overflowed = extractvalue {i32, i1} %checked, 1
  br i1 %overflowed, label %deopt, label %in_range

in_range:
  %value = extractvalue {i32, i1} %checked, 0
  ret i32 %value

deopt:
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %trap = call zing_uncommon_trap i32(...) @llvm.experimental.deoptimize.isI32(i32 %reason) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i32 %trap
}

declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone

; Signed 64-bit multiplication with overflow detection.
; Returns %first * %second when the product fits in i64.  On overflow it
; calls @llvm.experimental.deoptimize with the reason loaded from
; @DeoptReasons.Reason_unhandled and returns that call's result.
define zing i64 @_multiplyExactL(
    i64 %first,
    i64 %second) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %checked = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %first, i64 %second)
  %overflowed = extractvalue {i64, i1} %checked, 1
  br i1 %overflowed, label %deopt, label %in_range

in_range:
  %value = extractvalue {i64, i1} %checked, 0
  ret i64 %value

deopt:
  %reason = load i32, ptr @DeoptReasons.Reason_unhandled
  %trap = call zing_uncommon_trap i64(...) @llvm.experimental.deoptimize.isI64(i32 %reason) "azul-need-deopt-before-call" "azul-allow-gcptrs-in-regs" "deopt-lowering"="live-in" [ "deopt"() ]
  ret i64 %trap
}

; Runtime stub used by @_encodeByteISOArray and @_encodeISOArray.  It takes
; raw pointers to the first source and destination element and a length.
declare zing_stub_default i32 @"StubRoutines::encodeISOArray()"(
        ptr addrspace(1) readonly %src, ptr addrspace(1) %dst, i64 %len)
    nounwind argmemonly willreturn "gc-leaf-function"

; Variant of the ISO encoder whose source is a byte array holding 16-bit
; chars.
;   %sa  - source byte array; %sp indexes it in 2-byte units
;   %da  - destination byte array; %dp indexes it in bytes
;   %len - length forwarded to the stub
; No bounds are checked here; as the original comment notes, the Java caller
; has already validated the arguments, which is also why zero-extension of
; the i32 values is acceptable.
define zing i32 @_encodeByteISOArray(
    ptr addrspace(1) nonnull %sa,
    i32 %sp,
    ptr addrspace(1) nonnull %da,
    i32 %dp,
    i32 %len) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  ; Widen the scalar arguments first.
  %src.index = zext i32 %sp to i64
  %dst.index = zext i32 %dp to i64
  %count = zext i32 %len to i64
  ; Both arrays are byte arrays, so one header size serves both.
  %header_bytes = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %src.payload = getelementptr inbounds i8, ptr addrspace(1) %sa, i32 %header_bytes
  %dst.payload = getelementptr inbounds i8, ptr addrspace(1) %da, i32 %header_bytes
  ; Source offset is scaled by 2 (i16 GEP), destination offset by 1.
  %src.first = getelementptr inbounds i16, ptr addrspace(1) %src.payload, i64 %src.index
  %dst.first = getelementptr inbounds i8, ptr addrspace(1) %dst.payload, i64 %dst.index
  %encoded = call zing_stub_default i32 @"StubRoutines::encodeISOArray()"(
             ptr addrspace(1) %src.first, ptr addrspace(1) %dst.first, i64 %count)
  ret i32 %encoded
}

;; int implEncodeISOArray(char[] sa, int sp, byte[] da, int dp, int len)
;;
;; Computes the address of source element %srcOff (2-byte elements) and of
;; destination element %dstOff (1-byte elements), then hands both pointers
;; and %len to StubRoutines::encodeISOArray() and returns the stub's result.
;; All i32 arguments are zero-extended, i.e. treated as non-negative.
define zing i32 @_encodeISOArray(
    ptr addrspace(1) nonnull %src,
    i32 %srcOff,
    ptr addrspace(1) nonnull %dst,
    i32 %dstOff,
    i32 %len) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  ; Source: char array payload + srcOff * 2
  %src.header = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %src.payload = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %src.header
  %src.index = zext i32 %srcOff to i64
  %src.first = getelementptr inbounds i16, ptr addrspace(1) %src.payload, i64 %src.index

  ; Destination: byte array payload + dstOff
  %dst.header = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %dst.payload = getelementptr inbounds i8, ptr addrspace(1) %dst, i32 %dst.header
  %dst.index = zext i32 %dstOff to i64
  %dst.first = getelementptr inbounds i8, ptr addrspace(1) %dst.payload, i64 %dst.index

  %count = zext i32 %len to i64
  %encoded = call zing_stub_default i32 @"StubRoutines::encodeISOArray()"(
             ptr addrspace(1) %src.first, ptr addrspace(1) %dst.first, i64 %count)
  ret i32 %encoded
}

; Runtime stub behind @_encodeAsciiArray; same shape as the ISO encoder stub:
; source pointer, destination pointer, length.
declare zing_stub_default i32 @"StubRoutines::encodeAsciiArray()"(
        ptr addrspace(1) readonly %src, ptr addrspace(1) %dst, i64 %len)
    nounwind argmemonly willreturn "gc-leaf-function"

;; int implEncodeAsciiArray(char[] sa, int sp, byte[] da, int dp, int len)
;;
;; Computes the address of source element %srcOff (2-byte elements) and of
;; destination element %dstOff (1-byte elements), then hands both pointers
;; and %len to StubRoutines::encodeAsciiArray() and returns the stub's result.
;; All i32 arguments are zero-extended, i.e. treated as non-negative.
define zing i32 @_encodeAsciiArray(
    ptr addrspace(1) nonnull %src,
    i32 %srcOff,
    ptr addrspace(1) nonnull %dst,
    i32 %dstOff,
    i32 %len) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  ; Source: char array payload + srcOff * 2
  %src.header = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %src.payload = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %src.header
  %src.index = zext i32 %srcOff to i64
  %src.first = getelementptr inbounds i16, ptr addrspace(1) %src.payload, i64 %src.index

  ; Destination: byte array payload + dstOff
  %dst.header = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %dst.payload = getelementptr inbounds i8, ptr addrspace(1) %dst, i32 %dst.header
  %dst.index = zext i32 %dstOff to i64
  %dst.first = getelementptr inbounds i8, ptr addrspace(1) %dst.payload, i64 %dst.index

  %count = zext i32 %len to i64
  %encoded = call zing_stub_default i32 @"StubRoutines::encodeAsciiArray()"(
             ptr addrspace(1) %src.first, ptr addrspace(1) %dst.first, i64 %count)
  ret i32 %encoded
}

; Runtime stub shared by @_compressStringC and @_compressStringB: source
; pointer, destination pointer, length.
declare zing_stub_default i32 @"StubRoutines::compressStringC()"(
        ptr addrspace(1) readonly %src, ptr addrspace(1) %dst, i64 %len)
    nounwind argmemonly willreturn "gc-leaf-function"

;; int StringUTF16.compressImpl(char[] src, int srcOff, byte[] dst, int dstOff, int len)
;;
;; Computes the address of source element %srcOff (2-byte elements) and of
;; destination element %dstOff (1-byte elements), then hands both pointers
;; and %len to StubRoutines::compressStringC() and returns the stub's result.
;; All i32 arguments are zero-extended, i.e. treated as non-negative.
define zing i32 @_compressStringC(
    ptr addrspace(1) nonnull %src,
    i32 %srcOff,
    ptr addrspace(1) nonnull %dst,
    i32 %dstOff,
    i32 %len) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  ; Source: char array payload + srcOff * 2
  %src.header = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %src.payload = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %src.header
  %src.index = zext i32 %srcOff to i64
  %src.first = getelementptr inbounds i16, ptr addrspace(1) %src.payload, i64 %src.index

  ; Destination: byte array payload + dstOff
  %dst.header = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %dst.payload = getelementptr inbounds i8, ptr addrspace(1) %dst, i32 %dst.header
  %dst.index = zext i32 %dstOff to i64
  %dst.first = getelementptr inbounds i8, ptr addrspace(1) %dst.payload, i64 %dst.index

  %count = zext i32 %len to i64
  %compressed = call zing_stub_default i32 @"StubRoutines::compressStringC()"(
                ptr addrspace(1) %src.first, ptr addrspace(1) %dst.first, i64 %count)
  ret i32 %compressed
}

;; int StringUTF16.compressImplByte(byte[] src, int srcOff, byte[] dst, int dstOff, int len)
;;
;; Same as @_compressStringC, except that the source is a byte array.  Both
;; arrays therefore use the byte-array header size, and the same
;; StubRoutines::compressStringC() stub is called.
define zing i32 @_compressStringB(
    ptr addrspace(1) nonnull %src,
    i32 %srcOff,
    ptr addrspace(1) nonnull %dst,
    i32 %dstOff,
    i32 %len) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  %header_bytes = load i32, ptr @arrayOopDesc.byteArrayHeaderSize

  ;; According to the Java API srcOff is an offset in characters (not bytes),
  ;; so the source GEP uses i16 as element type to scale the offset by 2.
  %src.payload = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %header_bytes
  %src.index = zext i32 %srcOff to i64
  %src.first = getelementptr inbounds i16, ptr addrspace(1) %src.payload, i64 %src.index

  ;; dstOff is a plain byte offset.
  %dst.payload = getelementptr inbounds i8, ptr addrspace(1) %dst, i32 %header_bytes
  %dst.index = zext i32 %dstOff to i64
  %dst.first = getelementptr inbounds i8, ptr addrspace(1) %dst.payload, i64 %dst.index

  %count = zext i32 %len to i64
  %compressed = call zing_stub_default i32 @"StubRoutines::compressStringC()"(
                ptr addrspace(1) %src.first, ptr addrspace(1) %dst.first, i64 %count)
  ret i32 %compressed
}

; Runtime stub behind @_inflateStringC: source pointer, destination pointer,
; length.  Returns nothing.
declare zing_stub_default void @"StubRoutines::inflateStringC()"(
        ptr addrspace(1) readonly %src, ptr addrspace(1) %dst, i64 %len)
    nounwind argmemonly willreturn "gc-leaf-function"

;; void StringLatin1.inflateImpl(byte[] src, int srcOff, char[] dst, int dstOff, int len)
;;
;; Computes the address of source element %srcOff (1-byte elements) and of
;; destination element %dstOff (2-byte elements), then hands both pointers
;; and %len to StubRoutines::inflateStringC().  Nothing is returned.
;; All i32 arguments are zero-extended, i.e. treated as non-negative.
define zing void @_inflateStringC(
    ptr addrspace(1) nonnull %src,
    i32 %srcOff,
    ptr addrspace(1) nonnull %dst,
    i32 %dstOff,
    i32 %len) alwaysinline nounwind argmemonly willreturn "gc-leaf-function" {
entry:
  ; Source: byte array payload + srcOff
  %src.header = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %src.payload = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %src.header
  %src.index = zext i32 %srcOff to i64
  %src.first = getelementptr inbounds i8, ptr addrspace(1) %src.payload, i64 %src.index

  ; Destination: char array payload + dstOff * 2
  %dst.header = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %dst.payload = getelementptr inbounds i8, ptr addrspace(1) %dst, i32 %dst.header
  %dst.index = zext i32 %dstOff to i64
  %dst.first = getelementptr inbounds i16, ptr addrspace(1) %dst.payload, i64 %dst.index

  %count = zext i32 %len to i64
  call zing_stub_default void @"StubRoutines::inflateStringC()"(
           ptr addrspace(1) %src.first, ptr addrspace(1) %dst.first, i64 %count)
  ret void
}

; The method String.indexOf(int ch) is implemented as a call to
; String.indexOf(ch, 0), so it is enough to intrinsify only the methods below.
;
; JDK8 uses a char[] internal string value representation by default, but Zing
; provides the same String classes as JDK >= 9, with extensions that can be
; enabled by -XX:+CompactStrings. So we do not intrinsify the char[] case.

; StringLatin1.indexOfChar(byte[] value, int ch, int fromIndex, int max)
;
; Searches the bytes [%fromIndex, %max) of %value for the low 8 bits of %ch.
; Returns the index of the first match, or -1 when %max is 0 or no byte
; matches.  Arrays with %max >= 16 are delegated to
; StubRoutines::stringIndexOfLChar(); shorter ones are scanned inline.
; NOTE(review): the inline loop reads the byte at %fromIndex before testing
; the bound, so it presumably relies on the caller guaranteeing
; 0 <= %fromIndex < %max - confirm against the Java caller.
define zing i32 @_indexOfL_char(ptr addrspace(1) %value, i32 %ch, i32 %fromIndex, i32 %max)
  nounwind readonly alwaysinline {
entry:
  %nothing_to_scan = icmp eq i32 %max, 0
  br i1 %nothing_to_scan, label %miss, label %search

search:
  %header_bytes = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %data = getelementptr i8, ptr addrspace(1) %value, i32 %header_bytes
  %first = zext i32 %fromIndex to i64
  %limit = zext i32 %max to i64
  %needle = trunc i32 %ch to i8

  ; Short strings are considered to be less than 16 chars. Making the
  ; threshold a command line option would cost one more load here and would
  ; eat the gain of treating short strings separately.
  %scan_inline = icmp ult i32 %max, 16
  br i1 %scan_inline, label %scan, label %use_stub

use_stub:
  %needle.i64 = zext i32 %ch to i64
  %stub_index = call zing_stub_default i32 @"StubRoutines::stringIndexOfLChar()"(
           ptr addrspace(1) readonly %data,
           i64 %needle.i64, i64 %first, i64 %limit)
  ret i32 %stub_index

scan:
  %pos = phi i64 [ %first, %search ], [ %pos.next, %advance ]
  %slot = getelementptr inbounds i8, ptr addrspace(1) %data, i64 %pos
  %current = load atomic i8, ptr addrspace(1) %slot unordered, align 1
  %hit = icmp eq i8 %needle, %current
  br i1 %hit, label %match, label %advance

advance:
  %pos.next = add nsw i64 %pos, 1
  %exhausted = icmp eq i64 %pos.next, %limit
  br i1 %exhausted, label %miss, label %scan, !llvm.loop !28

miss:
  ret i32 -1

match:
  %match_index = trunc i64 %pos to i32
  ret i32 %match_index
}

; Intrinsic for StringUTF16.indexOfChar(byte[] value, int ch, int fromIndex, int max).
;
; Looks for the low 16 bits of %ch among the i16 elements of %value, starting
; at element %fromIndex and stopping before element %max.  Returns the index
; of the first match, or -1 when %max is zero or no element matches.
; NOTE(review): the inline loop only exits on a match or when the index becomes
; exactly %max, so it presumably relies on the caller passing
; 0 <= %fromIndex < %max -- confirm against the call sites.
define zing i32 @_indexOfU_char(ptr addrspace(1) %value, i32 %ch, i32 %fromIndex, i32 %max)
  nounwind readonly alwaysinline {
entry:
  %nothing_to_scan = icmp eq i32 %max, 0
  br i1 %nothing_to_scan, label %miss, label %setup

setup:
  ; Address of element 0: skip the array header.
  %hdr_bytes = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %payload = getelementptr i8, ptr addrspace(1) %value, i32 %hdr_bytes
  %first = zext i32 %fromIndex to i64
  %limit = zext i32 %max to i64

  ; Anything shorter than 16 chars counts as a "short string" and is scanned
  ; by the inline loop; everything else goes to the stub.  The threshold is a
  ; literal on purpose: reading it from a command line option would cost one
  ; more load here and eat the gain of having a separate short-string path.
  %scan_inline = icmp ult i32 %max, 16
  br i1 %scan_inline, label %scan_preheader, label %use_stub

use_stub:
  %needle.wide = zext i32 %ch to i64
  %stub_index = call zing_stub_default i32 @"StubRoutines::stringIndexOfChar()"(
           ptr addrspace(1) readonly %payload,
           i64 %needle.wide, i64 %first, i64 %limit)
  ret i32 %stub_index

scan_preheader:
  ; The narrowed needle is only needed by the inline loop.
  %needle = trunc i32 %ch to i16
  br label %scan

scan:
  %pos = phi i64 [ %first, %scan_preheader ], [ %pos.next, %advance ]
  %elem_addr = getelementptr inbounds i16, ptr addrspace(1) %payload, i64 %pos
  %elem = load atomic i16, ptr addrspace(1) %elem_addr unordered, align 2 ; %value is at least align 2
  %matches = icmp eq i16 %needle, %elem
  br i1 %matches, label %hit, label %advance

advance:
  %pos.next = add nsw i64 %pos, 1
  %reached_limit = icmp eq i64 %pos.next, %limit
  br i1 %reached_limit, label %miss, label %scan, !llvm.loop !28

miss:
  ret i32 -1

hit:
  %hit_index = trunc i64 %pos to i32
  ret i32 %hit_index
}

; java.lang.String.lastIndexOf(int, int)
; Added for the sake of enabling loop unrolling, which is proven to give good
; performance with this routine. Required for the customer case (see 14493 zilla).
; The intrinsic can be enabled by the experimental +FalconSpecialStringLastIndexOf
; flag.
;
; Searches backwards from min(%fromIndex, length - 1) down to index 0, reading
; the string's value array as i16 elements and comparing each one against the
; low 16 bits of %ch.  Returns the index of the first match found, or -1 when
; the starting index is negative or nothing matches.
; NOTE(review): the function is marked readonly, yet the no_match block calls
; @onSpinWait(), which expands to a pause/hint intrinsic -- confirm that this
; attribute combination is intended.
define zing i32 @_lastIndexOfImpl(ptr addrspace(1) %this, i32 %ch, i32 %fromIndex)
  nounwind readonly alwaysinline {
entry:
  ; Locate the backing array of the string and the address of its element 0.
  %this_value = call ptr addrspace(1) @azul.get_string_value(ptr addrspace(1) %this)
  %this_value_len = call i32 @azul.array_length(ptr addrspace(1) %this_value)
  %charArrayHeaderSize = load i32, ptr @arrayOopDesc.charArrayHeaderSize
  %this_value_start = getelementptr i8, ptr addrspace(1) %this_value, i32 %charArrayHeaderSize
  %this_value_len_sub_1 = add nsw i32 %this_value_len, -1

  ; check length and start index
  ; %min = min(%fromIndex, length - 1); the search runs only if %min >= 0,
  ; which also rejects an empty array (length - 1 == -1).
  %is_lesser = icmp slt i32 %fromIndex, %this_value_len_sub_1
  %min = select i1 %is_lesser, i32 %fromIndex, i32 %this_value_len_sub_1
  %is_greater_than_0 = icmp sge i32 %min, 0
  br i1 %is_greater_than_0, label %loop_preheader, label %no_match

loop_preheader:
  ; %min is known to be non-negative here, so zero extension is exact.
  %char_trunc = trunc i32 %ch to i16
  %min.zext = zext i32 %min to i64
  br label %loop_start

loop_start:
  ; %ind_var counts down from %min.zext to 0.
  %ind_var = phi i64 [ %min.zext, %loop_preheader ], [ %ind_var.next, %latch ]
  %char_gep = getelementptr inbounds i16, ptr addrspace(1) %this_value_start, i64 %ind_var
  %char = load atomic i16, ptr addrspace(1) %char_gep unordered, align 2
  %is_eq = icmp eq i16 %char_trunc, %char
  br i1 %is_eq, label %exit, label %latch

latch:
  ; Leave the loop once the element at index 0 has been examined.
  %ind_var.next = add nsw i64 %ind_var, -1
  %exit_cond = icmp slt i64 %ind_var, 1
  br i1 %exit_cond, label %no_match, label %loop_start, !llvm.loop !28

no_match:
  ; Below call is a workaround to prevent this BB from merging.
  ; We don't want this merged since it is latch exit BB,
  ; if it would be merged with some other BB having more
  ; predecessors (like 'exit' BB below) then loop unrolling
  ; will give up on this loop due to latch exit having
  ; more than 1 predecessors.
  call void @onSpinWait() nounwind
  br label %exit

exit:
  ; -1 when coming from no_match, otherwise the index of the matching element.
  %incoming = phi i64 [ -1, %no_match], [ %ind_var, %loop_start ]
  %ret = trunc i64 %incoming to i32
  ret i32 %ret
}

; Adler32 update stub, declared once per address space of the source pointer:
; the ".p0" variant takes a pointer in the default address space, the ".p1"
; variant a pointer in address space 1.  Both take the running checksum and
; the byte count as i64 and return the new checksum as i32.
declare i32 @"StubRoutines::updateBytesAdler32().p0"(
                                  i64 %adler, 
                                  ptr %src_start, 
                                  i64 %length) argmemonly readonly nounwind willreturn "gc-leaf-function"
declare i32 @"StubRoutines::updateBytesAdler32().p1"(
                                  i64 %adler, 
                                  ptr addrspace(1) %src_start, 
                                  i64 %length) argmemonly readonly nounwind willreturn "gc-leaf-function"

; Intrinsic for int java.util.zip.Adler32.updateBytes(int crc, byte[] buf, int off, int len).
;
; Computes the address of buf[off] (array header skipped) and hands it, with
; the zero-extended checksum and length, to the address space 1 variant of the
; Adler32 stub.  No range checking of %off / %len is done here.
define zing i32 @_updateBytesAdler32(i32 %adler, ptr addrspace(1) %b,
       i32 %off, i32 %len)
  argmemonly readonly nounwind "alwaysinline-top-level"
{
entry:
  ; Stub arguments are 64 bits wide.
  %checksum.in = zext i32 %adler to i64
  %byte_count = zext i32 %len to i64

  ; &b[off] = b + header size + off
  %hdr_bytes = load i32, ptr @arrayOopDesc.byteArrayHeaderSize
  %elements = getelementptr i8, ptr addrspace(1) %b, i32 %hdr_bytes
  %first_byte = getelementptr i8, ptr addrspace(1) %elements, i32 %off

  %checksum.out = call i32 @"StubRoutines::updateBytesAdler32().p1"(i64 %checksum.in,
       ptr addrspace(1) %first_byte, i64 %byte_count)
  ret i32 %checksum.out
}

; Intrinsic for int java.util.zip.Adler32.updateByteBuffer(int crc, long buf, int off, int len).
;
; %src is a raw address passed as an integer; the bytes to checksum start at
; %src + sext(%off).  The zero-extended checksum and length are forwarded to
; the default address space variant of the Adler32 stub.
;
; This function is deliberately NOT marked argmemonly: it has no pointer-typed
; argument (the address is materialized with inttoptr), so argmemonly would
; tell alias analysis that the call touches no memory at all and would allow
; it to be reordered or merged across stores to the buffer.  readonly alone
; describes what the body does.
define zing i32 @_updateByteBufferAdler32(i32 %adler, i64 %src, i32 %off, i32 %len)
  readonly nounwind noinline "alwaysinline-top-level" {
entry:
  ; The offset is sign-extended before being added to the raw address.
  %off_64 = sext i32 %off to i64
  %unsafe_addr = add i64 %src, %off_64
  %src_ptr = inttoptr i64 %unsafe_addr to ptr
  %adler.zext = zext i32 %adler to i64
  %len.zext = zext i32 %len to i64

  %res = call i32 @"StubRoutines::updateBytesAdler32().p0"(i64 %adler.zext,
       ptr %src_ptr, i64 %len.zext)
  ret i32 %res
}

; LLVM signed-add-with-overflow intrinsic: returns { sum, overflow flag }.
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone

; Intrinsic for
;   static boolean java.lang.invoke.MethodHandleImpl.profileBoolean(boolean, int[])
;
; Always returns %result unchanged.  Unless @FalconUseEmptyProfileBoolean is
; set, it also bumps counters[result ? 1 : 0] by one; if that increment would
; overflow, the counter is replaced by half of its previous value instead.
define zing i32 @_profileBoolean(i32 %result, ptr addrspace(1) %counters)
  nounwind alwaysinline {
entry:
  %skip_profiling = load i1, ptr @FalconUseEmptyProfileBoolean
  br i1 %skip_profiling, label %done, label %profile
profile:
  ; Slot 1 counts "true" results, slot 0 counts "false" results.
  %hdr_bytes = load i32, ptr @arrayOopDesc.intArrayHeaderSize
  %is_true = trunc i32 %result to i1
  %slot = select i1 %is_true, i32 1, i32 0
  %elements = getelementptr inbounds i8, ptr addrspace(1) %counters, i32 %hdr_bytes
  %slot_addr = getelementptr inbounds i32, ptr addrspace(1) %elements, i32 %slot
  %old_count = load atomic i32, ptr addrspace(1) %slot_addr unordered, align 4
  %inc = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %old_count, i32 1)
  %inc.value = extractvalue { i32, i1 } %inc, 0
  %inc.overflowed = extractvalue { i32, i1 } %inc, 1
  br i1 %inc.overflowed, label %halve, label %keep_increment
halve:
  ; The increment overflowed: fall back to half of the old count.
  %halved = lshr i32 %old_count, 1
  br label %write_back
keep_increment:
  br label %write_back
write_back:
  %new_count = phi i32 [%inc.value, %keep_increment], [%halved, %halve]
  store atomic i32 %new_count, ptr addrspace(1) %slot_addr unordered, align 4
  br label %done
done:
  ret i32 %result
}

; This function does the CAS for references and will be overloaded to handle
; both compressed and regular oops.
; It returns a { value, success } pair; callers in this file read field 1 as
; the success flag and pass field 0 through @azul.unpoison.oop.
declare { ptr addrspace(1), i1 } @azul.cas_ref(ptr addrspace(1) %addr, ptr addrspace(1) %expected_val, 
                                             ptr addrspace(1) %new_val)
    "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind

; "Lazy LVB" strategy for the reference CAS at GEP(%object, %offset): try the
; CAS right away and heal the location and retry once only if the first
; attempt fails.  Returns the { value, success } pair of the deciding attempt,
; with the value component unpoisoned.
define { ptr addrspace(1), i1 } @compareAndSwapObject_lazy_lvb(ptr addrspace(1) %object, i64 %offset,
                                                    ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
    alwaysinline nounwind willreturn "gc-leaf-function" {
entry:
  %oopAddr = getelementptr i8, ptr addrspace(1) %object, i64 %offset
  ; Both operands are poisoned before they are handed to the CAS.
  %expected.poisoned = call ptr addrspace(1) @azul.poison.oop(ptr addrspace(1) %expected)

  %newVal.poisoned = call ptr addrspace(1) @azul.poison.oop(ptr addrspace(1) %newVal)

  ;; In this strategy, we speculatively assume that the reference
  ;; stored at *%oopAddr is in the current phase, and the CAS is
  ;; likely to succeed.
  %firstTry = call { ptr addrspace(1), i1 } @azul.cas_ref(ptr addrspace(1) %oopAddr, 
                                             ptr addrspace(1) %expected.poisoned, ptr addrspace(1) %newVal.poisoned)
  %success = extractvalue { ptr addrspace(1), i1 } %firstTry, 1
  br i1 %success, label %return_success, label %retry

return_success:
  ; Unpoison the value component and return the first attempt's result.
  %firstTry.ptr = extractvalue { ptr addrspace(1), i1 } %firstTry, 0
  %firstTry.ptr.unpoisoned = call ptr addrspace(1) @azul.unpoison.oop(ptr addrspace(1) %firstTry.ptr)
  %firstTry.unpoisoned = insertvalue { ptr addrspace(1), i1 } %firstTry, ptr addrspace(1) %firstTry.ptr.unpoisoned, 0
  ret { ptr addrspace(1), i1 } %firstTry.unpoisoned

retry:
  ;; If the CAS failed, it could be because the reference in *%oopAddr is
  ;; logically equal to %expected but out of phase, or it could be that
  ;; *%oopAddr holds a logically different reference.  We rule out the former
  ;; possibility by healing the location.
  ;;
  ;; NB! Despite being operationally equivalent at this time, it is _not_
  ;; legal to use a @jHeapLvb call here.  @jHeapLvb calls have LVB semantics,
  ;; meaning they can be optimized as LVBs.  LVB optimizations include
  ;; removing unused @jHeapLvb calls, which we do not want here.
  ;;
  ;; TODO: There is a minor optimization here -- we can re-use the value
  ;; returned from the cmpxchg instead of re-loading from %oopAddr in
  ;; @azul.healJHeapLocation.
  call void @azul.healJHeapLocation(ptr addrspace(1) %oopAddr)

  ;; Note: threads always store in-phase oops so after the
  ;; @azul.healJHeapLocation call above is done we know that whatever is
  ;; currently in *%oopAddr is in phase.

  ;; The result of this second attempt is returned whether or not it succeeds.
  %finalTry = call { ptr addrspace(1), i1 } @azul.cas_ref(ptr addrspace(1) %oopAddr, 
                                             ptr addrspace(1) %expected.poisoned, ptr addrspace(1) %newVal.poisoned)
  %finalTry.ptr = extractvalue { ptr addrspace(1), i1 } %finalTry, 0
  %finalTry.ptr.unpoisoned = call ptr addrspace(1) @azul.unpoison.oop(ptr addrspace(1) %finalTry.ptr)
  %finalTry.unpoisoned = insertvalue { ptr addrspace(1), i1 } %finalTry, ptr addrspace(1) %finalTry.ptr.unpoisoned, 0
  ret { ptr addrspace(1), i1 } %finalTry.unpoisoned
}

; "Eager LVB" strategy for the reference CAS at GEP(%object, %offset): heal
; the location first, then perform a single CAS.  Returns the CAS's
; { value, success } pair with the value component unpoisoned.
define { ptr addrspace(1), i1 } @compareAndSwapObject_eager_lvb(ptr addrspace(1) %object, i64 %offset,
                                                    ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
    alwaysinline nounwind willreturn "gc-leaf-function" {
entry:
  %oopAddr = getelementptr i8, ptr addrspace(1) %object, i64 %offset
  ;; Eagerly fix up *%oopAddr, so that when we get to doing the CAS
  ;; we're assured that *%oopAddr holds an in-phase oop.
  ;;
  ;; NB! Despite being operationally equivalent at this time, it is _not_
  ;; legal to use a @jHeapLvb call here.  @jHeapLvb calls have LVB semantics,
  ;; meaning they can be optimized as LVBs.  LVB optimizations include
  ;; removing unused @jHeapLvb calls, which we do not want here.
  call void @azul.healJHeapLocation(ptr addrspace(1) %oopAddr)

  ;; Note: threads always store in-phase oops so after the
  ;; @azul.healJHeapLocation call above is done we know that whatever is
  ;; currently in *%oopAddr is in phase.

  ;; Both operands are poisoned before they are handed to the CAS.
  %expected.poisoned = call ptr addrspace(1) @azul.poison.oop(ptr addrspace(1) %expected)
  %newVal.poisoned = call ptr addrspace(1) @azul.poison.oop(ptr addrspace(1) %newVal)

  %onlyTry = call { ptr addrspace(1), i1 } @azul.cas_ref(ptr addrspace(1) %oopAddr, 
                                             ptr addrspace(1) %expected.poisoned, ptr addrspace(1) %newVal.poisoned)
  ;; Unpoison the value component; the success flag is passed through as is.
  %onlyTry.ptr = extractvalue { ptr addrspace(1), i1 } %onlyTry, 0
  %onlyTry.ptr.unpoisoned = call ptr addrspace(1) @azul.unpoison.oop(ptr addrspace(1) %onlyTry.ptr)
  %onlyTry.unpoisoned = insertvalue { ptr addrspace(1), i1 } %onlyTry, ptr addrspace(1) %onlyTry.ptr.unpoisoned, 0
  ret { ptr addrspace(1), i1 } %onlyTry.unpoisoned
}

;; Try to atomically compare and swap *GEP(%object, %offset) from
;; %expected to %newVal.
;;
;; Returns the { value, success } pair produced by whichever strategy
;; (lazy or eager LVB) is selected by @FalconCASObjectLazyLVB.
;;
;; IMPORTANT: This function needs to be atomic with respect to
;; safepoints (i.e. there can be no safepoints in the function body);
;; meaning "azul-late-inline" can not be less than "3".  Specifically,
;; if we get a safepoint between the CAS and the SVB then we can
;; potentially have an out of sync card table at that safepoint.
;;
;; Note: This function returns a struct, and RS4GC doesn't support
;; first class aggregates (FCA).  As such, we need a separate wrapper
;; around this which doesn't get inlined until after RS4GC.
define { ptr addrspace(1), i1 }
@compareAndSwapObjectImpl(ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %expected,
                                ptr addrspace(1) %newVal)
                                 alwaysinline nounwind willreturn "gc-leaf-function" {
load_vm_constants:
  ;; true selects the lazy strategy, false the eager one.
  %lazyLVB = load i1, ptr @FalconCASObjectLazyLVB
  br label %entry

entry:
  %oopAddr = getelementptr i8, ptr addrspace(1) %object, i64 %offset

  ;; Note: we should investigate if we can sink this SVB down the
  ;; successful path.
  ;; The SVB runs unconditionally, before the CAS is attempted; its result is
  ;; not used.
  %unused = call ptr addrspace(1) @svb(ptr addrspace(1) %newVal,
                                       ptr addrspace(1) %oopAddr,
                                       ptr addrspace(1) %object,
                                       i32 0)

  ;; Choose between two equally correct strategies with different
  ;; performance characteristics.
  br i1 %lazyLVB, label %lazy_lvb, label %eager_lvb

lazy_lvb:
  %result_l = call { ptr addrspace(1), i1 } @compareAndSwapObject_lazy_lvb(
      ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
  br label %cas_done

eager_lvb:
  %result_e = call { ptr addrspace(1), i1 } @compareAndSwapObject_eager_lvb(
        ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
  br label %cas_done

cas_done:
  %result = phi { ptr addrspace(1), i1 } [ %result_l, %lazy_lvb ], [ %result_e, %eager_lvb ]
  ret { ptr addrspace(1), i1 } %result
}

; Boolean flavour of the reference CAS: runs @compareAndSwapObjectImpl and
; returns only its success flag.
; The note on compareAndSwapObjectImpl explains why no safepoint may occur
; inside this function.
define i1 @azul.compareAndSwapObject(ptr addrspace(1) nocapture %object, i64 %offset, ptr addrspace(1) nocapture %expected,
                                     ptr addrspace(1) %newVal)
    noinline nounwind "azul-late-inline"="4" willreturn "gc-leaf-function" {
entry:
  %cas = call { ptr addrspace(1), i1 } @compareAndSwapObjectImpl(
      ptr addrspace(1) %object, i64 %offset,
      ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
  ; Field 1 of the pair is the success flag.
  %swapped = extractvalue { ptr addrspace(1), i1 } %cas, 1
  ret i1 %swapped
}

; Exchange flavour of the reference CAS: runs @compareAndSwapObjectImpl and
; returns only the reference component of its result.
; The note on compareAndSwapObjectImpl explains why no safepoint may occur
; inside this function.
define ptr addrspace(1) @azul.compareAndExchangeObject(ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %expected,
                                     ptr addrspace(1) %newVal)
    noinline nounwind "azul-late-inline"="4" willreturn "gc-leaf-function" {
entry:
  %cas = call { ptr addrspace(1), i1 } @compareAndSwapObjectImpl(
      ptr addrspace(1) %object, i64 %offset,
      ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
  ; Field 0 of the pair is the reference reported by the CAS.
  %witness = extractvalue { ptr addrspace(1), i1 } %cas, 0
  ret ptr addrspace(1) %witness
}


;; Intrinsic entry point for the boolean reference CAS.  Decodes the
;; (object, offset) pair via @azul.get_base_oop / @azul.get_byte_offset, runs
;; @azul.compareAndSwapObject on the decoded pair and widens the i1 outcome
;; to i32.  %unsafeInstance is not used.
;; Note: This is mapped to compareAndSetObject in jdk9+
define zing i32 @_compareAndSwapObject(ptr addrspace(1) %unsafeInstance,
    ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
      alwaysinline {
entry:
  %holder = call ptr addrspace(1) @azul.get_base_oop(ptr addrspace(1) %object, i64 %offset)
  %field_offset = call i64 @azul.get_byte_offset(ptr addrspace(1) %object, i64 %offset)
  %swapped = call i1 @azul.compareAndSwapObject(
        ptr addrspace(1) %holder, i64 %field_offset, ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
  %swapped.i32 = zext i1 %swapped to i32
  ret i32 %swapped.i32
}

;; Intrinsic entry point for the exchanging reference CAS.  Decodes the
;; (object, offset) pair via @azul.get_base_oop / @azul.get_byte_offset and
;; returns whatever @azul.compareAndExchangeObject reports for the decoded
;; pair.  %unsafeInstance is not used.
define zing ptr addrspace(1) @_compareAndExchangeObject(ptr addrspace(1) %unsafeInstance,
    ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
      alwaysinline {
entry:
  %holder = call ptr addrspace(1) @azul.get_base_oop(ptr addrspace(1) %object, i64 %offset)
  %field_offset = call i64 @azul.get_byte_offset(ptr addrspace(1) %object, i64 %offset)
  %witness = call ptr addrspace(1) @azul.compareAndExchangeObject(
        ptr addrspace(1) %holder, i64 %field_offset, ptr addrspace(1) %expected, ptr addrspace(1) %newVal)
  ret ptr addrspace(1) %witness
}

; Returns an i64 with every bit set except one: bit 62 when
; @JDK_VERSION_MAJOR is above 8, bit 31 otherwise.
; For JDK9+ bit 62 is selected to keep marked offsets positive.
define i64 @azul.get_static_field_offset_mask()
    "azul-late-inline"="0" readnone speculatable alwaysinline willreturn "gc-leaf-function" nounwind {
entry:
  ; The two candidate marker bits.
  %marker.jdk8 = shl i64 1, 31
  %marker.jdk9plus = shl i64 1, 62
  %major = load i32, ptr @JDK_VERSION_MAJOR
  %is_jdk9plus = icmp ugt i32 %major, 8
  %marker = select i1 %is_jdk9plus, i64 %marker.jdk9plus, i64 %marker.jdk8
  ; Invert: all ones except the marker bit.
  %all_but_marker = xor i64 %marker, -1
  ret i64 %all_but_marker
}

; With -XX:+UseTrueObjectsForUnsafe the 31st or 62nd bit of the offset is set
; for static fields, so that bit has to be masked out before the offset is
; used.  Note: this applies only to the offsets returned by the Unsafe API.
;
; Returns %offset unchanged when the flag is off, when %object is null, or
; when the klass id of %object differs from that of @java_lang_Class.oop;
; otherwise returns %offset with the marker bit cleared.
define i64 @azul.get_byte_offset(ptr addrspace(1) %object, i64 %offset)
    "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind {
entry:
  %feature_on = load i1, ptr @UseTrueObjectsForUnsafe
  br i1 %feature_on, label %test_null, label %keep_offset

test_null:
  %has_object = icmp ne ptr addrspace(1) %object, null
  br i1 %has_object, label %test_mirror, label %keep_offset

test_mirror:
  ; Compare klass ids to find out whether %object is a java.lang.Class.
  %class_oop = load ptr addrspace(1), ptr @java_lang_Class.oop
  %class_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %class_oop)
  %receiver_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %object)
  %is_mirror = icmp eq i32 %class_kid, %receiver_kid
  br i1 %is_mirror, label %clear_marker, label %keep_offset

clear_marker:
  ; We assume that this branch can be removed together with the kid checks.
  ; This effectively just masks out the 31st or 62nd bit depending on the JDK
  ; version.
  %clear_mask = call i64 @azul.get_static_field_offset_mask()
  %plain_offset = and i64 %offset, %clear_mask
  ret i64 %plain_offset

keep_offset:
  ret i64 %offset
}

; With -XX:+UseTrueObjectsForUnsafe the 31st or 62nd bit of the offset is set
; for static fields.  It tells us that %object is a mirror and that the
; corresponding Klass OOP has to be loaded to access the field.  Note: this
; applies only to the base pointers and offsets returned by the Unsafe API.
;
; Returns @azul.get_klass_oop(%object) when the flag is on, %object is
; non-null, the marker bit is set in %offset and %object has the klass id of
; @java_lang_Class.oop; returns %object unchanged in every other case.
define ptr addrspace(1) @azul.get_base_oop(ptr addrspace(1) %object, i64 %offset)
    "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind {
entry:
  %feature_on = load i1, ptr @UseTrueObjectsForUnsafe
  br i1 %feature_on, label %test_null, label %keep_object

test_null:
  %has_object = icmp ne ptr addrspace(1) %object, null
  br i1 %has_object, label %test_marker, label %keep_object

test_marker:
  ; OR-ing the offset with the mask gives all ones exactly when the one bit
  ; missing from the mask (31st or 62nd, depending on the JDK version) is set
  ; in the offset.
  %clear_mask = call i64 @azul.get_static_field_offset_mask()
  %offset_with_mask = or i64 %offset, %clear_mask
  %marker_set = icmp eq i64 %offset_with_mask, -1
  br i1 %marker_set, label %test_mirror, label %keep_object

test_mirror:
  ; Strictly speaking this KID check is unnecessary, but the type can often be
  ; inferred from the context, and when that happens both the KID check and
  ; the branch to the offset decoding are eliminated.
  %class_oop = load ptr addrspace(1), ptr @java_lang_Class.oop
  %class_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %class_oop)
  %receiver_kid = call i32 @azul.get_klass_id(ptr addrspace(1) %object)
  %is_mirror = icmp eq i32 %class_kid, %receiver_kid
  br i1 %is_mirror, label %load_klass_oop, label %keep_object

load_klass_oop:
  %klass_oop = call ptr addrspace(1) @azul.get_klass_oop(ptr addrspace(1) %object)
  ret ptr addrspace(1) %klass_oop

keep_object:
  ret ptr addrspace(1) %object
}

; Intrinsic entry point for Unsafe.getAndSetObject.
; See http://opengrok.azulsystems.com:8080/source/xref/jdk8_dev/jdk/src/share/classes/sun/misc/Unsafe.java#1167
;
; Decodes the (object, offset) pair via @azul.get_base_oop /
; @azul.get_byte_offset and returns the result of @azul.getAndSetObject on
; the decoded pair.  %unsafeInstance is not used.
define zing ptr addrspace(1) @_getAndSetObject(ptr addrspace(1) %unsafeInstance,
    ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %newValue)
      "alwaysinline-top-level" {
entry:
  %holder = call ptr addrspace(1) @azul.get_base_oop(ptr addrspace(1) %object, i64 %offset)
  %field_offset = call i64 @azul.get_byte_offset(ptr addrspace(1) %object, i64 %offset)
  %previous = call ptr addrspace(1) @azul.getAndSetObject(
        ptr addrspace(1) %holder, i64 %field_offset, ptr addrspace(1) %newValue)
  ret ptr addrspace(1) %previous
}

; Exchange primitive for references, used by @azul.getAndSetObject: takes the
; field address and the new value and returns a reference that the caller
; treats as the previous field contents.
declare ptr addrspace(1) @azul.atomicrmw_xchg_ref(ptr addrspace(1) %oop.ptr, ptr addrspace(1) %newVal)
     "azul-late-inline"="0" alwaysinline willreturn "gc-leaf-function" nounwind

; Exchanges the reference stored at GEP(%object, %offset) with %newVal and
; returns the (unpoisoned) reference obtained from the exchange.  The sequence
; is: SVB, heal the location, poison the new value, exchange, unpoison the
; result.
define ptr addrspace(1) @azul.getAndSetObject(
    ptr addrspace(1) %object, i64 %offset, ptr addrspace(1) %newVal)
      noinline nounwind "azul-late-inline"="4" willreturn "gc-leaf-function" {
entry:
  %oop.ptr = getelementptr inbounds i8, ptr addrspace(1) %object, i64 %offset

  ;; The SVB runs before the exchange; its result is not used.
  %unused = call ptr addrspace(1) @svb(ptr addrspace(1) %newVal,
                                       ptr addrspace(1) %oop.ptr,
                                       ptr addrspace(1) %object,
                                       i32 0)

  ;; Eagerly fix up *%oop.ptr, so that when we get to doing the XCHG
  ;; we're assured that *%oop.ptr holds an in-phase oop.
  ;;
  ;; NB! Despite being operationally equivalent at this time, it is _not_
  ;; legal to use a @jHeapLvb call here.  @jHeapLvb calls have LVB semantics,
  ;; meaning they can be optimized as LVBs.  LVB optimizations include
  ;; removing unused @jHeapLvb calls, which we do not want here.
  call void @azul.healJHeapLocation(ptr addrspace(1) %oop.ptr)

  ;; NB: threads always store in-phase oops so after the @azul.healJHeapLocation
  ;; above is done we know that whatever is currently in *%oop.ptr is in phase.

  %newVal.poisoned = call ptr addrspace(1) @azul.poison.oop(ptr addrspace(1) %newVal)
  %oldVal = call ptr addrspace(1) @azul.atomicrmw_xchg_ref(ptr addrspace(1) %oop.ptr, ptr addrspace(1) %newVal.poisoned) 
  %oldVal.unpoisoned = call ptr addrspace(1) @azul.unpoison.oop(ptr addrspace(1) %oldVal)
  ret ptr addrspace(1) %oldVal.unpoisoned
}

; Target-specific spin-wait intrinsics used by @onSpinWait: one for x86-64,
; one for AArch64.
declare void @llvm.x86.sse2.pause() nounwind
declare void @llvm.aarch64.hint(i32) nounwind

; Emits the spin-wait hint of the build target: the AArch64 hint intrinsic
; when @FalconAARCH64Build is set, the x86 pause intrinsic otherwise.
define void @onSpinWait() alwaysinline nounwind willreturn "gc-leaf-function" {
entry:
  %target_is_aarch64 = load i1, ptr @FalconAARCH64Build
  br i1 %target_is_aarch64, label %hint_aarch64, label %pause_x86

pause_x86:
  call void @llvm.x86.sse2.pause() nounwind
  ret void

hint_aarch64:
  ; NOTE(review): hint immediate 1 is presumably YIELD -- confirm against the
  ; AArch64 HINT encoding.
  call void @llvm.aarch64.hint(i32 1) nounwind
  ret void
}

;; Intrinsic for java.lang.Thread.onSpinWait(): forwards to @onSpinWait().
define zing void @_onSpinWait_jdk() nounwind alwaysinline "azul-generatable" "azul-inlining-candidate" {
spin_hint:
  call void @onSpinWait() nounwind
  ret void
}

;; Intrinsic for org.agrona.hints.ThreadHints.onSpinWait(): forwards to
;; @onSpinWait().
define zing void @_onSpinWait_agrona() nounwind alwaysinline "azul-generatable" "azul-inlining-candidate" {
spin_hint:
  call void @onSpinWait() nounwind
  ret void
}

;; Intrinsic for org.performancehints.ThreadHints.onSpinWait(): forwards to
;; @onSpinWait().
define zing void @_onSpinWait_performance_Hints() nounwind alwaysinline "azul-generatable" "azul-inlining-candidate" {
spin_hint:
  call void @onSpinWait() nounwind
  ret void
}

;; Intrinsic for com.lmax.disruptor.util.ThreadHints.onSpinWait(): forwards to
;; @onSpinWait().
define zing void @_onSpinWait_disruptor_Hints() nounwind alwaysinline "azul-generatable" "azul-inlining-candidate" {
spin_hint:
  call void @onSpinWait() nounwind
  ret void
}

; This JBA represents the abstract notion of cloning the source object into an
; identical, newly allocated one.  Note that cloning is allowed to fail for
; unspecified reasons.  In practice, this is used to model the runtime call
; used to implement Object::clone.  We will use the hook provided by the JBA
; to specialize the implementation when an exact type is known, and to
; implement optimizations like escape analysis even when it's not.
; The result is declared noalias and nonnull; %src is only read.
declare zing noalias nonnull ptr addrspace(1) @azul.clone_object(ptr addrspace(1) readonly %src)
    "consumes-caller-vmstate"

; Runtime entry points that record the outcome of a transaction.
declare void @"DolphinRuntime::transaction_succeed"() nounwind willreturn "gc-leaf-function"
declare void @"DolphinRuntime::transaction_failed"(i32 %reason) nounwind willreturn "gc-leaf-function"

; Abstraction over DolphinRuntime::transaction_succeed; kept out of line
; (noinline) until late-inline round "3".
define void @azul.report_transaction_success() nounwind willreturn "gc-leaf-function" 
    "has-latent-use" "azul-late-inline"="3" noinline {
  call void @"DolphinRuntime::transaction_succeed"()
  ret void
}
; Abstraction over DolphinRuntime::transaction_failed; forwards %reason
; unchanged.  Kept out of line (noinline) until late-inline round "3".
define void @azul.report_transaction_failure(i32 %reason) nounwind willreturn "gc-leaf-function" 
    "has-latent-use" "azul-late-inline"="3" noinline {
  call void @"DolphinRuntime::transaction_failed"(i32 %reason)
  ret void
}

; Maps to SharedRuntime::log4j_getCallerClass
; Takes the current thread (as i64) plus the %fqcn and %pkg references and
; returns a reference.
declare zing_stub_default ptr addrspace(1)  @"StubRoutines::log4j_getCallerClass()"(
        i64 %current_thread,
        ptr addrspace(1) %fqcn,
        ptr addrspace(1) %pkg
) "consumes-caller-vmstate" "azul-deopt-on-throw" "azul-allow-gcptrs-in-regs"

; Intrinsic wrapper around the log4j getCallerClass stub: fetches the current
; thread and forwards it together with %fqcn and %pkg.  %this is not used.
; The stub call carries an (empty) "deopt" operand bundle.
define ptr addrspace(1) @_getCallerClass_log4j(ptr addrspace(1) %this,
                                               ptr addrspace(1) %fqcn,
                                               ptr addrspace(1) %pkg)
        nounwind alwaysinline {
entry:
  %thread = call i64 @azul.get_current_thread()
  %caller_class = call zing_stub_default ptr addrspace(1) @"StubRoutines::log4j_getCallerClass()"(
      i64 %thread, ptr addrspace(1) %fqcn, ptr addrspace(1) %pkg) [ "deopt"() ]
  ret ptr addrspace(1) %caller_class
}

; Intrinsic for getObjectSize0.  The Java method is non-static, hence the
; leading %this argument; %this and %agent are not used.
; %obj has already been null-checked by the caller.
;
; Returns the size of %obj in bytes: for instances the size bits of the layout
; helper, for arrays the value computed by @azul.compute_array_heap_size from
; length, log2 element size and header size.
define i64 @_getObjectSize(ptr addrspace(1) %this, i64 %agent, ptr addrspace(1) nonnull %obj)
  alwaysinline nounwind willreturn "gc-leaf-function" {
entry:
  %klass_id = call i32 @azul.get_klass_id(ptr addrspace(1) %obj)
  %layout = call i32 @azul.load_layout_helper(i32 %klass_id)
  ; Layout helpers below the neutral value denote arrays.
  %neutral = load i32, ptr @Klass.layout_helper_neutral_value
  %obj_is_array = icmp slt i32 %layout, %neutral
  br i1 %obj_is_array, label %size_of_array, label %size_of_instance

size_of_instance:
  %size_mask = load i32, ptr @Klass.layout_helper_size_in_bytes_mask
  %instance_bytes = and i32 %size_mask, %layout
  %instance_bytes.i64 = zext i32 %instance_bytes to i64
  ret i64 %instance_bytes.i64

size_of_array:
  %hdr_bytes = call i32 @azul.layout_helper_header_size(i32 %layout)
  %hdr_bytes.i64 = zext i32 %hdr_bytes to i64
  %elem_shift = call i32 @azul.layout_helper_log2_element_size(i32 %layout)
  %elem_shift.i64 = zext i32 %elem_shift to i64
  %count = call i32 @azul.array_length(ptr addrspace(1) %obj)
  %count.i64 = zext i32 %count to i64
  %array_bytes = call i64 @azul.compute_array_heap_size(
    i64 %count.i64, i64 %elem_shift.i64, i64 %hdr_bytes.i64)
  ret i64 %array_bytes
}

;; This will be expanded into a call to DolphinRuntime::box_value
;; Arguments: the current thread (as i64), the raw value widened to i64, and
;; an i32 selector (@_getArrayElement passes the array's element type here).
declare ptr addrspace(1) @azul.box_value(i64 %javaThread, i64 %value, i32 %index) "consumes-caller-vmstate" nounwind

; Intrinsic of java.lang.reflect.Array.get method
;
; Returns element %index of %array as a reference: object-array elements are
; loaded and returned directly, primitive elements are loaded, widened to i64
; and boxed through @azul.box_value.  Every exceptional case (null array,
; non-array argument, index out of bounds) deoptimizes to before the call so
; that the interpreter throws the exception.
define ptr addrspace(1) @_getArrayElement(ptr addrspace(1) %array, i32 %index) alwaysinline {
entry:
  ; Check for null object for np exception
  %not_null = icmp ne ptr addrspace(1) %array, null
  br i1 %not_null, label %not.null, label %deopt.before, !prof !9

not.null:
  ; Check that the argument is an array for illegal argument exception
  ; (layout helpers below the neutral value denote arrays).
  %lh_neutral_value = load i32, ptr @Klass.layout_helper_neutral_value
  %kid = call i32 @azul.get_klass_id(ptr addrspace(1) %array)
  %lh = call i32 @azul.load_layout_helper(i32 %kid)
  %is_array = icmp slt i32 %lh, %lh_neutral_value
  br i1 %is_array, label %is.array, label %deopt.before, !prof !9

is.array:
  ; Check that the index is in bounds for oob exception: 0 <= index < length
  %length = call i32 @azul.array_length(ptr addrspace(1) %array)
  %is_in_bounds = icmp slt i32 %index, %length
  %is_non_negative = icmp sge i32 %index, 0
  %is_correct_index = and i1 %is_in_bounds, %is_non_negative
  br i1 %is_correct_index, label %extract, label %deopt.before, !prof !9

extract:
  ; Get the element pointer
  ; (%index is known to be non-negative here, so zero extension is exact.)
  %basictype = call i32 @azul.layout_helper_element_type(i32 %lh)
  %header_size = call i32 @azul.layout_helper_header_size(i32 %lh)
  %header_size.zext = zext i32 %header_size to i64
  %index.zext = zext i32 %index to i64

  %log2_element_size = call i32 @azul.layout_helper_log2_element_size(i32 %lh)
  %log2_element_size.zext = zext i32 %log2_element_size to i64

  ;; index_ptr = array + header_size + (index << log2_element_size)
  %index_bytes = shl i64 %index.zext, %log2_element_size.zext
  %index_offset = add i64 %index_bytes, %header_size.zext
  %index_ptr = getelementptr inbounds i8, ptr addrspace(1) %array, i64 %index_offset
  
  ; Check if the array elements are primitive for boxing reasons:
  ; anything whose layout helper is not the object-array one is boxed.
  %object_array_lh = load i32, ptr @Klass.object_array_layout_helper
  %src_is_objarray = icmp eq i32 %lh, %object_array_lh
  br i1 %src_is_objarray, label %not.primitive, label %primitive
  
not.primitive:
  ; Just load the oop and return it
  %loaded_val = load ptr addrspace(1), ptr addrspace(1) %index_ptr
  ret ptr addrspace(1) %loaded_val

primitive:
  %current_thread = call i64 @azul.get_current_thread()
  ; Extract value based on the element size (log2 of it):
  ; 0 -> 1 byte, 1 -> 2 bytes, 2 -> 4 bytes, anything else -> 8 bytes.
  switch i32 %log2_element_size, label %extract8
                        [ i32 0, label %extract1
                          i32 1, label %extract2
                          i32 2, label %extract4 ]
  ; Narrow elements are zero-extended (not sign-extended) to i64.
extract1:
  %val1 = load i8, ptr addrspace(1) %index_ptr
  %value1_as_i64 = zext i8 %val1 to i64
  br label %box
extract2:
  %val2 = load i16, ptr addrspace(1) %index_ptr
  %value2_as_i64 = zext i16 %val2 to i64
  br label %box
extract4:
  %val4 = load i32, ptr addrspace(1) %index_ptr
  %value4_as_i64 = zext i32 %val4 to i64
  br label %box
extract8:
  %value8_as_i64 = load i64, ptr addrspace(1) %index_ptr
  br label %box
box:
  ; We need to box the primitive value; the element type tells the boxing
  ; routine what it is boxing.
  %value_as_i64 = phi i64 [ %value1_as_i64, %extract1 ],
                          [ %value2_as_i64, %extract2 ],
                          [ %value4_as_i64, %extract4 ],
                          [ %value8_as_i64, %extract8 ]
  %rval = call ptr addrspace(1) @azul.box_value(i64 %current_thread, i64 %value_as_i64, i32 %basictype) [ "deopt"() ]
  ret ptr addrspace(1) %rval
deopt.before:
  ; Exception, deopt to before call and let the interpreter throw the exception
  %reasonUnhandled = load i32, ptr @DeoptReasons.Reason_unhandled
  %ret = call zing_uncommon_trap ptr addrspace(1)(...) @llvm.experimental.deoptimize.isPtr(i32 %reasonUnhandled) "azul-need-deopt-before-call" [ "deopt"() ]
  ret ptr addrspace(1) %ret
}

; Writes the low 8 bits of %enter into the is_disable_suspend field, which is
; addressed as a byte offset (@Thread.is_disable_suspend_offset_bytes) from a
; null base in address space 256.
; NOTE(review): address space 256 presumably denotes the current thread's
; segment-relative storage -- confirm against the target data layout.
define void @_notifyJvmtiVThreadDisableSuspend(i32 %enter) alwaysinline {
  ; Update is_disable_suspend field to %enter in current JavaThread
  %enteri8 = trunc i32 %enter to i8
  %is_disable_suspend_offset = load i32, ptr @Thread.is_disable_suspend_offset_bytes
  %is_disable_suspend_address = getelementptr inbounds i8, ptr addrspace(256) null, i32 %is_disable_suspend_offset
  ; A one-byte field at a runtime-supplied byte offset can only be assumed to
  ; be byte aligned; claiming a larger alignment than the address really has
  ; is undefined behavior.
  store i8 %enteri8, ptr addrspace(256) %is_disable_suspend_address, align 1
  ret void
}

; JVMTI virtual-thread notification abstractions.  Each takes the current
; thread (as i64), an i32 flag (the mount/unmount intrinsics pass their %hide
; argument here) and the reference the intrinsic received as %this.
declare void @azul.notify_jvmti_vthread_start(i64, i32, ptr addrspace(1)) "consumes-replay-vmstate" nounwind
declare void @azul.notify_jvmti_vthread_end(i64, i32, ptr addrspace(1)) "consumes-replay-vmstate" nounwind
declare void @azul.notify_jvmti_vthread_mount(i64, i32, ptr addrspace(1)) "consumes-replay-vmstate" nounwind
declare void @azul.notify_jvmti_vthread_unmount(i64, i32, ptr addrspace(1)) "consumes-replay-vmstate" nounwind

; Intrinsic that reports a virtual thread start: calls
; @azul.notify_jvmti_vthread_start with the current thread, a constant flag
; of 0 and %this.
define void @_notifyJvmtiVThreadStart(ptr addrspace(1) %this) alwaysinline {
entry:
  %thread = call i64 @azul.get_current_thread()
  call void @azul.notify_jvmti_vthread_start(i64 %thread, i32 0, ptr addrspace(1) %this) [ "deopt"() ]
  ret void
}

; Intrinsic that reports a virtual thread end: calls
; @azul.notify_jvmti_vthread_end with the current thread, a constant flag of 1
; and %this.
define void @_notifyJvmtiVThreadEnd(ptr addrspace(1) %this) alwaysinline {
entry:
  %thread = call i64 @azul.get_current_thread()
  call void @azul.notify_jvmti_vthread_end(i64 %thread, i32 1, ptr addrspace(1) %this) [ "deopt"() ]
  ret void
}

; Intrinsic that reports a virtual thread mount: calls
; @azul.notify_jvmti_vthread_mount with the current thread, the caller's %hide
; flag and %this.
define void @_notifyJvmtiVThreadMount(ptr addrspace(1) %this, i32 %hide) alwaysinline {
entry:
  %thread = call i64 @azul.get_current_thread()
  call void @azul.notify_jvmti_vthread_mount(i64 %thread, i32 %hide, ptr addrspace(1) %this) [ "deopt"() ]
  ret void
}

; Intrinsic that reports a virtual thread unmount: calls
; @azul.notify_jvmti_vthread_unmount with the current thread, the caller's
; %hide flag and %this.
define void @_notifyJvmtiVThreadUnmount(ptr addrspace(1) %this, i32 %hide) alwaysinline {
entry:
  %thread = call i64 @azul.get_current_thread()
  call void @azul.notify_jvmti_vthread_unmount(i64 %thread, i32 %hide, ptr addrspace(1) %this) [ "deopt"() ]
  ret void
}

; NOTE: calling convention indicated in IR doesn't matter for this stub.
; Takes a single i64 iteration count.
declare void @"StubRoutines::overhead()"(i64 %overhead_iterations) nounwind willreturn "gc-leaf-function" "has-latent-use"

; Multi-dimensional array allocation abstraction.
; NOTE(review): the meaning of the (i64, i32, i32, ptr) arguments is not
; visible from this declaration -- confirm against the users.
declare ptr addrspace(1) @azul.multianewarray(i64, i32, i32, ptr) "consumes-replay-vmstate" nounwind

; Branch-weight profile metadata: weights 4 and 64, applied to the successors
; of the annotated terminator in order.
!8 = !{!"branch_weights", i32 4, i32 64}
