Merge "Upgrade pcre to pcre2-10.42" am: 0667e80ea7 am: 649056fcca am: 2113dcc6cb

Original change: https://android-review.googlesource.com/c/platform/external/pcre/+/2350403

Change-Id: I9edc0b9dd3bca961fe464c5c5f96724473d74d75
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 0000000..29223e4
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1,3 @@
+common --experimental_enable_bzlmod
+build --incompatible_enable_cc_toolchain_resolution
+build --incompatible_strict_action_env
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..ae2602d
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,105 @@
+
+name: Build
+on: [push, pull_request]
+
+jobs:
+  linux:
+    name: Linux
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure CPPFLAGS='-Wall -Wextra' --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+    
+  alpine:
+    name: alpine
+    runs-on: ubuntu-latest
+    container: alpine 
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        
+      - name: Setup
+        run: apk add --no-cache automake autoconf gcc libtool make musl-dev 
+        
+      - name: Autogen
+        run: ./autogen.sh
+        
+      - name: Configure
+        run: ./configure CPPFLAGS='-Wall -Wextra' --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
+        
+      - name: Build
+        run: make
+        
+      - name: Test (main test script)
+        run: ./RunTest
+
+      - name: Test (JIT test program)
+        run: ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: ./RunGrepTest
+        
+  macos:
+    name: macOS universal
+    runs-on: macos-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Configure
+        run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DCMAKE_OSX_ARCHITECTURES='arm64;x86_64' -DCMAKE_C_FLAGS='-Wall -Wextra' -B build
+
+      - name: Build
+        run: cmake --build build
+
+      - name: Test (main test script)
+        run: |
+          cd build
+          ../RunTest
+
+      - name: Test (JIT test program)
+        run: |
+          cd build
+          ./pcre2_jit_test
+
+      - name: Test (pcre2grep test script)
+        run: |
+          cd build
+          ../RunGrepTest
+
+  windows:      
+    name: 32bit Windows
+    runs-on: windows-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Configure
+        run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -DCMAKE_IGNORE_PREFIX_PATH=C:/Strawberry/c -B build -A Win32
+
+      - name: Build
+        run: cmake --build build
+
+      - name: Test
+        run: |
+          cd build\Debug
+          ..\..\RunTest.bat
+           
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
new file mode 100644
index 0000000..8ac7dc2
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,24 @@
+name: CIFuzz
+on: [pull_request]
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pcre2'
+        dry-run: false
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pcre2'
+        fuzz-seconds: 300
+        dry-run: false
+    - name: Upload Crash
+      uses: actions/upload-artifact@v3
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..3d0e891
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,73 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ master ]
+  schedule:
+    - cron: '27 6 * * 4'
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'cpp', 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Learn more about CodeQL language support at https://git.io/codeql-language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v2
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+
+    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v2
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+
+    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+
+    #- run: |
+    #   make bootstrap
+    #   make release
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v2
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
new file mode 100644
index 0000000..690f191
--- /dev/null
+++ b/.github/workflows/scorecards.yml
@@ -0,0 +1,55 @@
+name: Scorecards supply-chain security
+on:
+  # Only the default branch is supported.
+  branch_protection_rule:
+  schedule:
+    - cron: '23 17 * * 1'
+  push:
+    branches: [ master ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecards analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      actions: read
+      contents: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@v3
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # Read-only PAT token. To create it,
+          # follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
+          repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
+          # Publish the results to enable scorecard badges. For more details, see
+          # https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories, `publish_results` will automatically be set to `false`,
+          # regardless of the value entered here.
+          publish_results: true
+
+      # Upload the results as artifacts (optional).
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard.
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
+        with:
+          sarif_file: results.sarif
diff --git a/.gitignore b/.gitignore
index 3e3284e..a51f231 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,16 @@
 # Public .gitignore file for PCRE2
 
 *.a
+*.gcda
+*.gcno
 *.lo
 *.la
 *.pc
 *.o
 *~
 
+*-coverage*
+
 __pycache__
 .deps
 .libs
@@ -37,6 +41,9 @@
 pcre2_jit_test
 pcre2_jit_test.log
 pcre2_jit_test.trs
+pcre2posix_test
+pcre2posix_test.log
+pcre2posix_test.trs
 pcre2demo
 pcre2fuzzcheck
 pcre2grep
@@ -75,4 +82,6 @@
 src/pcre2_chartables.c
 src/stamp-h1
 
+/bazel-*
+
 # End
diff --git a/132html b/132html
index 1bd62ba..30c02fe 100755
--- a/132html
+++ b/132html
@@ -94,7 +94,7 @@
       die "*** Processing abandoned\n";
       }
 
-    # Instead of .br, relevent "literal" sections are enclosed in .nf/.fi.
+    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
 
     elsif (/^\.nf/)
       {
diff --git a/BUILD.bazel b/BUILD.bazel
new file mode 100644
index 0000000..1bd314e
--- /dev/null
+++ b/BUILD.bazel
@@ -0,0 +1,72 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+
+copy_file(
+    name = "config_h_generic",
+    src = "src/config.h.generic",
+    out = "src/config.h",
+)
+
+copy_file(
+    name = "pcre2_h_generic",
+    src = "src/pcre2.h.generic",
+    out = "src/pcre2.h",
+)
+
+copy_file(
+    name = "pcre2_chartables_c",
+    src = "src/pcre2_chartables.c.dist",
+    out = "src/pcre2_chartables.c",
+)
+
+cc_library(
+    name = "pcre2",
+    srcs = [
+        "src/pcre2_auto_possess.c",
+        "src/pcre2_compile.c",
+        "src/pcre2_config.c",
+        "src/pcre2_context.c",
+        "src/pcre2_convert.c",
+        "src/pcre2_dfa_match.c",
+        "src/pcre2_error.c",
+        "src/pcre2_extuni.c",
+        "src/pcre2_find_bracket.c",
+        "src/pcre2_maketables.c",
+        "src/pcre2_match.c",
+        "src/pcre2_match_data.c",
+        "src/pcre2_newline.c",
+        "src/pcre2_ord2utf.c",
+        "src/pcre2_pattern_info.c",
+        "src/pcre2_script_run.c",
+        "src/pcre2_serialize.c",
+        "src/pcre2_string_utils.c",
+        "src/pcre2_study.c",
+        "src/pcre2_substitute.c",
+        "src/pcre2_substring.c",
+        "src/pcre2_tables.c",
+        "src/pcre2_ucd.c",
+        "src/pcre2_ucptables.c",
+        "src/pcre2_valid_utf.c",
+        "src/pcre2_xclass.c",
+        ":pcre2_chartables_c",
+    ],
+    hdrs = glob(["src/*.h"]) + [
+        ":config_h_generic",
+        ":pcre2_h_generic",
+    ],
+    defines = [
+        "HAVE_CONFIG_H",
+        "PCRE2_CODE_UNIT_WIDTH=8",
+        "PCRE2_STATIC",
+    ],
+    includes = ["src"],
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+)
+
+cc_binary(
+    name = "pcre2demo",
+    srcs = ["src/pcre2demo.c"],
+    visibility = ["//visibility:public"],
+    deps = [":pcre2"],
+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7febf33..cec7dfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,12 +99,12 @@
 #            build in one go.
 # 2021-08-28 PH increased minimum version
 # 2021-08-28 PH added test for realpath()
-
-PROJECT(PCRE2 C)
+# 2022-12-10 PH added support for pcre2posix_test
 
 # Increased minimum to 2.8.5 to support GNUInstallDirs.
-# Increased minimum to 3.0.0 because older than 2.8.12 is deprecated.
-CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
+# Increased minimum to 3.1 to support imported targets.
+CMAKE_MINIMUM_REQUIRED(VERSION 3.1)
+PROJECT(PCRE2 C)
 
 # Set policy CMP0026 to avoid warnings for the use of LOCATION in
 # GET_TARGET_PROPERTY. This should no longer be required.
@@ -382,7 +382,13 @@
 ENDIF(PCRE2_SUPPORT_UNICODE)
 
 IF(PCRE2_SUPPORT_JIT)
-        SET(SUPPORT_JIT 1)
+	SET(SUPPORT_JIT 1)
+	IF(UNIX)
+		FIND_PACKAGE(Threads REQUIRED)
+		IF(CMAKE_USE_PTHREADS_INIT)
+			SET(REQUIRE_PTHREAD 1)
+		ENDIF(CMAKE_USE_PTHREADS_INIT)
+	ENDIF(UNIX)
 ENDIF(PCRE2_SUPPORT_JIT)
 
 IF(PCRE2_SUPPORT_JIT_SEALLOC)
@@ -652,6 +658,8 @@
 ENDIF(MINGW AND BUILD_SHARED_LIBS)
 
 IF(MSVC AND BUILD_SHARED_LIBS)
+  SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
+  SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
   IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
     SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
   ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
@@ -697,6 +705,10 @@
       VERSION ${LIBPCRE2_8_VERSION}
       SOVERSION ${LIBPCRE2_8_SOVERSION})
     TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
     SET(targets ${targets} pcre2-8-static)
     ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
     SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
@@ -707,6 +719,7 @@
       SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
     TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
     TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
     SET(targets ${targets} pcre2-posix-static)
 
     IF(MSVC)
@@ -723,6 +736,7 @@
 
   IF(BUILD_SHARED_LIBS)
     ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
     SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
       COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
       MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
@@ -730,8 +744,12 @@
       VERSION ${LIBPCRE2_8_VERSION}
       SOVERSION ${LIBPCRE2_8_SOVERSION}
       OUTPUT_NAME pcre2-8)
+    IF(REQUIRE_PTHREAD)
+        TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
     SET(targets ${targets} pcre2-8-shared)
     ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+    TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
     SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
       COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
       MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
@@ -741,6 +759,8 @@
       OUTPUT_NAME pcre2-posix)
     TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
     SET(targets ${targets} pcre2-posix-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
 
     IF(MINGW)
       IF(NON_STANDARD_LIB_PREFIX)
@@ -766,6 +786,7 @@
 IF(PCRE2_BUILD_PCRE2_16)
   IF(BUILD_STATIC_LIBS)
     ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
     SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
       COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
       MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@@ -773,6 +794,9 @@
       VERSION ${LIBPCRE2_16_VERSION}
       SOVERSION ${LIBPCRE2_16_SOVERSION})
     TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
     SET(targets ${targets} pcre2-16-static)
 
     IF(MSVC)
@@ -787,6 +811,7 @@
 
   IF(BUILD_SHARED_LIBS)
     ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
     SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
       COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
       MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@@ -794,7 +819,12 @@
       VERSION ${LIBPCRE2_16_VERSION}
       SOVERSION ${LIBPCRE2_16_SOVERSION}
       OUTPUT_NAME pcre2-16)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
     SET(targets ${targets} pcre2-16-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
 
     IF(MINGW)
       IF(NON_STANDARD_LIB_PREFIX)
@@ -818,6 +848,7 @@
 IF(PCRE2_BUILD_PCRE2_32)
   IF(BUILD_STATIC_LIBS)
     ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
     SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
       COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
       MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@@ -825,6 +856,9 @@
       VERSION ${LIBPCRE2_32_VERSION}
       SOVERSION ${LIBPCRE2_32_SOVERSION})
     TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
     SET(targets ${targets} pcre2-32-static)
 
     IF(MSVC)
@@ -839,6 +873,7 @@
 
   IF(BUILD_SHARED_LIBS)
     ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
+    TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
     SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
       COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
       MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
@@ -846,7 +881,12 @@
       VERSION ${LIBPCRE2_32_VERSION}
       SOVERSION ${LIBPCRE2_32_SOVERSION}
       OUTPUT_NAME pcre2-32)
+    IF(REQUIRE_PTHREAD)
+      TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
+    ENDIF(REQUIRE_PTHREAD)
     SET(targets ${targets} pcre2-32-shared)
+    SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
+    SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
 
     IF(MINGW)
       IF(NON_STANDARD_LIB_PREFIX)
@@ -902,10 +942,16 @@
   ENDIF(PCRE2_BUILD_PCRE2_32)
   TARGET_LINK_LIBRARIES(pcre2test ${PCRE2TEST_LIBS} ${PCRE2TEST_LINKER_FLAGS})
 
+  IF(PCRE2_BUILD_PCRE2_8)
+    ADD_EXECUTABLE(pcre2posix_test src/pcre2posix_test.c)
+    SET(targets ${targets} pcre2posix_test)
+    TARGET_LINK_LIBRARIES(pcre2posix_test pcre2-posix pcre2-8)
+  ENDIF(PCRE2_BUILD_PCRE2_8)
+
   IF(PCRE2_SUPPORT_JIT)
     ADD_EXECUTABLE(pcre2_jit_test src/pcre2_jit_test.c)
     SET(targets ${targets} pcre2_jit_test)
-    SET(PCRE2_JIT_TEST_LIBS )
+    SET(PCRE2_JIT_TEST_LIBS)
     IF(PCRE2_BUILD_PCRE2_8)
       LIST(APPEND PCRE2_JIT_TEST_LIBS pcre2-8)
     ENDIF(PCRE2_BUILD_PCRE2_8)
@@ -1017,6 +1063,11 @@
     ADD_TEST(pcre2_jit_test pcre2_jit_test)
   ENDIF(PCRE2_SUPPORT_JIT)
 
+  IF(PCRE2_BUILD_PCRE2_8)
+    ADD_TEST(pcre2posix_test pcre2posix_test)
+  ENDIF(PCRE2_BUILD_PCRE2_8)
+
+
 ENDIF(PCRE2_BUILD_TESTS)
 
 # Installation
@@ -1053,18 +1104,8 @@
 INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)
 
 IF(MSVC AND INSTALL_MSVC_PDB)
- INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2-8.pdb
-               ${PROJECT_BINARY_DIR}/pcre2-16.pdb
-               ${PROJECT_BINARY_DIR}/pcre2-32.pdb
-               ${PROJECT_BINARY_DIR}/pcre2-posix.pdb
-         DESTINATION bin
-         CONFIGURATIONS RelWithDebInfo)
- INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2-8d.pdb
-               ${PROJECT_BINARY_DIR}/pcre2-16d.pdb
-               ${PROJECT_BINARY_DIR}/pcre2-32d.pdb
-               ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb
-         DESTINATION bin
-         CONFIGURATIONS Debug)
+ INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
+ INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
 ENDIF(MSVC AND INSTALL_MSVC_PDB)
 
 # Help, only for nice output
diff --git a/ChangeLog b/ChangeLog
index 856250f..2b10036 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,118 @@
-Change Log for PCRE2
---------------------
+Change Log for PCRE2 - see also the Git log
+-------------------------------------------
+
+
+Version 10.42 11-December-2022
+------------------------------
+
+1. Change 19 of 10.41 wasn't quite right; it put the definition of a default,
+empty value for PCRE2_CALL_CONVENTION in src/pcre2posix.c instead of
+src/pcre2posix.h, which meant that programs that included pcre2posix.h but not
+pcre2.h failed to compile.
+
+2. To catch similar issues to the above in future, a new small test program
+that includes pcre2posix.h but not pcre2.h has been added to the test suite.
+
+3. When the -S option of pcre2test was used to set a stack size greater than
+the allowed maximum, the error message displayed the hard limit incorrectly.
+This was pointed out on GitHub pull request #171, but the suggested patch
+didn't cope with all cases. Some further modification was required.
+
+4. Supplying an ovector count of more than 65535 to pcre2_match_data_create()
+caused a crash because the field in the match data block is only 16 bits. A
+maximum of 65535 is now silently applied.
+
+5. Merged @carenas patch #175 which fixes #86 - segfault on aarch64 (ARM),
+
+
+Version 10.41 06-December-2022
+------------------------------
+
+1. Add fflush() before and after a fork callout in pcre2grep to get its output
+to be the same on all systems. (There were previously ordering differences in
+Alpine Linux).
+
+2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
+
+3. SSF scorecards grumbled about possible overflow in an expression in
+pcre2test. It never would have overflowed in practice, but some casts have been
+added and at the some time there's been some tidying of fprints that output
+size_t values.
+
+4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
+
+5. Minor code re-arrangement to remove gcc warning about realloc() in
+pcre2test.
+
+6. Change a number of int variables that hold buffer and line lengths in
+pcre2grep to PCRE2_SIZE (aka size_t).
+
+7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
+supported (even though that function would do nothing in that case) at the
+request of a user who doesn't even want to link with pcre_jit_compile.o. Also
+tidied up an untidy #ifdef arrangement in pcre2test.
+
+8. Fixed an issue in the backtracking optimization of character repeats in
+JIT. Furthermore optimize star repetitions, not just plus repetitions.
+
+9. Removed the use of an initial backtracking frames vector on the system stack
+in pcre2_match() so that it now always uses the heap. (In a multi-thread
+environment with very small stacks there had been an issue.) This also is
+tidier for JIT matching, which didn't need that vector. The heap vector is now
+remembered in the match data block and re-used if that block itself is re-used.
+It is freed with the match data block.
+
+10. Adjusted the find_limits code in pcre2test to work with change 9 above.
+
+11. Added find_limits_noheap to pcre2test, because the heap limits are now
+different in different environments and so cannot be included in the standard
+tests.
+
+12. Created a test for pcre2_match() heap processing that is not part of the
+tests run by 'make check', but can be run manually. The current output is from
+a 64-bit system.
+
+13. Implemented -Z aka --null in pcre2grep.
+
+14. A minor change to pcre2test and the addition of several new pcre2grep tests
+have improved LCOV coverage statistics. At the same time, code in pcre2grep and
+elsewhere that can never be obeyed in normal testing has been excluded from
+coverage.
+
+15. Fixed a bug in pcre2grep that could cause an extra newline to be written
+after output generaed by --output.
+
+16. If a file has a .bz2 extension but is not in fact compressed, pcre2grep
+should process it as a plain text file. A bug stopped this happening; now fixed
+and added to the tests.
+
+17. When pcre2grep was running not in UTF mode, if a string specified by
+--output or obtained from a callout in a pattern contained a character (byte)
+greater than 127, it was incorrectly output in UTF-8 format.
+
+18. Added some casts after warnings from Clang sanitize.
+
+19. Merged patch from cbouc (GitHub #139): 4 function prototypes were missing
+PCRE2_CALL_CONVENTION in src/pcre2posix.h. All function prototypes returning
+pointers had out of place PCRE2_CALL_CONVENTION in src/pcre2.h.*. These
+produced errors when building for Windows with #define PCRE2_CALL_CONVENTION
+__stdcall.
+
+20. A negative repeat value in a pcre2test subject line was not being
+diagnosed, leading to infinite looping.
+
+21. Updated RunGrepTest to discard the warning that Bash now gives when setting
+LC_CTYPE to a bad value (because older versions didn't).
+
+22. Updated pcre2grep so that it behaves like GNU grep when matching more than
+one pattern and a later pattern matches at an earlier point in the subject when
+the matched substrings are being identified by colour or by offsets.
+
+23. Updated the PrepareRelease script so that the man page that it makes for
+the pcre2demo demonstration program is more standard and does not cause errors
+when processed by lexgrog or mandb -c (GitHub issue #160).
+
+24. The JIT compiler was updated.
 
 
 Version 10.40 15-April-2022
@@ -92,7 +205,7 @@
 being flagged as caseless, causing some matches that should have succeeded to
 fail.
 
-23. Fixed a unicode properrty matching issue in JIT. The character was not
+23. Fixed a unicode property matching issue in JIT. The character was not
 fully read in caseless matching.
 
 24. Fixed an issue affecting recursions in JIT caused by duplicated data
@@ -119,10 +232,10 @@
   honoured if chosen.
 
   prtdiff_t is signed, so use a signed type instead, and make sure
-  that an appropiate width is chosen if pointers are 64bit wide and
+  that an appropriate width is chosen if pointers are 64bit wide and
   long is not (ex: Windows 64bit).
 
-  IMHO removing the cast (and therefore the positibilty of truncation)
+  IMHO removing the cast (and therefore the possibilty of truncation)
   make the code cleaner and the fallback is likely portable enough
   with all 64-bit POSIX systems doing LP64 except for Windows.
 
@@ -173,7 +286,7 @@
 -----------------------------
 
 1. Fix invalid single character repetition issues in JIT when the repetition
-is inside a capturing bracket and the bracket is preceeded by character
+is inside a capturing bracket and the bracket is preceded by character
 literals.
 
 2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
@@ -413,7 +526,7 @@
 
 7. Added PCRE2_SUBSTITUTE_MATCHED.
 
-8. Added (?* and (?<* as synonms for (*napla: and (*naplb: to match another
+8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
 regex engine. The Perl regex folks are aware of this usage and have made a note
 about it.
 
@@ -844,7 +957,7 @@
 warnings were reported.
 
 38. Using the clang compiler with sanitizing options causes runtime complaints
-about truncation for statments such as x = ~x when x is an 8-bit value; it
+about truncation for statements such as x = ~x when x is an 8-bit value; it
 seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
 gets rid of the warnings. There were also two missing casts in pcre2test.
 
diff --git a/HACKING b/HACKING
index cad11b3..2f194db 100644
--- a/HACKING
+++ b/HACKING
@@ -8,8 +8,8 @@
 the pcre2test documentation and the comment at the head of the RunTest file.
 
 PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
-releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
-confusion with PCRE1.
+releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
+releases started at 10.00 to avoid confusion with PCRE1.
 
 
 Historical note 1
@@ -38,8 +38,8 @@
 By contrast, the code originally written by Henry Spencer (which was
 subsequently heavily modified for Perl) compiles the expression twice: once in
 a dummy mode in order to find out how much store will be needed, and then for
-real. (The Perl version probably doesn't do this any more; I'm talking about
-the original library.) The execution function operates by backtracking and
+real. (The Perl version may or may not still do this; I'm talking about the
+original library.) The execution function operates by backtracking and
 maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
 matches individual wild portions of the pattern. This is an "NFA algorithm" in
 Friedl's terminology.
@@ -151,8 +151,8 @@
 advance to check for such values. When auto-callouts are enabled, the generous
 assumption is made that there will be a callout for each pattern code unit
 (which of course is only actually true if all code units are literals) plus one
-at the end. There is a default parsed pattern vector on the system stack, but
-if this is not big enough, heap memory is used.
+at the end. A default parsed pattern vector is defined on the system stack, to
+minimize memory handling, but if this is not big enough, heap memory is used.
 
 As before, the actual compiling function is run twice, the first time to
 determine the amount of memory needed for the final compiled pattern. It
@@ -187,7 +187,7 @@
 META_CLASS_EMPTY_NOT  [^] negative empty class - ditto
 META_CLASS_END        ] end of non-empty class
 META_CLASS_NOT        [^ start non-empty negative class
-META_COMMIT           (*COMMIT)
+META_COMMIT           (*COMMIT) - no argument (see below for with argument)
 META_COND_ASSERT      (?(?assertion)
 META_DOLLAR           $ metacharacter
 META_DOT              . metacharacter
@@ -201,18 +201,18 @@
 META_PLUS             +
 META_PLUS_PLUS        ++
 META_PLUS_QUERY       +?
-META_PRUNE            (*PRUNE) - no argument
+META_PRUNE            (*PRUNE) - no argument (see below for with argument)
 META_QUERY            ?
 META_QUERY_PLUS       ?+
 META_QUERY_QUERY      ??
 META_RANGE_ESCAPED    hyphen in class range with at least one escape
 META_RANGE_LITERAL    hyphen in class range defined literally
-META_SKIP             (*SKIP) - no argument
-META_THEN             (*THEN) - no argument
+META_SKIP             (*SKIP) - no argument (see below for with argument)
+META_THEN             (*THEN) - no argument (see below for with argument)
 
 The two RANGE values occur only in character classes. They are positioned
 between two literals that define the start and end of the range. In an EBCDIC
-evironment it is necessary to know whether either of the range values was
+environment it is necessary to know whether either of the range values was
 specified as an escape. In an ASCII/Unicode environment the distinction is not
 relevant.
 
@@ -229,17 +229,16 @@
 is the length of its branch, for which OP_REVERSE must be generated.
 
 META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
-their data in the lower 16 bits of the element.
+their data in the lower 16 bits of the element. META_RECURSE is followed by an
+offset, for use in error messages.
 
 META_BACKREF is followed by an offset if the back reference group number is 10
-or more. The offsets of the first ocurrences of references to groups whose
+or more. The offsets of the first occurrences of references to groups whose
 numbers are less than 10 are put in cb->small_ref_offset[] (only the first
 occurrence is useful). On 64-bit systems this avoids using more than two parsed
 pattern elements for items such as \3. The offset is used when an error occurs
 because the reference is to a non-existent group.
 
-META_RECURSE is always followed by an offset, for use in error messages.
-
 META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
 element contains the 16-bit type and data property values, packed together.
 ESC_g and ESC_k are used only for named references - numerical ones are turned
@@ -291,9 +290,9 @@
 META_LOOKBEHIND_NA    (*naplb:  start of non-atomic lookbehind
 META_LOOKBEHINDNOT    (?<!      start of negative lookbehind
 
-The following are followed by two elements, the minimum and maximum. Repeat
-values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
-represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
+The following are followed by two elements, the minimum and maximum. The
+maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
+is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
 
 META_MINMAX           {n,m}  repeat
 META_MINMAX_PLUS      {n,m}+ repeat
@@ -347,11 +346,11 @@
 Changeable options
 ------------------
 
-The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
-others) may be changed in the middle of patterns by items such as (?i). Their
-processing is handled entirely at compile time by generating different opcodes
-for the different settings. The runtime functions do not need to keep track of
-an option's state.
+The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
+some others may be changed in the middle of patterns by items such as (?i).
+Their processing is handled entirely at compile time by generating different
+opcodes for the different settings. The runtime functions do not need to keep
+track of an option's state.
 
 PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
 are tracked and processed during the parsing pre-pass. The others are handled
@@ -372,7 +371,7 @@
 default value for LINK_SIZE is 2, except for the 32-bit library, where it can
 only be 4. The 8-bit library can be compiled to used 3-byte or 4-byte values,
 and the 16-bit library can be compiled to use 4-byte values, though this
-impairs performance. Specifing a LINK_SIZE larger than 2 for these libraries is
+impairs performance. Specifying a LINK_SIZE larger than 2 for these libraries is
 necessary only when patterns whose compiled length is greater than 65535 code
 units are going to be processed. When a LINK_SIZE value uses more than one code
 unit, the most significant unit is first.
@@ -437,7 +436,7 @@
 --------------------------
 
 Verbs with no arguments generate opcodes with no following data (as listed
-in the section above). 
+in the section above).
 
 (*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
 length in one code unit, and followed by a binary zero. The name length is
@@ -468,8 +467,8 @@
 case-equivalent code points (which is possible only in UTF mode) is handled by
 compiling a Unicode property item (see below), with the pseudo-property
 PT_CLIST. The value of this property is an offset in a vector called
-"ucd_caseless_sets" which identifies the start of a short list of equivalent
-characters, terminated by the value NOTACHAR (0xffffffff).
+"ucd_caseless_sets" which identifies the start of a short list of case
+equivalent characters, terminated by the value NOTACHAR (0xffffffff).
 
 
 Repeating single characters
@@ -546,9 +545,9 @@
 and a value. The types are a set of #defines of the form PT_xxx, and the values
 are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
 The value is relevant only for PT_GC (General Category), PT_PC (Particular
-Category), PT_SC (Script), PT_BIDICL (Bidi Class), and the pseudo-property
-PT_CLIST, which is used to identify a list of case-equivalent characters when
-there are three or more.
+Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
+and the pseudo-property PT_CLIST, which is used to identify a list of
+case-equivalent characters when there are three or more (see above).
 
 Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
 three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
@@ -666,9 +665,9 @@
 There are several opcodes that mark the end of a subpattern group. OP_KET is
 used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
 OP_KETRMAX are used for indefinite repetitions, minimally or maximally
-respectively, and OP_KETRPOS for possessive repetitions (see below for more 
+respectively, and OP_KETRPOS for possessive repetitions (see below for more
 details). All four are followed by a LINK_SIZE value giving (as a positive
-number) the offset back to the matching bracket opcode.
+number) the offset back to the matching opening bracket opcode.
 
 If a subpattern is quantified such that it is permitted to match zero times, it
 is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
@@ -719,7 +718,7 @@
 
 Forward assertions are also just like other subpatterns, but starting with one
 of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
-OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK, 
+OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
 OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
 assertion is OP_REVERSE, followed by a count of the number of characters to
 move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
@@ -828,4 +827,4 @@
 opcode are the correct length, in order to catch updating errors.
 
 Philip Hazel
-December 2021
+April 2022
diff --git a/METADATA b/METADATA
index 86227c6..3d28b1c 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update pcre
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
 name: "libpcre2"
 description: "The PCRE library is a set of functions that implement regular expression pattern matching using the same syntax and semantics as Perl 5. PCRE has its own native API, as well as a set of wrapper functions that correspond to the POSIX regular expression API. The PCRE library is free, even for building proprietary software."
 third_party {
@@ -7,16 +11,16 @@
   }
   url {
     type: GIT
-    value: "https://github.com/PhilipHazel/pcre2/"
+    value: "https://github.com/PCRE2Project/pcre2/"
   }
-  version: "pcre2-10.40"
+  version: "pcre2-10.42"
   license_type: NOTICE
   security {
     tag: "NVD-CPE2.3:cpe:/a:pcre:pcre2"
   }
   last_upgrade_date {
     year: 2022
-    month: 4
-    day: 15
+    month: 12
+    day: 13
   }
 }
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 0000000..e058d5e
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,8 @@
+module(
+    name = "pcre2",
+    version = "10.40",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "rules_cc", version = "0.0.1")
+bazel_dep(name = "bazel_skylib", version = "1.2.1")
diff --git a/Makefile.am b/Makefile.am
index b1adb6f..1a15d11 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -452,9 +452,10 @@
   src/sljit/sljitNativePPC_32.c \
   src/sljit/sljitNativePPC_64.c \
   src/sljit/sljitNativePPC_common.c \
+  src/sljit/sljitNativeRISCV_32.c \
+  src/sljit/sljitNativeRISCV_64.c \
+  src/sljit/sljitNativeRISCV_common.c \
   src/sljit/sljitNativeS390X.c \
-  src/sljit/sljitNativeSPARC_32.c \
-  src/sljit/sljitNativeSPARC_common.c \
   src/sljit/sljitNativeX86_32.c \
   src/sljit/sljitNativeX86_64.c \
   src/sljit/sljitNativeX86_common.c \
@@ -553,6 +554,17 @@
 
 ## -------- Testing ----------
 
+## If the 8-bit library is enabled, build the POSIX wrapper test program and
+## arrange for it to run.
+
+if WITH_PCRE2_8
+TESTS += pcre2posix_test
+noinst_PROGRAMS += pcre2posix_test
+pcre2posix_test_SOURCES = src/pcre2posix_test.c
+pcre2posix_test_CFLAGS = $(AM_CFLAGS)
+pcre2posix_test_LDADD = libpcre2-posix.la libpcre2-8.la
+endif # WITH_PCRE2_8
+
 ## If JIT support is enabled, arrange for the JIT test program to run.
 
 if WITH_JIT
@@ -631,15 +643,23 @@
   testdata/grepinput \
   testdata/grepinput3 \
   testdata/grepinput8 \
+  testdata/grepinputC.bz2 \
+  testdata/grepinputC.gz \
   testdata/grepinputM \
   testdata/grepinputv \
   testdata/grepinputx \
   testdata/greplist \
+  testdata/grepnot.bz2 \
   testdata/grepoutput \
   testdata/grepoutput8 \
   testdata/grepoutputC \
   testdata/grepoutputCN \
+  testdata/grepoutputCNU \
+  testdata/grepoutputCU \
+  testdata/grepoutputCbz2 \
+  testdata/grepoutputCgz \
   testdata/grepoutputN \
+  testdata/grepoutputUN \
   testdata/greppatN4 \
   testdata/testbtables \
   testdata/testinput1 \
@@ -669,6 +689,7 @@
   testdata/testinput25 \
   testdata/testinput26 \
   testdata/testinputEBC \
+  testdata/testinputheap \
   testdata/testoutput1 \
   testdata/testoutput2 \
   testdata/testoutput3 \
@@ -712,6 +733,9 @@
   testdata/testoutput25 \
   testdata/testoutput26 \
   testdata/testoutputEBC \
+  testdata/testoutputheap-16 \
+  testdata/testoutputheap-32 \
+  testdata/testoutputheap-8 \
   testdata/valgrind-jit.supp \
   testdata/wintestinput3 \
   testdata/wintestoutput3 \
@@ -738,7 +762,7 @@
 ## ------------ End of testing -------------
 
 
-# PCRE2 demonstration program. Not built automatcally. The point is that the
+# PCRE2 demonstration program. Not built automatically. The point is that the
 # users should build it themselves. So just distribute the source.
 
 EXTRA_DIST += src/pcre2demo.c
diff --git a/NEWS b/NEWS
index c9b8da2..97da19e 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,29 @@
 -------------------------
 
 
+Version 10.42 11-December-2022
+------------------------------
+
+This is an unexpectedly early release to fix a problem that was introduced in
+10.41. ChangeLog number 19 (GitHub #139) added the default definition of
+PCRE2_CALL_CONVENTION to pcre2posix.c instead of pcre2posix.h, which meant that
+programs including pcre2posix.h but not pcre2.h couldn't compile. A new test
+that checks this case has been added.
+
+A couple of other minor issues are also fixed, and a patch for an intermittent
+JIT fault is also included. See ChangeLog and the Git log.
+
+
+Version 10.41 06-December-2022
+------------------------------
+
+This is another mainly bug-fixing and code-tidying release. There is one
+significant upgrade to pcre2grep: it now behaves like GNU grep when matching
+more than one pattern and a later pattern matches at an earlier point in the
+subject when the matched substrings are being identified by colour or by
+offsets.
+
+
 Version 10.40 15-April-2022
 ---------------------------
 
diff --git a/NON-AUTOTOOLS-BUILD b/NON-AUTOTOOLS-BUILD
index 88e2f21..d83591a 100644
--- a/NON-AUTOTOOLS-BUILD
+++ b/NON-AUTOTOOLS-BUILD
@@ -69,7 +69,7 @@
      Note also that the src/config.h.generic file is created from a config.h
      that was generated by Autotools, which automatically includes settings of
      a number of macros that are not actually used by PCRE2 (for example,
-     HAVE_MEMORY_H).
+     HAVE_DLFCN_H).
 
  (2) Copy or rename the file src/pcre2.h.generic as src/pcre2.h.
 
@@ -121,6 +121,7 @@
        pcre2_substring.c
        pcre2_tables.c
        pcre2_ucd.c
+       pcre2_ucptables.c
        pcre2_valid_utf.c
        pcre2_xclass.c
 
@@ -135,7 +136,7 @@
      pcre2_jit_compile.c #includes other files from the sljit subdirectory,
      all of whose names begin with "sljit". It also #includes
      src/pcre2_jit_match.c and src/pcre2_jit_misc.c, so you should not compile
-     these yourself.
+     those yourself.
 
      Note also that the pcre2_fuzzsupport.c file contains special code that is
      useful to those who want to run fuzzing tests on the PCRE2 library. Unless
@@ -185,7 +186,11 @@
      the RunTest script. You might also like to build and run the freestanding
      JIT test program, src/pcre2_jit_test.c.
 
-(11) If you want to use the pcre2grep command, compile and link
+(11) The pcre2test program tests the POSIX wrapper library, but there is also a
+     freestanding test program in src/pcre2posix_test.c. It must be linked with
+     both the pcre2posix library and the 8-bit PCRE2 library.
+
+(12) If you want to use the pcre2grep command, compile and link
      src/pcre2grep.c; it uses only the basic 8-bit PCRE2 library (it does not
      need the pcre2posix library). If you have built the PCRE2 library with JIT
      support by defining SUPPORT_JIT in src/config.h, you can also define
@@ -306,7 +311,7 @@
 3.  Create a new, empty build directory, preferably a subdirectory of the
     source dir. For example, C:\pcre2\pcre2-xx\build.
 
-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
     Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
     to start Cmake from the Windows Start menu, as this can lead to errors.
 
@@ -373,7 +378,7 @@
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
    have been created.
 
-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
    the pcre2 source (wherein which the testdata folder resides), e.g.:
 
    set srcdir=C:\pcre2\pcre2-10.00
@@ -405,6 +410,6 @@
 z/OS file formats. The port provides an API for LE languages such as COBOL and
 for the z/OS and z/VM versions of the Rexx languages.
 
-===========================
-Last Updated: 28 April 2021
-===========================
+==============================
+Last Updated: 10 December 2022
+==============================
diff --git a/PrepareRelease b/PrepareRelease
index e7cf8db..1852c76 100755
--- a/PrepareRelease
+++ b/PrepareRelease
@@ -94,7 +94,10 @@
 perl <<"END" >pcre2demo.3
   open(IN, "../src/pcre2demo.c") || die "Failed to open src/pcre2demo.c\n";
   open(OUT, ">pcre2demo.3") || die "Failed to open pcre2demo.3\n";
-  print OUT ".\\\" Start example.\n" .
+  print OUT ".SH NAME\n" .
+            "// - A demonstration C program for PCRE2 - //\n" .
+            ".sp\n" .  
+            ".\\\" Start example.\n" .
             ".de EX\n" .
             ".  nr mE \\\\n(.f\n" .
             ".  nf\n" .
diff --git a/README b/README
index 7896944..c88acff 100644
--- a/README
+++ b/README
@@ -8,7 +8,7 @@
 now obsolete and no longer maintained. The latest release of PCRE2 is available
 in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
 
-https://github.com/PhilipHazel/pcre2/releases
+https://github.com/PCRE2Project/pcre2/releases
 
 There is a mailing list for discussion about the development of PCRE2 at
 pcre2-dev@googlegroups.com. You can subscribe by sending an email to
@@ -17,7 +17,7 @@
 You can access the archives and also subscribe or manage your subscription
 here:
 
-https://groups.google.com/pcre2-dev
+https://groups.google.com/g/pcre2-dev
 
 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@@ -375,10 +375,10 @@
   necessary to specify something like LIBS="-lncurses" as well. This is
   because, to quote the readline INSTALL, "Readline uses the termcap functions,
   but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
-  If you get error messages about missing functions tgetstr, tgetent, tputs,
-  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
-  should fix it.
+  applications which link with readline the option to choose an appropriate
+  library." If you get error messages about missing functions tgetstr, tgetent,
+  tputs, tgetflag, or tgoto, this is the problem, and linking with the ncurses
+  library should fix it.
 
 . The C99 standard defines formatting modifiers z and t for size_t and
   ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
@@ -400,17 +400,17 @@
   Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
   be created. This is normally run under valgrind or used when PCRE2 is
   compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.
 
 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
   which caused pcre2_match() to use individual blocks on the heap for
   backtracking instead of recursive function calls (which use the stack). This
-  is now obsolete since pcre2_match() was refactored always to use the heap (in
-  a much more efficient way than before). This option is retained for backwards
-  compatibility, but has no effect other than to output a warning.
+  is now obsolete because pcre2_match() was refactored always to use the heap
+  (in a much more efficient way than before). This option is retained for
+  backwards compatibility, but has no effect other than to output a warning.
 
 The "configure" script builds the following files for the basic C library:
 
@@ -438,8 +438,9 @@
 libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
 program called pcre2test. If you enabled JIT support with --enable-jit, another
 test program called pcre2_jit_test is built as well. If the 8-bit library is
-built, libpcre2-posix and the pcre2grep command are also built. Running
-"make" with the -j option may speed up compilation on multiprocessor systems.
+built, libpcre2-posix, pcre2posix_test, and the pcre2grep command are also
+built. Running "make" with the -j option may speed up compilation on
+multiprocessor systems.
 
 The command "make check" runs all the appropriate tests. Details of the PCRE2
 tests are given below in a separate section of this document. The -j option of
@@ -591,9 +592,11 @@
 
 To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
 There is another script called RunGrepTest that tests the pcre2grep command.
-When JIT support is enabled, a third test program called pcre2_jit_test is
-built. Both the scripts and all the program tests are run if you obey "make
-check". For other environments, see the instructions in NON-AUTOTOOLS-BUILD.
+When the 8-bit library is built, a test program for the POSIX wrapper, called
+pcre2posix_test, is compiled, and when JIT support is enabled, a test program
+called pcre2_jit_test is built. The scripts and the program tests are all run
+when you obey "make check". For other environments, see the instructions in
+NON-AUTOTOOLS-BUILD.
 
 The RunTest script runs the pcre2test test program (which is documented in its
 own man page) on each of the relevant testinput files in the testdata
@@ -695,7 +698,7 @@
 different code unit widths.
 
 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.
 
 Test 16 is run only when JIT support is not available. It checks that an
@@ -716,6 +719,9 @@
 Tests 24 and 25 test the experimental pattern conversion functions, without and
 with UTF support, respectively.
 
+Test 26 checks Unicode property support using tests that are generated
+automatically from the Unicode data tables.
+
 
 Character tables
 ----------------
@@ -819,6 +825,7 @@
   src/pcre2_substring.c    )
   src/pcre2_tables.c       )
   src/pcre2_ucd.c          )
+  src/pcre2_ucptables.c    )
   src/pcre2_valid_utf.c    )
   src/pcre2_xclass.c       )
 
@@ -830,6 +837,8 @@
   src/pcre2posix.h         header for the external POSIX wrapper API
   src/pcre2_internal.h     header for internal use
   src/pcre2_intmodedep.h   a mode-specific internal header
+  src/pcre2_jit_neon_inc.h header used by JIT
+  src/pcre2_jit_simd_inc.h header used by JIT
   src/pcre2_ucp.h          header for Unicode property handling
 
   sljit/*                  source files for the JIT compiler
@@ -840,6 +849,7 @@
   src/pcre2grep.c          source of a grep utility that uses PCRE2
   src/pcre2test.c          comprehensive test program
   src/pcre2_jit_test.c     JIT test program
+  src/pcre2posix_test.c    POSIX wrapper API test program
 
 (C) Auxiliary files:
 
@@ -911,4 +921,4 @@
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 15 April 2022
+Last updated: 10 December 2022
diff --git a/README.md b/README.md
index 7295ded..d3fff17 100644
--- a/README.md
+++ b/README.md
@@ -14,14 +14,14 @@
 ## Download
 
 As well as downloading from the 
-[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
 or the older, unmaintained PCRE1 library from an 
 [*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
 
 You can check out the PCRE2 source code via Git or Subversion:
 
-    git clone https://github.com/PhilipHazel/pcre2.git
-    svn co    https://github.com/PhilipHazel/pcre2.git
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git
 
 ## Contributed Ports
 
@@ -36,7 +36,7 @@
 ## Documentation
 
 You can read the PCRE2 documentation 
-[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
 
 Comparisons to Perl's regular expression semantics can be found in the
 community authored Wikipedia entry for PCRE.
diff --git a/RunGrepTest b/RunGrepTest
index 443ed76..0a00e82 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -68,6 +68,27 @@
 diff -u  /dev/null /dev/null 2>/dev/null && cf="diff -u"
 diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
 
+# Add a -a (always treat as text) if available. This was added in an attempt
+# to get more detail from an Alpine Linux test failure on GitHub.
+
+$cf -a /dev/null /dev/null 2>/dev/null && cf="$cf -a"
+
+# Some tests involve NUL characters. It seems impossible to handle them easily
+# in many operating systems. An earlier version of this script used sed to
+# translate NUL into the string ZERO, but this didn't work on Solaris (aka
+# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
+# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
+# even when using GNU sed. A user suggested using tr instead, which
+# necessitates translating to a single character. However, on (some versions
+# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
+# /usr/xpg4/bin/tr is available, it can do so, so test for that.
+
+if [ -x /usr/xpg4/bin/tr ] ; then
+  tr=/usr/xpg4/bin/tr
+else
+  tr=tr
+fi
+
 # If this test is being run from "make check", $srcdir will be set. If not, set
 # it to the current or parent directory, whichever one contains the test data.
 # Subsequently, we run most of the pcre2grep tests in the source directory so
@@ -254,7 +275,7 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 36 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude=grepinputM --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include='grepinput[^C]' --exclude 'grepinput$' --exclude=grepinput8 --exclude=grepinputM --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 37 -----------------------------" >>testtrygrep
@@ -296,7 +317,10 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 46 ------------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -e 'unopened)' -e abc ./testdata/grepinput) >>testtrygrep 2>&1
 (cd $srcdir; $valgrind $vjs $pcre2grep -eabc -e '(unclosed' ./testdata/grepinput) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep -eabc -e xyz -e '[unclosed' ./testdata/grepinput) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep --regex=123 -eabc -e xyz -e '[unclosed' ./testdata/grepinput) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 47 ------------------------------" >>testtrygrep
@@ -339,11 +363,11 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 56 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -c lazy ./testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -c --exclude=grepinputC lazy ./testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 57 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -c -l lazy ./testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -c -l --exclude=grepinputC lazy ./testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 58 -----------------------------" >>testtrygrep
@@ -367,7 +391,7 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 63 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $pcre2grep --recursion-limit=1000 --no-jit -M 'This is a file(.|\R)*file.' ./testdata/grepinput) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $pcre2grep --recursion-limit=1K --no-jit -M 'This is a file(.|\R)*file.' ./testdata/grepinput) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 64 ------------------------------" >>testtrygrep
@@ -506,25 +530,25 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 96 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' --exclude=grepinputM 'fox' ./test* | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' --exclude=grepinput[MC] 'fox' ./test* | sort) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 97 -----------------------------" >>testtrygrep
 echo "grepinput$" >testtemp1grep
 echo "grepinput8" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude=grepinput[MC] --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 98 -----------------------------" >>testtrygrep
 echo "grepinput$" >testtemp1grep
 echo "grepinput8" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --exclude=grepinputM --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --exclude=grepinput[MC] --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 99 -----------------------------" >>testtrygrep
 echo "grepinput$" >testtemp1grep
 echo "grepinput8" >testtemp2grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude=grepinput[MC] --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 100 ------------------------------" >>testtrygrep
@@ -566,7 +590,7 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 109 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -cq lazy ./testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -cq --exclude=grepinputC lazy ./testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 110 -----------------------------" >>testtrygrep
@@ -582,27 +606,27 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 113 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep --total-count 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --total-count --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 114 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -tc 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -tc --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 115 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -tlc 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -tlc --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 116 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep --exclude=grepinputM -th 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --exclude=grepinput[MC] -th 'the' testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 117 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -tch 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -tch --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 118 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -tL 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -tL --exclude=grepinputC 'the' testdata/grepinput*) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 119 -----------------------------" >>testtrygrep
@@ -613,6 +637,18 @@
 echo "---------------------------- Test 120 ------------------------------" >>testtrygrep
 (cd $srcdir; $valgrind $vjs $pcre2grep -HO '$0:$2$1$3' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
 echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -m 1 -O '$0:$a$b$e$f$r$t$v' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -HO '${X}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -HO 'XX$' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -O '$x{12345678}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -O '$x{123Z' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --output '$x{1234}' '(\w+) binary (\w+)(\.)?' ./testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 121 -----------------------------" >>testtrygrep
 (cd $srcdir; $valgrind $vjs $pcre2grep -F '\E and (regex)' testdata/grepinputv) >>testtrygrep
@@ -646,19 +682,24 @@
 echo "RC=$?" >>testtrygrep
 $valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
 echo "RC=$?" >>testtrygrep
+GREP_COLORS='ms=1;20' $valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
+echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
 printf 'Next line pattern has binary zero\nABC\0XYZ\n' >testtemp1grep
 printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
 $valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
 echo "RC=$?" >>testtrygrep
+printf 'Next line pattern is erroneous.\n^abc)(xy' >testtemp1grep
+$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
 (cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep -m1M -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
@@ -682,9 +723,139 @@
 echo "RC=$?" >>testtrygrep
 
 echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
+(cd $srcdir; $valgrind $vjs $pcre2grep --max-count=1 -nH -O '=$x{41}$x423$o{103}$o1045=' 'fox' -) <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep
 
+echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 136 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -m1MK -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --max-count=1MK -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 137 -----------------------------" >>testtrygrep
+printf 'Last line\nhas no newline' >testtemp1grep
+$valgrind $vjs $pcre2grep -A1 Last testtemp1grep >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 138 -----------------------------" >>testtrygrep
+printf 'AbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\nAbC\n' >testtemp1grep
+$valgrind $vjs $pcre2grep --no-jit --heap-limit=0 b testtemp1grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 139 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --line-buffered 'fox' testdata/grepinputv) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 140 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 -A1 'brown' testdata/grepinputv) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 141 -----------------------------" >>testtrygrep
+printf "$srcdir/testdata/grepinputv\n-\n" >testtemp1grep
+printf 'This is a line from stdin.' >testtemp2grep
+$valgrind $vjs $pcre2grep --file-list testtemp1grep "line from stdin" <testtemp2grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 142 -----------------------------" >>testtrygrep
+printf "/does/not/exist\n" >testtemp1grep
+printf 'This is a line from stdin.' >testtemp2grep
+$valgrind $vjs $pcre2grep --file-list testtemp1grep "line from stdin" >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 143 -----------------------------" >>testtrygrep
+printf 'fox|cat' >testtemp1grep
+$valgrind $vjs $pcre2grep -f - $srcdir/testdata/grepinputv <testtemp1grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 144 -----------------------------" >>testtrygrep
+$valgrind $vjs $pcre2grep -f /non/exist $srcdir/testdata/grepinputv >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 145 -----------------------------" >>testtrygrep
+printf '*meta*\rdog.' >testtemp1grep
+$valgrind $vjs $pcre2grep -Ncr -F -f testtemp1grep $srcdir/testdata/grepinputv >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 146 -----------------------------" >>testtrygrep
+printf 'A123B' >testtemp1grep
+$valgrind $vjs $pcre2grep -H -e '123|fox' - <testtemp1grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep -h -e '123|fox' - $srcdir/testdata/grepinputv <testtemp1grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep - $srcdir/testdata/grepinputv <testtemp1grep >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 147 -----------------------------" >>testtrygrep
+$valgrind $vjs $pcre2grep -e '123|fox' -- -nonfile >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 148 -----------------------------" >>testtrygrep
+$valgrind $vjs $pcre2grep --nonexist >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep -n-n-bad >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --context >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --only-matching --output=xx >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=badvalue >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --newline=badvalue >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep -d badvalue >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep -D badvalue >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --buffer-size=0 >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --exclude '(badpat' abc /dev/null >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --exclude-from /non/exist abc /dev/null >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --include-from /non/exist abc /dev/null >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --file-list=/non/exist abc /dev/null >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 149 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --binary-files=binary "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --binary-files=wrong "dog" ./testdata/grepbinary) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
+
+# This test runs the code that tests locale support. However, on some systems
+# (e.g. Alpine Linux) there is no locale support and running this test just
+# generates a "no match" result. Therefore, we test for locale support, and if
+# it is found missing, we pretend that the test has run as expected so that the
+# output matches.
+
+echo "---------------------------- Test 150 -----------------------------" >>testtrygrep
+which locale >/dev/null 2>&1
+if [ $? -ne 0 ]; then
+  echo "pcre2grep: Failed to set locale badlocale (obtained from LC_CTYPE)" >>testtrygrep
+  echo "RC=2" >>testtrygrep
+else
+
+  (cd $srcdir; unset LC_ALL; env LC_CTYPE=badlocale $valgrind $vjs $pcre2grep abc /dev/null) >>testtrygrep 2>&1
+  echo "RC=$?" >>testtrygrep
+fi
+
+echo "---------------------------- Test 151 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --colour=always -e this -e The -e 'The wo' testdata/grepinputv) >>testtrygrep
+
+
+
+
 # Now compare the results.
 
 $cf $srcdir/testdata/grepoutput testtrygrep
@@ -742,9 +913,11 @@
 
 printf '%c--------------------------- Test N1 ------------------------------\r\n' - >testtrygrep
 $valgrind $vjs $pcre2grep -n -N CR "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep -B1 -n -N CR "^def" testNinputgrep >>testtrygrep
 
 printf '%c--------------------------- Test N2 ------------------------------\r\n' - >>testtrygrep
 $valgrind $vjs $pcre2grep -n --newline=crlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep -B1 -n -N CRLF "^ghi" testNinputgrep >>testtrygrep
 
 printf '%c--------------------------- Test N3 ------------------------------\r\n' - >>testtrygrep
 pattern=`printf 'def\rjkl'`
@@ -755,34 +928,39 @@
 
 printf '%c--------------------------- Test N5 ------------------------------\r\n' - >>testtrygrep
 $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
+$valgrind $vjs $pcre2grep -B1 -n --newline=any "^def" testNinputgrep >>testtrygrep
 
 printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
 $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
-
-# This next test involves NUL characters. It seems impossible to handle them
-# easily in many operating systems. An earlier version of this script used sed
-# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
-# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
-# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
-# even when using GNU sed. A user suggested using tr instead, which
-# necessitates translating to a single character (@). However, on (some
-# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
-# /usr/xpg4/bin/tr is available, it can do so, so test for that.
-
-if [ -x /usr/xpg4/bin/tr ] ; then
-  tr=/usr/xpg4/bin/tr
-else
-  tr=tr
-fi
+$valgrind $vjs $pcre2grep -B1 -n --newline=anycrlf "^jkl" testNinputgrep >>testtrygrep
 
 printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
-printf 'abc\0def' >testNinputgrep
+printf 'xyz\0abc\0def' >testNinputgrep
 $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
+$valgrind $vjs $pcre2grep -B1 -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
 echo "" >>testtrygrep
 
 $cf $srcdir/testdata/grepoutputN testtrygrep
 if [ $? != 0 ] ; then exit 1; fi
 
+
+# These newline tests need UTF support.
+
+if [ $utf8 -ne 0 ] ; then
+  echo "Testing pcre2grep newline settings with UTF-8 features"
+
+  printf '%c--------------------------- Test UN1 ------------------------------\r\n' - >testtrygrep
+  printf 'abc\341\210\264def\nxyz' >testNinputgrep
+  $valgrind $vjs $pcre2grep -nau --newline=anycrlf "^(abc|def)" testNinputgrep >>testtrygrep
+  echo "" >>testtrygrep
+
+  $cf $srcdir/testdata/grepoutputUN testtrygrep
+  if [ $? != 0 ] ; then exit 1; fi
+else
+  echo "Skipping pcre2grep newline UTF-8 tests: no UTF-8 support in PCRE2 library"
+fi
+
+
 # If pcre2grep supports script callouts, run some tests on them. It is possible
 # to restrict these callouts to the non-fork case, either for security, or for
 # environments that do not support fork(). This is handled by comparing to a
@@ -793,20 +971,59 @@
   $valgrind $vjs $pcre2grep '(T)(..(.))(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4) ($14) ($0)")()' $srcdir/testdata/grepinputv >testtrygrep
   $valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
   $valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
+  $valgrind $vjs $pcre2grep '(T)(?C"/bin/echo|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
   $valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
   $valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep
 
   if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
+    nonfork=1
     $cf $srcdir/testdata/grepoutputCN testtrygrep
   else
+    nonfork=0
     $cf $srcdir/testdata/grepoutputC testtrygrep
   fi
-
   if [ $? != 0 ] ; then exit 1; fi
+
+  # These callout tests need UTF support.
+
+  if [ $utf8 -ne 0 ] ; then
+    echo "Testing pcre2grep script callout with UTF-8 features"
+    $valgrind $vjs $pcre2grep -u '(T)(?C"|$0:$x{a6}$n")' $srcdir/testdata/grepinputv >testtrygrep
+    $valgrind $vjs $pcre2grep -u '(T)(?C"/bin/echo|$0:$x{a6}$n")' $srcdir/testdata/grepinputv >>testtrygrep
+
+    if [ $nonfork = 1 ] ; then
+      $cf $srcdir/testdata/grepoutputCNU testtrygrep
+    else
+      $cf $srcdir/testdata/grepoutputCU testtrygrep
+    fi
+    if [ $? != 0 ] ; then exit 1; fi
+  fi
 else
   echo "Script callouts are not supported"
 fi
 
+
+# Test reading .gz and .bz2 files when supported.
+
+if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q '\.gz are read using zlib'; then
+  echo "Testing reading .gz file"
+  $valgrind $vjs $pcre2grep 'one|two' $srcdir/testdata/grepinputC.gz >testtrygrep
+  echo "RC=$?" >>testtrygrep
+  $cf $srcdir/testdata/grepoutputCgz testtrygrep
+  if [ $? != 0 ] ; then exit 1; fi
+fi
+
+if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q '\.bz2 are read using bzlib2'; then
+  echo "Testing reading .bz2 file"
+  $valgrind $vjs $pcre2grep 'one|two' $srcdir/testdata/grepinputC.bz2 >testtrygrep
+  echo "RC=$?" >>testtrygrep
+  $valgrind $vjs $pcre2grep 'one|two' $srcdir/testdata/grepnot.bz2 >>testtrygrep
+  echo "RC=$?" >>testtrygrep
+  $cf $srcdir/testdata/grepoutputCbz2 testtrygrep
+  if [ $? != 0 ] ; then exit 1; fi
+fi
+
+
 # Finally, some tests to exercise code that is not tested above, just to be
 # sure that it runs OK. Doing this improves the coverage statistics. The output
 # is not checked.
@@ -816,6 +1033,9 @@
 checkspecial '-xxxxx' 2
 checkspecial '--help' 0
 checkspecial '--line-buffered --colour=auto abc /dev/null' 1
+checkspecial '--line-buffered --color abc /dev/null' 1
+checkspecial '-dskip abc .' 1
+checkspecial '-Dread -Dskip abc /dev/null' 1
 
 # Clean up local working files
 rm -f testNinputgrep teststderrgrep testtrygrep testtemp1grep testtemp2grep
diff --git a/RunTest b/RunTest
index 9b67870..ae32f5d 100755
--- a/RunTest
+++ b/RunTest
@@ -17,8 +17,16 @@
 # individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
 # end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
 # runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
-# except test 10. Whatever order the arguments are in, the tests are always run
-# in numerical order.
+# except test 10. Whatever order the arguments are in, these tests are always
+# run in numerical order.
+#
+# If no specific tests are selected (which is the case when this script is run
+# via 'make check') the default is to run all the numbered tests.
+#
+# There may also be named (as well as numbered) tests for special purposes. At
+# present there is just one, called "heap". This test's output contains the
+# sizes of heap frames and frame vectors, which depend on the environment. It
+# is therefore not run unless explicitly requested.
 #
 # Inappropriate tests are automatically skipped (with a comment to say so). For
 # example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
@@ -82,6 +90,7 @@
 title25="Test 25: UTF pattern conversion tests"
 title26="Test 26: Auto-generated unicode property tests"
 maxtest=26
+titleheap="Test 'heap': Environment-specific heap tests"
 
 if [ $# -eq 1 -a "$1" = "list" ]; then
   echo $title0
@@ -111,6 +120,11 @@
   echo $title24
   echo $title25
   echo $title26
+  echo ""
+  echo $titleheap
+  echo ""
+  echo "Numbered tests are automatically run if nothing selected."
+  echo "Named tests must be explicitly selected."
   exit 0
 fi
 
@@ -241,6 +255,7 @@
 do24=no
 do25=no
 do26=no
+doheap=no
 
 while [ $# -gt 0 ] ; do
   case $1 in
@@ -271,6 +286,7 @@
    24) do24=yes;;
    25) do25=yes;;
    26) do26=yes;;
+ heap) doheap=yes;;
    -8) arg8=yes;;
   -16) arg16=yes;;
   -32) arg32=yes;;
@@ -412,8 +428,8 @@
   fi
 fi
 
-# If no specific tests were requested, select all. Those that are not
-# relevant will be automatically skipped.
+# If no specific tests were requested, select all the numbered tests. Those
+# that are not relevant will be automatically skipped.
 
 if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
      $do4  = no -a $do5  = no -a $do6  = no -a $do7  = no -a \
@@ -421,7 +437,7 @@
      $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
      $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
      $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
-     $do24 = no -a $do25 = no -a $do26 = no \
+     $do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
    ]; then
   do0=yes
   do1=yes
@@ -640,7 +656,7 @@
   # Test of internal offsets and code sizes. This test is run only when there
   # is UTF/UCP support. The actual tests are mostly the same as in some of the
   # above, but in this test we inspect some offsets and sizes. This is a
-  # doublecheck for the maintainer, just in case something changes unexpectely.
+  # doublecheck for the maintainer, just in case something changes unexpectedly.
   # The output from this test is different in 8-bit, 16-bit, and 32-bit modes
   # and for different link sizes, so there are different output files for each
   # mode and link size.
@@ -882,6 +898,15 @@
     fi
   fi
 
+  # Manually selected heap tests - output may vary in different environments,
+  # which is why that are not automatically run.
+
+  if [ $doheap = yes ] ; then
+    echo $titleheap
+    $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
+    checkresult $? heap-$bits ""
+  fi
+
 # End of loop for 8/16/32-bit tests
 done
 
diff --git a/RunTest.bat b/RunTest.bat
index 791f265..401b95a 100644
--- a/RunTest.bat
+++ b/RunTest.bat
Binary files differ
diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel
new file mode 100644
index 0000000..4ce2c8c
--- /dev/null
+++ b/WORKSPACE.bazel
@@ -0,0 +1 @@
+# See MODULE.bazel
diff --git a/configure.ac b/configure.ac
index 9b4d796..ce5bda3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -9,15 +9,15 @@
 dnl be defined as -RC2, for example. For real releases, it should be empty.
 
 m4_define(pcre2_major, [10])
-m4_define(pcre2_minor, [40])
+m4_define(pcre2_minor, [42])
 m4_define(pcre2_prerelease, [])
-m4_define(pcre2_date, [2022-04-14])
+m4_define(pcre2_date, [2022-12-11])
 
 # Libtool shared library interface versions (current:revision:age)
-m4_define(libpcre2_8_version,     [11:0:11])
-m4_define(libpcre2_16_version,    [11:0:11])
-m4_define(libpcre2_32_version,    [11:0:11])
-m4_define(libpcre2_posix_version, [3:2:0])
+m4_define(libpcre2_8_version,     [11:2:11])
+m4_define(libpcre2_16_version,    [11:2:11])
+m4_define(libpcre2_32_version,    [11:2:11])
+m4_define(libpcre2_posix_version, [3:4:0])
 
 # NOTE: The CMakeLists.txt file searches for the above variables in the first
 # 50 lines of this file. Please update that if the variables above are moved.
@@ -67,6 +67,8 @@
 LT_INIT([win32-dll])
 AC_PROG_LN_S
 
+AC_SYS_LARGEFILE
+
 # Check for GCC visibility feature
 
 PCRE2_VISIBILITY
diff --git a/doc/html/NON-AUTOTOOLS-BUILD.txt b/doc/html/NON-AUTOTOOLS-BUILD.txt
index 88e2f21..d83591a 100644
--- a/doc/html/NON-AUTOTOOLS-BUILD.txt
+++ b/doc/html/NON-AUTOTOOLS-BUILD.txt
@@ -69,7 +69,7 @@
      Note also that the src/config.h.generic file is created from a config.h
      that was generated by Autotools, which automatically includes settings of
      a number of macros that are not actually used by PCRE2 (for example,
-     HAVE_MEMORY_H).
+     HAVE_DLFCN_H).
 
  (2) Copy or rename the file src/pcre2.h.generic as src/pcre2.h.
 
@@ -121,6 +121,7 @@
        pcre2_substring.c
        pcre2_tables.c
        pcre2_ucd.c
+       pcre2_ucptables.c
        pcre2_valid_utf.c
        pcre2_xclass.c
 
@@ -135,7 +136,7 @@
      pcre2_jit_compile.c #includes other files from the sljit subdirectory,
      all of whose names begin with "sljit". It also #includes
      src/pcre2_jit_match.c and src/pcre2_jit_misc.c, so you should not compile
-     these yourself.
+     those yourself.
 
      Note also that the pcre2_fuzzsupport.c file contains special code that is
      useful to those who want to run fuzzing tests on the PCRE2 library. Unless
@@ -185,7 +186,11 @@
      the RunTest script. You might also like to build and run the freestanding
      JIT test program, src/pcre2_jit_test.c.
 
-(11) If you want to use the pcre2grep command, compile and link
+(11) The pcre2test program tests the POSIX wrapper library, but there is also a
+     freestanding test program in src/pcre2posix_test.c. It must be linked with
+     both the pcre2posix library and the 8-bit PCRE2 library.
+
+(12) If you want to use the pcre2grep command, compile and link
      src/pcre2grep.c; it uses only the basic 8-bit PCRE2 library (it does not
      need the pcre2posix library). If you have built the PCRE2 library with JIT
      support by defining SUPPORT_JIT in src/config.h, you can also define
@@ -306,7 +311,7 @@
 3.  Create a new, empty build directory, preferably a subdirectory of the
     source dir. For example, C:\pcre2\pcre2-xx\build.
 
-4.  Run cmake-gui from the Shell envirornment of your build tool, for example,
+4.  Run cmake-gui from the Shell environment of your build tool, for example,
     Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
     to start Cmake from the Windows Start menu, as this can lead to errors.
 
@@ -373,7 +378,7 @@
 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
    have been created.
 
-2. Edit RunTest.bat to indentify the full or relative location of
+2. Edit RunTest.bat to identify the full or relative location of
    the pcre2 source (wherein which the testdata folder resides), e.g.:
 
    set srcdir=C:\pcre2\pcre2-10.00
@@ -405,6 +410,6 @@
 z/OS file formats. The port provides an API for LE languages such as COBOL and
 for the z/OS and z/VM versions of the Rexx languages.
 
-===========================
-Last Updated: 28 April 2021
-===========================
+==============================
+Last Updated: 10 December 2022
+==============================
diff --git a/doc/html/README.txt b/doc/html/README.txt
index 7896944..c88acff 100644
--- a/doc/html/README.txt
+++ b/doc/html/README.txt
@@ -8,7 +8,7 @@
 now obsolete and no longer maintained. The latest release of PCRE2 is available
 in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
 
-https://github.com/PhilipHazel/pcre2/releases
+https://github.com/PCRE2Project/pcre2/releases
 
 There is a mailing list for discussion about the development of PCRE2 at
 pcre2-dev@googlegroups.com. You can subscribe by sending an email to
@@ -17,7 +17,7 @@
 You can access the archives and also subscribe or manage your subscription
 here:
 
-https://groups.google.com/pcre2-dev
+https://groups.google.com/g/pcre2-dev
 
 Please read the NEWS file if you are upgrading from a previous release. The
 contents of this README file are:
@@ -375,10 +375,10 @@
   necessary to specify something like LIBS="-lncurses" as well. This is
   because, to quote the readline INSTALL, "Readline uses the termcap functions,
   but does not link with the termcap or curses library itself, allowing
-  applications which link with readline the to choose an appropriate library."
-  If you get error messages about missing functions tgetstr, tgetent, tputs,
-  tgetflag, or tgoto, this is the problem, and linking with the ncurses library
-  should fix it.
+  applications which link with readline the option to choose an appropriate
+  library." If you get error messages about missing functions tgetstr, tgetent,
+  tputs, tgetflag, or tgoto, this is the problem, and linking with the ncurses
+  library should fix it.
 
 . The C99 standard defines formatting modifiers z and t for size_t and
   ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
@@ -400,17 +400,17 @@
   Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
   be created. This is normally run under valgrind or used when PCRE2 is
   compiled with address sanitizing enabled. It calls the fuzzing function and
-  outputs information about it is doing. The input strings are specified by
-  arguments: if an argument starts with "=" the rest of it is a literal input
-  string. Otherwise, it is assumed to be a file name, and the contents of the
-  file are the test string.
+  outputs information about what it is doing. The input strings are specified
+  by arguments: if an argument starts with "=" the rest of it is a literal
+  input string. Otherwise, it is assumed to be a file name, and the contents
+  of the file are the test string.
 
 . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
   which caused pcre2_match() to use individual blocks on the heap for
   backtracking instead of recursive function calls (which use the stack). This
-  is now obsolete since pcre2_match() was refactored always to use the heap (in
-  a much more efficient way than before). This option is retained for backwards
-  compatibility, but has no effect other than to output a warning.
+  is now obsolete because pcre2_match() was refactored always to use the heap
+  (in a much more efficient way than before). This option is retained for
+  backwards compatibility, but has no effect other than to output a warning.
 
 The "configure" script builds the following files for the basic C library:
 
@@ -438,8 +438,9 @@
 libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
 program called pcre2test. If you enabled JIT support with --enable-jit, another
 test program called pcre2_jit_test is built as well. If the 8-bit library is
-built, libpcre2-posix and the pcre2grep command are also built. Running
-"make" with the -j option may speed up compilation on multiprocessor systems.
+built, libpcre2-posix, pcre2posix_test, and the pcre2grep command are also
+built. Running "make" with the -j option may speed up compilation on
+multiprocessor systems.
 
 The command "make check" runs all the appropriate tests. Details of the PCRE2
 tests are given below in a separate section of this document. The -j option of
@@ -591,9 +592,11 @@
 
 To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
 There is another script called RunGrepTest that tests the pcre2grep command.
-When JIT support is enabled, a third test program called pcre2_jit_test is
-built. Both the scripts and all the program tests are run if you obey "make
-check". For other environments, see the instructions in NON-AUTOTOOLS-BUILD.
+When the 8-bit library is built, a test program for the POSIX wrapper, called
+pcre2posix_test, is compiled, and when JIT support is enabled, a test program
+called pcre2_jit_test is built. The scripts and the program tests are all run
+when you obey "make check". For other environments, see the instructions in
+NON-AUTOTOOLS-BUILD.
 
 The RunTest script runs the pcre2test test program (which is documented in its
 own man page) on each of the relevant testinput files in the testdata
@@ -695,7 +698,7 @@
 different code unit widths.
 
 Test 15 contains a number of tests that must not be run with JIT. They check,
-among other non-JIT things, the match-limiting features of the intepretive
+among other non-JIT things, the match-limiting features of the interpretive
 matcher.
 
 Test 16 is run only when JIT support is not available. It checks that an
@@ -716,6 +719,9 @@
 Tests 24 and 25 test the experimental pattern conversion functions, without and
 with UTF support, respectively.
 
+Test 26 checks Unicode property support using tests that are generated
+automatically from the Unicode data tables.
+
 
 Character tables
 ----------------
@@ -819,6 +825,7 @@
   src/pcre2_substring.c    )
   src/pcre2_tables.c       )
   src/pcre2_ucd.c          )
+  src/pcre2_ucptables.c    )
   src/pcre2_valid_utf.c    )
   src/pcre2_xclass.c       )
 
@@ -830,6 +837,8 @@
   src/pcre2posix.h         header for the external POSIX wrapper API
   src/pcre2_internal.h     header for internal use
   src/pcre2_intmodedep.h   a mode-specific internal header
+  src/pcre2_jit_neon_inc.h header used by JIT
+  src/pcre2_jit_simd_inc.h header used by JIT
   src/pcre2_ucp.h          header for Unicode property handling
 
   sljit/*                  source files for the JIT compiler
@@ -840,6 +849,7 @@
   src/pcre2grep.c          source of a grep utility that uses PCRE2
   src/pcre2test.c          comprehensive test program
   src/pcre2_jit_test.c     JIT test program
+  src/pcre2posix_test.c    POSIX wrapper API test program
 
 (C) Auxiliary files:
 
@@ -911,4 +921,4 @@
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 15 April 2022
+Last updated: 10 December 2022
diff --git a/doc/html/pcre2_compile.html b/doc/html/pcre2_compile.html
index f6485f2..615e090 100644
--- a/doc/html/pcre2_compile.html
+++ b/doc/html/pcre2_compile.html
@@ -92,8 +92,18 @@
 function.
 </P>
 <P>
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the <i>errorcode</i> argument to the the
+<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
+error was encountered is returned via the <i>erroroffset</i> argument.
+</P>
+<P>
+If there is no error, the value passed via <i>errorcode</i> returns the message
+"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
+via <i>erroroffset</i> is zero.
 </P>
 <P>
 There is a complete description of the PCRE2 native API, with more detail on
diff --git a/doc/html/pcre2_match_data_create_from_pattern.html b/doc/html/pcre2_match_data_create_from_pattern.html
index 4836474..db58ab9 100644
--- a/doc/html/pcre2_match_data_create_from_pattern.html
+++ b/doc/html/pcre2_match_data_create_from_pattern.html
@@ -33,7 +33,7 @@
 vector" (ovector) within the match data block, and are used to identify the
 matched string and any captured substrings when matching with
 <b>pcre2_match()</b>. If you are using <b>pcre2_dfa_match()</b>, which uses the
-outut vector in a different way, you should use <b>pcre2_match_data_create()</b>
+output vector in a different way, you should use <b>pcre2_match_data_create()</b>
 instead of this function.
 </P>
 <P>
diff --git a/doc/html/pcre2_serialize_decode.html b/doc/html/pcre2_serialize_decode.html
index cff6e6c..618ffa9 100644
--- a/doc/html/pcre2_serialize_decode.html
+++ b/doc/html/pcre2_serialize_decode.html
@@ -48,7 +48,7 @@
   PCRE2_ERROR_BADDATA   <i>number_of_codes</i> is zero or less
   PCRE2_ERROR_BADMAGIC  mismatch of id bytes in <i>bytes</i>
   PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
   PCRE2_ERROR_NULL      <i>codes</i> or <i>bytes</i> is NULL
 </pre>
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
index 047e242..d5ffd7c 100644
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@@ -1017,7 +1017,7 @@
 documentation for more details). If the limit is reached, the negative error
 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
 is built; if it is not, the default is set very large and is essentially
-"unlimited".
+unlimited.
 </P>
 <P>
 A value for the heap limit may also be supplied by an item at the start of a
@@ -1030,19 +1030,17 @@
 limit is set, less than the default.
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack for recording backtracking points. The more nested backtracking points
-there are (that is, the deeper the search tree), the more memory is needed.
-Heap memory is used only if the initial vector is too small. If the heap limit
-is set to a value less than 21 (in particular, zero) no heap memory will be
-used. In this case, only patterns that do not have a lot of nested backtracking
-can be successfully processed.
+The <b>pcre2_match()</b> function always needs some heap memory, so setting a
+value of zero guarantees a "heap limit exceeded" error. Details of how
+<b>pcre2_match()</b> uses the heap are given in the
+<a href="pcre2perform.html"><b>pcre2perform</b></a>
+documentation.
 </P>
 <P>
-Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
-when processing pattern recursions, lookarounds, or atomic groups, and only if
-this is not big enough is heap memory used. In this case, too, setting a value
-of zero disables the use of the heap.
+For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
 <br>
 <br>
 <b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
@@ -1089,10 +1087,10 @@
 <br>
 <br>
 This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
-Each time a nested backtracking point is passed, a new memory "frame" is used
+Each time a nested backtracking point is passed, a new memory frame is used
 to remember the state of matching at that point. Thus, this parameter
 indirectly limits the amount of memory that is used in a match. However,
-because the size of each memory "frame" depends on the number of capturing
+because the size of each memory frame depends on the number of capturing
 parentheses, the actual memory limit varies from pattern to pattern. This limit
 was more useful in versions before 10.30, where function recursion was used for
 backtracking.
@@ -1383,8 +1381,7 @@
 NULL immediately. Otherwise, the variables to which these point are set to an
 error code and an offset (number of code units) within the pattern,
 respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
-error has occurred. The values are not defined when compilation is successful
-and <b>pcre2_compile()</b> returns a non-NULL value.
+error has occurred.
 </P>
 <P>
 There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
@@ -1399,15 +1396,18 @@
 message"
 <a href="#geterrormessage">below)</a>
 should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
-for both positive and negative error codes in <b>pcre2.h</b>.
+for both positive and negative error codes in <b>pcre2.h</b>. When compilation
+is successful <i>errorcode</i> is set to a value that returns the message "no
+error" if passed to <b>pcre2_get_error_message()</b>.
 </P>
 <P>
 The value returned in <i>erroroffset</i> is an indication of where in the
-pattern the error occurred. It is not necessarily the furthest point in the
-pattern that was read. For example, after the error "lookbehind assertion is
-not fixed length", the error offset points to the start of the failing
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
-first code unit of the failing character.
+pattern an error occurred. When there is no error, zero is returned. A non-zero
+value is not necessarily the furthest point in the pattern that was read. For
+example, after the error "lookbehind assertion is not fixed length", the error
+offset points to the start of the failing assertion. For an invalid UTF-8 or
+UTF-16 string, the offset is that of the first code unit of the failing
+character.
 </P>
 <P>
 Some errors are not detected until the whole pattern has been scanned; in these
@@ -2543,7 +2543,9 @@
 A minimum of at least 1 pair is imposed by <b>pcre2_match_data_create()</b>, so
 it is always possible to return the overall matched string in the case of
 <b>pcre2_match()</b> or the longest match in the case of
-<b>pcre2_dfa_match()</b>.
+<b>pcre2_dfa_match()</b>. The maximum number of pairs is 65535; if the the first
+argument of <b>pcre2_match_data_create()</b> is greater than this, 65535 is
+used.
 </P>
 <P>
 The second argument of <b>pcre2_match_data_create()</b> is a pointer to a
@@ -3146,11 +3148,11 @@
 <pre>
   PCRE2_ERROR_NOMEMORY
 </pre>
-If a pattern contains many nested backtracking points, heap memory is used to
-remember them. This error is given when the memory allocation function (default
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+Heap memory is used to remember backgracking points. This error is given when
+the memory allocation function (default or custom) fails. Note that a different
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
 <pre>
   PCRE2_ERROR_NULL
 </pre>
@@ -4018,9 +4020,9 @@
 </P>
 <br><a name="SEC42" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 14 December 2021
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/html/pcre2build.html b/doc/html/pcre2build.html
index 0d12155..07472d1 100644
--- a/doc/html/pcre2build.html
+++ b/doc/html/pcre2build.html
@@ -284,12 +284,11 @@
 counting is done differently).
 </P>
 <P>
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The <b>pcre2_match()</b> function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 <a href="pcre2api.html"><b>pcre2api</b></a>
 documentation. The default limit (in effect unlimited) is 20 million. You can
 change this by a setting such as
@@ -609,16 +608,16 @@
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC26" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 08 December 2021
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index b3252d3..29ab031 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@@ -71,13 +71,15 @@
 <pre>
   pcre2grep some-pattern file1 - file3
 </pre>
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how <b>pcre2grep</b> behaves. In
-particular, the <b>-M</b> option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-<b>-N</b> (<b>--newline</b>) option.
+However, there are options that can change how <b>pcre2grep</b> behaves. For
+example, the <b>-M</b> option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the <b>-N</b>
+(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
+not file names are shown, and the <b>-Z</b> option changes the file name
+terminator to a zero byte.
 </P>
 <P>
 The amount of memory used for buffering files that are being scanned is
@@ -106,19 +108,24 @@
 <P>
 By default, as soon as one pattern matches a line, no further patterns are
 considered. However, if <b>--colour</b> (or <b>--color</b>) is used to colour the
-matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
-<b>--line-offsets</b> is used to output only the part of the line that matched
-(either shown literally, or as an offset), scanning resumes immediately
-following the match, so that further matches on the same line can be found. If
-there are multiple patterns, they are all tried on the remainder of the line,
-but patterns that follow the one that matched are not tried on the earlier
-matched part of the line.
+matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>,
+<b>--line-offsets</b>, or <b>--output</b> is used to output only the part of the
+line that matched (either shown literally, or as an offset), the behaviour is
+different. In this situation, all the patterns are applied to the line. If
+there is more than one match, the one that begins nearest to the start of the
+subject is processed; if there is more than one match at that position, the one
+with the longest matching substring is processed; if the matching substrings
+are equal, the first match found is processed.
 </P>
 <P>
-This behaviour means that the order in which multiple patterns are specified
-can affect the output when one of the above options is used. This is no longer
-the same behaviour as GNU grep, which now manages to display earlier matches
-for later patterns (as long as there is no overlap).
+Scanning with all the patterns resumes immediately following the match, so that
+later matches on the same line can be found. Note, however, that an overlapping
+match that starts in the middle of another match will not be processed.
+</P>
+<P>
+The above behaviour was changed at release 10.41 to be more compatible with GNU
+grep. In earlier releases, <b>pcre2grep</b> did not recognize matches from
+later patterns that were earlier in the subject.
 </P>
 <P>
 Patterns that can match an empty string are accepted, but empty string
@@ -134,14 +141,15 @@
 </P>
 <br><a name="SEC3" href="#TOC1">SUPPORT FOR COMPRESSED FILES</a><br>
 <P>
-It is possible to compile <b>pcre2grep</b> so that it uses <b>libz</b> or
-<b>libbz2</b> to read compressed files whose names end in <b>.gz</b> or
+Compile-time options for <b>pcre2grep</b> can set it up to use <b>libz</b> or
+<b>libbz2</b> for reading compressed files whose names end in <b>.gz</b> or
 <b>.bz2</b>, respectively. You can find out whether your <b>pcre2grep</b> binary
 has support for one or both of these file types by running it with the
 <b>--help</b> option. If the appropriate support is not present, all files are
-treated as plain text. The standard input is always so treated. When input is
-from a compressed .gz or .bz2 file, the <b>--line-buffered</b> option is
-ignored.
+treated as plain text. The standard input is always so treated. If a file with
+a <b>.gz</b> or <b>.bz2</b> extension is not in fact compressed, it is read as a
+plain text file. When input is from a compressed .gz or .bz2 file, the
+<b>--line-buffered</b> option is ignored.
 </P>
 <br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
 <P>
@@ -178,9 +186,11 @@
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of <i>number</i>
-is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
+context lines (the <b>-Z</b> option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
+<b>-A</b> is ignored.
 </P>
 <P>
 <b>-a</b>, <b>--text</b>
@@ -199,9 +209,10 @@
 lines are output if the previous match or the start of the file is within
 <i>number</i> lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of <i>number</i> is expected to be relatively small. When
+instead of a colon for the context lines (the <b>-Z</b> option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of <i>number</i> is expected to be relatively small. When
 <b>-c</b> is used, <b>-B</b> is ignored.
 </P>
 <P>
@@ -254,12 +265,14 @@
 <P>
 <b>--colour=</b><i>value</i>, <b>--color=</b><i>value</i>
 This option specifies under what circumstances the parts of a line that matched
-a pattern should be coloured in the output. By default, the output is not
-coloured. The value (which is optional, see above) may be "never", "always", or
-"auto". In the latter case, colouring happens only if the standard output is
-connected to a terminal. More resources are used when colouring is enabled,
-because <b>pcre2grep</b> has to search for all possible matches in a line, not
-just one, in order to colour them all.
+a pattern should be coloured in the output. It is ignored if
+<b>--file-offsets</b>, <b>--line-offsets</b>, or <b>--output</b> is set. By
+default, output is not coloured. The value for the <b>--colour</b> option (which
+is optional, see above) may be "never", "always", or "auto". In the latter
+case, colouring happens only if the standard output is connected to a terminal.
+More resources are used when colouring is enabled, because <b>pcre2grep</b> has
+to search for all possible matches in a line, not just one, in order to colour
+them all.
 <br>
 <br>
 The colour that is used can be specified by setting one of the environment
@@ -307,18 +320,12 @@
 single pattern that starts with a hyphen. When <b>-e</b> is used, no argument
 pattern is taken from the command line; all arguments are treated as file
 names. There is no limit to the number of patterns. They are applied to each
-line in the order in which they are defined until one matches.
+line in the order in which they are defined.
 <br>
 <br>
 If <b>-f</b> is used with <b>-e</b>, the command line patterns are matched first,
 followed by the patterns from the file(s), independent of the order in which
-these options are specified. Note that multiple use of <b>-e</b> is not the same
-as a single pattern with alternatives. For example, X|Y finds the first
-character in a line that is X or Y, whereas if the two patterns are given
-separately, with X first, <b>pcre2grep</b> finds X if it is present, even if it
-follows Y in the line. It finds Y only if there is no X in the line. This
-matters only if you are using <b>-o</b> or <b>--colo(u)r</b> to show the part(s)
-of the line that matched.
+these options are specified.
 </P>
 <P>
 <b>--exclude</b>=<i>pattern</i>
@@ -367,23 +374,20 @@
 </P>
 <P>
 <b>-f</b> <i>filename</i>, <b>--file=</b><i>filename</i>
-Read patterns from the file, one per line, and match them against each line of
-input. As is the case with patterns on the command line, no delimiters should
-be used. What constitutes a newline when reading the file is the operating
-system's default interpretation of \n. The <b>--newline</b> option has no
-effect on this option. Trailing white space is removed from each line, and
-blank lines are ignored. An empty file contains no patterns and therefore
-matches nothing. Patterns read from a file in this way may contain binary
-zeros, which are treated as ordinary data characters. See also the comments
-about multiple patterns versus a single pattern with alternatives in the
-description of <b>-e</b> above.
+Read patterns from the file, one per line. As is the case with patterns on the
+command line, no delimiters should be used. What constitutes a newline when
+reading the file is the operating system's default interpretation of \n. The
+<b>--newline</b> option has no effect on this option. Trailing white space is
+removed from each line, and blank lines are ignored. An empty file contains no
+patterns and therefore matches nothing. Patterns read from a file in this way
+may contain binary zeros, which are treated as ordinary data characters.
 <br>
 <br>
 If this option is given more than once, all the specified files are read. A
 data line is output if any of the patterns match it. A file name can be given
 as "-" to refer to the standard input. When <b>-f</b> is used, patterns
 specified on the command line using <b>-e</b> may also be present; they are
-tested before the file's patterns. However, no other pattern is taken from the
+matched before the file's patterns. However, no pattern is taken from the
 command line; all arguments are treated as the names of paths to be searched.
 </P>
 <P>
@@ -403,28 +407,30 @@
 <b>--file-offsets</b>
 Instead of showing lines or parts of lines that match, show each match as an
 offset from the start of the file and a length, separated by a comma. In this
-mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b>
-options are ignored. If there is more than one match in a line, each of them is
-shown separately. This option is mutually exclusive with <b>--output</b>,
-<b>--line-offsets</b>, and <b>--only-matching</b>.
+mode, <b>--colour</b> has no effect, and no context is shown. That is, the
+<b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. If there is more than one
+match in a line, each of them is shown separately. This option is mutually
+exclusive with <b>--output</b>, <b>--line-offsets</b>, and <b>--only-matching</b>.
 </P>
 <P>
 <b>-H</b>, <b>--with-filename</b>
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the <b>-M</b> option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the <b>-M</b> option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
 </P>
 <P>
 <b>-h</b>, <b>--no-filename</b>
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
+Suppress the output file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The <b>-Z</b> option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>--heap-limit</b>=<i>number</i>
@@ -481,18 +487,20 @@
 <b>-L</b>, <b>--files-without-match</b>
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous <b>-H</b>,
-<b>-h</b>, or <b>-l</b> options.
+output once, on a separate line by default, but if the <b>-Z</b> option is set,
+they are separated by zero bytes instead of newlines. This option overrides any
+previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
 </P>
 <P>
 <b>-l</b>, <b>--files-with-matches</b>
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the <b>-c</b> (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-<b>-c</b> is a way of suppressing the listing of files with no matches that
+a separate line, but if the <b>-Z</b> option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the <b>-c</b> (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with <b>-c</b> is a way of suppressing the listing of files with no matches that
 occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
 <b>-h</b>, or <b>-L</b> options.
 </P>
@@ -520,11 +528,11 @@
 Instead of showing lines or parts of lines that match, show each match as a
 line number, the offset from the start of the line, and a length. The line
 number is terminated by a colon (as usual; see the <b>-n</b> option), and the
-offset and length are separated by a comma. In this mode, no context is shown.
-That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. If there is
-more than one match in a line, each of them is shown separately. This option is
-mutually exclusive with <b>--output</b>, <b>--file-offsets</b>, and
-<b>--only-matching</b>.
+offset and length are separated by a comma. In this mode, <b>--colour</b> has no
+effect, and no context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b>
+options are ignored. If there is more than one match in a line, each of them is
+shown separately. This option is mutually exclusive with <b>--output</b>,
+<b>--file-offsets</b>, and <b>--only-matching</b>.
 </P>
 <P>
 <b>--locale</b>=<i>locale-name</i>
@@ -592,10 +600,7 @@
 <br>
 <br>
 The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
+1024 bytes), the maximum amount of heap memory that may be used for matching.
 <br>
 <br>
 The <b>--depth-limit</b> option limits the depth of nested backtracking points,
@@ -667,12 +672,12 @@
 <b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
 When there is a match, instead of outputting the line that matched, output just
 the text specified in this option, followed by an operating-system standard
-newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
-and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
-this option, which is mutually exclusive with <b>--only-matching</b>,
-<b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
-<b>--only-matching</b>, if there is more than one match in a line, each of them
-causes a line of output.
+newline. In this mode, <b>--colour</b> has no effect, and no context is shown.
+That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. The
+<b>--newline</b> option has no effect on this option, which is mutually
+exclusive with <b>--only-matching</b>, <b>--file-offsets</b>, and
+<b>--line-offsets</b>. However, like <b>--only-matching</b>, if there is more
+than one match in a line, each of them causes a line of output.
 <br>
 <br>
 Escape sequences starting with a dollar character may be used to insert the
@@ -839,6 +844,13 @@
 matched against the contents of files; it does not apply to patterns specified
 by any of the <b>--include</b> or <b>--exclude</b> options.
 </P>
+<P>
+<b>-Z</b>, <b>--null</b>
+Terminate files names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
+</P>
 <br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
 <P>
 The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
@@ -1053,9 +1065,9 @@
 </P>
 <br><a name="SEC16" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 31 August 2021
+Last updated: 21 November 2022
 <br>
-Copyright &copy; 1997-2021 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/html/pcre2limits.html b/doc/html/pcre2limits.html
index c8bc01b..fca5e89 100644
--- a/doc/html/pcre2limits.html
+++ b/doc/html/pcre2limits.html
@@ -71,13 +71,18 @@
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
 </P>
+<P>
+The maximum amount of heap memory used for matching is controlled by the heap
+limit, which can be set in a pattern or in a match context. The default is a
+very large number, effectively unlimited.
+</P>
 <br><b>
 AUTHOR
 </b><br>
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
@@ -86,9 +91,9 @@
 REVISION
 </b><br>
 <P>
-Last updated: 02 February 2019
+Last updated: 26 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/html/pcre2perform.html b/doc/html/pcre2perform.html
index 80d716c..55fdf20 100644
--- a/doc/html/pcre2perform.html
+++ b/doc/html/pcre2perform.html
@@ -83,12 +83,31 @@
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code.
+</P>
+<P>
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+</P>
+<P>
+Until release 10.41, an initial 20KiB frames vector was allocated on the system
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to <b>pcre2_match()</b>. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+</P>
+<P>
+The size of the initial block is the larger of 20KiB or ten times the pattern's
+frame size, unless the heap limit is less than this, in which case the heap
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is
+checked only when a new block is to be allocated. Reducing the heap limit
+between calls to <b>pcre2_match()</b> with the same match data block does not
+affect the saved block.
 </P>
 <P>
 In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
@@ -245,16 +264,16 @@
 <P>
 Philip Hazel
 <br>
-University Computing Service
+Retired from University Computing Service
 <br>
 Cambridge, England.
 <br>
 </P>
 <br><a name="SEC6" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 03 February 2019
+Last updated: 27 July 2022
 <br>
-Copyright &copy; 1997-2019 University of Cambridge.
+Copyright &copy; 1997-2022 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/html/pcre2serialize.html b/doc/html/pcre2serialize.html
index df4098e..a492305 100644
--- a/doc/html/pcre2serialize.html
+++ b/doc/html/pcre2serialize.html
@@ -94,7 +94,7 @@
 <pre>
   PCRE2_ERROR_BADDATA      the number of patterns is zero or less
   PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
   PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
   PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 </pre>
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
index 373e5df..587ea59 100644
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@@ -263,7 +263,7 @@
 </P>
 <P>
 <b>-LS</b>
-List scripts: write a list of recogized Unicode script names to the standard
+List scripts: write a list of recognized Unicode script names to the standard
 output, then exit with zero exit code. All other options are ignored. If both
 -C and any -Lx options are present, whichever is first is recognized.
 </P>
@@ -1241,7 +1241,8 @@
       copy=&#60;number or name&#62;      copy captured substring
       depth_limit=&#60;n&#62;            set a depth limit
       dfa                        use <b>pcre2_dfa_match()</b>
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
       get=&#60;number or name&#62;       extract captured substring
       getall                     extract all captured substrings
   /g  global                     global matching
@@ -1367,7 +1368,7 @@
 controlled by various modifiers listed above whose names begin with
 <b>callout_</b>. Details are given in the section entitled "Callouts"
 <a href="#callouts">below.</a>
-Testing callouts from <b>pcre2_substitute()</b> is decribed separately in
+Testing callouts from <b>pcre2_substitute()</b> is described separately in
 "Testing the substitution function"
 <a href="#substitution">below.</a>
 </P>
@@ -1564,7 +1565,7 @@
 <P>
 The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
 the appropriate limits in the match context. These values are ignored when the
-<b>find_limits</b> modifier is specified.
+<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
 </P>
 <br><b>
 Finding minimum limits
@@ -1574,8 +1575,12 @@
 calls the relevant matching function several times, setting different values in
 the match context via <b>pcre2_set_heap_limit()</b>,
 <b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 </P>
 <P>
 When using this modifier, the pattern should not contain any limit settings
@@ -1603,9 +1608,7 @@
 </P>
 <P>
 For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 </P>
 <br><b>
 Showing MARK names
@@ -1623,12 +1626,10 @@
 <P>
 The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the <b>memory</b> modifier never has any effect. For this modifier to work, the
+<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
+is used only when a match requires more internal workspace that the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 <b>null_context</b> modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 </P>
@@ -1690,7 +1691,8 @@
 If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-<b>find_limits</b> or <b>substitute_callout</b> modifiers.
+<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
+modifiers.
 </P>
 <P>
 Similarly, for testing purposes, if the <b>null_subject</b> or
@@ -2120,7 +2122,7 @@
 <b>jit</b>, which is different behaviour from when it is used on a pattern.
 </P>
 <P>
-The #popcopy command is analagous to the <b>pushcopy</b> modifier in that it
+The #popcopy command is analogous to the <b>pushcopy</b> modifier in that it
 makes current a copy of the topmost stack pattern, leaving the original still
 on the stack.
 </P>
@@ -2141,7 +2143,7 @@
 </P>
 <br><a name="SEC21" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 12 January 2022
+Last updated: 27 July 2022
 <br>
 Copyright &copy; 1997-2022 University of Cambridge.
 <br>
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
index 641a1f9..a997f73 100644
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
@@ -1028,7 +1028,7 @@
        pcre2jit  documentation for more details). If the limit is reached, the
        negative error code  PCRE2_ERROR_HEAPLIMIT  is  returned.  The  default
        limit  can be set when PCRE2 is built; if it is not, the default is set
-       very large and is essentially "unlimited".
+       very large and is essentially unlimited.
 
        A value for the heap limit may also be supplied by an item at the start
        of a pattern of the form
@@ -1039,19 +1039,15 @@
        less ddd is less than the limit set by the caller of pcre2_match()  or,
        if no such limit is set, less than the default.
 
-       The  pcre2_match() function starts out using a 20KiB vector on the sys-
-       tem stack for recording backtracking points. The more nested backtrack-
-       ing  points  there  are (that is, the deeper the search tree), the more
-       memory is needed.  Heap memory is used only if the  initial  vector  is
-       too small. If the heap limit is set to a value less than 21 (in partic-
-       ular, zero) no heap memory will be used. In this  case,  only  patterns
-       that  do not have a lot of nested backtracking can be successfully pro-
-       cessed.
+       The  pcre2_match() function always needs some heap memory, so setting a
+       value of zero guarantees a "heap limit exceeded" error. Details of  how
+       pcre2_match()  uses  the  heap are given in the pcre2perform documenta-
+       tion.
 
-       Similarly, for pcre2_dfa_match(), a vector on the system stack is  used
-       when  processing pattern recursions, lookarounds, or atomic groups, and
-       only if this is not big enough is heap memory used. In this case,  too,
-       setting a value of zero disables the use of the heap.
+       For pcre2_dfa_match(), a vector on the system stack is used  when  pro-
+       cessing  pattern recursions, lookarounds, or atomic groups, and only if
+       this is not big enough is heap memory used. In  this  case,  setting  a
+       value of zero disables the use of the heap.
 
        int pcre2_set_match_limit(pcre2_match_context *mcontext,
          uint32_t value);
@@ -1093,12 +1089,12 @@
 
        This   parameter   limits   the   depth   of   nested  backtracking  in
        pcre2_match().  Each time a nested backtracking point is passed, a  new
-       memory "frame" is used to remember the state of matching at that point.
+       memory  frame  is used to remember the state of matching at that point.
        Thus, this parameter indirectly limits the amount  of  memory  that  is
-       used  in  a match. However, because the size of each memory "frame" de-
-       pends on the number of capturing parentheses, the actual  memory  limit
-       varies  from pattern to pattern. This limit was more useful in versions
-       before 10.30, where function recursion was used for backtracking.
+       used in a match. However, because the size of each memory frame depends
+       on the number of capturing parentheses, the actual memory limit  varies
+       from  pattern to pattern. This limit was more useful in versions before
+       10.30, where function recursion was used for backtracking.
 
        The depth limit is not relevant, and is ignored, when matching is  done
        using JIT compiled code. However, it is supported by pcre2_dfa_match(),
@@ -1372,27 +1368,29 @@
        diately. Otherwise, the variables to which these point are  set  to  an
        error code and an offset (number of code units) within the pattern, re-
        spectively, when pcre2_compile() returns NULL because a compilation er-
-       ror  has  occurred. The values are not defined when compilation is suc-
-       cessful and pcre2_compile() returns a non-NULL value.
+       ror has occurred.
 
-       There are nearly 100 positive error codes that pcre2_compile() may  re-
-       turn  if it finds an error in the pattern. There are also some negative
-       error codes that are used for invalid UTF strings when validity  check-
-       ing  is  in  force.  These  are  the same as given by pcre2_match() and
+       There  are nearly 100 positive error codes that pcre2_compile() may re-
+       turn if it finds an error in the pattern. There are also some  negative
+       error  codes that are used for invalid UTF strings when validity check-
+       ing is in force. These are the  same  as  given  by  pcre2_match()  and
        pcre2_dfa_match(), and are described in the pcre2unicode documentation.
-       There  is  no  separate documentation for the positive error codes, be-
-       cause the textual error messages  that  are  obtained  by  calling  the
+       There is no separate documentation for the positive  error  codes,  be-
+       cause  the  textual  error  messages  that  are obtained by calling the
        pcre2_get_error_message() function (see "Obtaining a textual error mes-
-       sage" below) should be  self-explanatory.  Macro  names  starting  with
-       PCRE2_ERROR_  are defined for both positive and negative error codes in
-       pcre2.h.
+       sage"  below)  should  be  self-explanatory.  Macro names starting with
+       PCRE2_ERROR_ are defined for both positive and negative error codes  in
+       pcre2.h.  When  compilation  is  successful errorcode is set to a value
+       that returns the message "no error" if passed  to  pcre2_get_error_mes-
+       sage().
 
        The value returned in erroroffset is an indication of where in the pat-
-       tern  the  error  occurred. It is not necessarily the furthest point in
-       the pattern that was read. For example, after the error "lookbehind as-
-       sertion  is  not fixed length", the error offset points to the start of
-       the failing assertion. For an invalid UTF-8 or UTF-16 string, the  off-
-       set is that of the first code unit of the failing character.
+       tern an error occurred. When there is no error,  zero  is  returned.  A
+       non-zero  value  is  not  necessarily the furthest point in the pattern
+       that was read. For example, after the error  "lookbehind  assertion  is
+       not  fixed length", the error offset points to the start of the failing
+       assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of
+       the first code unit of the failing character.
 
        Some  errors are not detected until the whole pattern has been scanned;
        in these cases, the offset passed back is the length  of  the  pattern.
@@ -2496,7 +2494,9 @@
        A minimum of at least 1 pair is imposed  by  pcre2_match_data_create(),
        so  it  is  always possible to return the overall matched string in the
        case  of  pcre2_match()  or  the  longest  match   in   the   case   of
-       pcre2_dfa_match().
+       pcre2_dfa_match().  The  maximum  number  of pairs is 65535; if the the
+       first argument of pcre2_match_data_create() is greater than this, 65535
+       is used.
 
        The second argument of pcre2_match_data_create() is a pointer to a gen-
        eral context, which can specify custom memory management for  obtaining
@@ -3049,12 +3049,12 @@
 
          PCRE2_ERROR_NOMEMORY
 
-       If  a  pattern contains many nested backtracking points, heap memory is
-       used to remember them. This error is given when the  memory  allocation
-       function  (default  or  custom)  fails.  Note  that  a different error,
-       PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed  exceeds
-       the    heap   limit.   PCRE2_ERROR_NOMEMORY   is   also   returned   if
-       PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+       Heap  memory  is  used  to  remember backgracking points. This error is
+       given when the memory allocation function (default  or  custom)  fails.
+       Note  that  a  different  error, PCRE2_ERROR_HEAPLIMIT, is given if the
+       amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
+       also  returned  if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca-
+       tion fails.
 
          PCRE2_ERROR_NULL
 
@@ -3858,8 +3858,8 @@
 
 REVISION
 
-       Last updated: 14 December 2021
-       Copyright (c) 1997-2021 University of Cambridge.
+       Last updated: 27 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
 ------------------------------------------------------------------------------
 
 
@@ -4116,41 +4116,40 @@
        pcre2_dfa_match() matching function, and to JIT  matching  (though  the
        counting is done differently).
 
-       The  pcre2_match() function starts out using a 20KiB vector on the sys-
-       tem stack to record backtracking points. The more  nested  backtracking
-       points there are (that is, the deeper the search tree), the more memory
-       is needed. If the initial vector is not large enough,  heap  memory  is
-       used,  up to a certain limit, which is specified in kibibytes (units of
-       1024 bytes). The limit can be changed at run time, as described in  the
-       pcre2api  documentation.  The default limit (in effect unlimited) is 20
-       million. You can change this by a setting such as
+       The  pcre2_match()  function  uses  heap  memory to record backtracking
+       points. The more nested backtracking points there  are  (that  is,  the
+       deeper  the  search tree), the more memory is needed. There is an upper
+       limit, specified in kibibytes (units of 1024 bytes). This limit can  be
+       changed  at  run  time, as described in the pcre2api documentation. The
+       default limit (in effect unlimited) is 20 million. You can change  this
+       by a setting such as
 
          --with-heap-limit=500
 
-       which limits the amount of heap to 500 KiB. This limit applies only  to
+       which  limits the amount of heap to 500 KiB. This limit applies only to
        interpretive matching in pcre2_match() and pcre2_dfa_match(), which may
-       also use the heap for internal workspace  when  processing  complicated
-       patterns.  This limit does not apply when JIT (which has its own memory
+       also  use  the  heap for internal workspace when processing complicated
+       patterns. This limit does not apply when JIT (which has its own  memory
        arrangements) is used.
 
-       You can also explicitly limit the depth of nested backtracking  in  the
+       You  can  also explicitly limit the depth of nested backtracking in the
        pcre2_match() interpreter. This limit defaults to the value that is set
-       for --with-match-limit. You can set a lower default  limit  by  adding,
+       for  --with-match-limit.  You  can set a lower default limit by adding,
        for example,
 
          --with-match-limit-depth=10000
 
-       to  the  configure  command.  This value can be overridden at run time.
-       This depth limit indirectly limits the amount of heap  memory  that  is
-       used,  but because the size of each backtracking "frame" depends on the
-       number of capturing parentheses in a pattern, the amount of  heap  that
-       is  used  before  the  limit is reached varies from pattern to pattern.
+       to the configure command. This value can be  overridden  at  run  time.
+       This  depth  limit  indirectly limits the amount of heap memory that is
+       used, but because the size of each backtracking "frame" depends on  the
+       number  of  capturing parentheses in a pattern, the amount of heap that
+       is used before the limit is reached varies  from  pattern  to  pattern.
        This limit was more useful in versions before 10.30, where function re-
        cursion was used for backtracking.
 
        As well as applying to pcre2_match(), the depth limit also controls the
-       depth of recursive function calls in pcre2_dfa_match(). These are  used
-       for  lookaround  assertions,  atomic  groups, and recursion within pat-
+       depth  of recursive function calls in pcre2_dfa_match(). These are used
+       for lookaround assertions, atomic groups,  and  recursion  within  pat-
        terns.  The limit does not apply to JIT matching.
 
 
@@ -4158,67 +4157,67 @@
 
        PCRE2 uses fixed tables for processing characters whose code points are
        less than 256. By default, PCRE2 is built with a set of tables that are
-       distributed in the file src/pcre2_chartables.c.dist. These  tables  are
+       distributed  in  the file src/pcre2_chartables.c.dist. These tables are
        for ASCII codes only. If you add
 
          --enable-rebuild-chartables
 
-       to  the  configure  command, the distributed tables are no longer used.
+       to the configure command, the distributed tables are  no  longer  used.
        Instead, a program called pcre2_dftables is compiled and run. This out-
        puts the source for new set of tables, created in the default locale of
-       your C run-time system. This method of replacing the  tables  does  not
+       your  C  run-time  system. This method of replacing the tables does not
        work if you are cross compiling, because pcre2_dftables needs to be run
        on the local host and therefore not compiled with the cross compiler.
 
        If you need to create alternative tables when cross compiling, you will
-       have  to  do so "by hand". There may also be other reasons for creating
-       tables manually.  To cause pcre2_dftables to  be  built  on  the  local
+       have to do so "by hand". There may also be other reasons  for  creating
+       tables  manually.   To  cause  pcre2_dftables  to be built on the local
        host, run a normal compiling command, and then run the program with the
        output file as its argument, for example:
 
          cc src/pcre2_dftables.c -o pcre2_dftables
          ./pcre2_dftables src/pcre2_chartables.c
 
-       This builds the tables in the default locale of the local host. If  you
+       This  builds the tables in the default locale of the local host. If you
        want to specify a locale, you must use the -L option:
 
          LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
 
        You can also specify -b (with or without -L). This causes the tables to
-       be written in binary instead of as source code. A set of binary  tables
-       can  be  loaded  into memory by an application and passed to pcre2_com-
+       be  written in binary instead of as source code. A set of binary tables
+       can be loaded into memory by an application and  passed  to  pcre2_com-
        pile() in the same way as tables created by calling pcre2_maketables().
-       The  tables are just a string of bytes, independent of hardware charac-
-       teristics such as endianness. This means they can be  bundled  with  an
-       application  that  runs in different environments, to ensure consistent
+       The tables are just a string of bytes, independent of hardware  charac-
+       teristics  such  as  endianness. This means they can be bundled with an
+       application that runs in different environments, to  ensure  consistent
        behaviour.
 
 
 USING EBCDIC CODE
 
-       PCRE2 assumes by default that it will run in an environment  where  the
-       character  code is ASCII or Unicode, which is a superset of ASCII. This
+       PCRE2  assumes  by default that it will run in an environment where the
+       character code is ASCII or Unicode, which is a superset of ASCII.  This
        is the case for most computer operating systems. PCRE2 can, however, be
        compiled to run in an 8-bit EBCDIC environment by adding
 
          --enable-ebcdic --disable-unicode
 
        to the configure command. This setting implies --enable-rebuild-charta-
-       bles. You should only use it if you know that you are in an EBCDIC  en-
+       bles.  You should only use it if you know that you are in an EBCDIC en-
        vironment (for example, an IBM mainframe operating system).
 
-       It  is  not possible to support both EBCDIC and UTF-8 codes in the same
-       version of the library. Consequently,  --enable-unicode  and  --enable-
+       It is not possible to support both EBCDIC and UTF-8 codes in  the  same
+       version  of  the  library. Consequently, --enable-unicode and --enable-
        ebcdic are mutually exclusive.
 
        The EBCDIC character that corresponds to an ASCII LF is assumed to have
-       the value 0x15 by default. However, in some EBCDIC  environments,  0x25
+       the  value  0x15 by default. However, in some EBCDIC environments, 0x25
        is used. In such an environment you should use
 
          --enable-ebcdic-nl25
 
        as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
-       has the same value as in ASCII, namely, 0x0d.  Whichever  of  0x15  and
+       has  the  same  value  as in ASCII, namely, 0x0d. Whichever of 0x15 and
        0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
        acter (which, in Unicode, is 0x85).
 
@@ -4230,47 +4229,47 @@
 PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS
 
        By default pcre2grep supports the use of callouts with string arguments
-       within  the patterns it is matching. There are two kinds: one that gen-
+       within the patterns it is matching. There are two kinds: one that  gen-
        erates output using local code, and another that calls an external pro-
-       gram  or  script.   If --disable-pcre2grep-callout-fork is added to the
-       configure command, only the first kind  of  callout  is  supported;  if
-       --disable-pcre2grep-callout  is  used,  all callouts are completely ig-
-       nored. For more details of pcre2grep callouts, see the pcre2grep  docu-
+       gram or script.  If --disable-pcre2grep-callout-fork is  added  to  the
+       configure  command,  only  the  first  kind of callout is supported; if
+       --disable-pcre2grep-callout is used, all callouts  are  completely  ig-
+       nored.  For more details of pcre2grep callouts, see the pcre2grep docu-
        mentation.
 
 
 PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
 
-       By  default,  pcre2grep reads all files as plain text. You can build it
-       so that it recognizes files whose names end in .gz or .bz2,  and  reads
+       By default, pcre2grep reads all files as plain text. You can  build  it
+       so  that  it recognizes files whose names end in .gz or .bz2, and reads
        them with libz or libbz2, respectively, by adding one or both of
 
          --enable-pcre2grep-libz
          --enable-pcre2grep-libbz2
 
        to the configure command. These options naturally require that the rel-
-       evant libraries are installed on your system. Configuration  will  fail
+       evant  libraries  are installed on your system. Configuration will fail
        if they are not.
 
 
 PCRE2GREP BUFFER SIZE
 
-       pcre2grep  uses an internal buffer to hold a "window" on the file it is
+       pcre2grep uses an internal buffer to hold a "window" on the file it  is
        scanning, in order to be able to output "before" and "after" lines when
        it finds a match. The default starting size of the buffer is 20KiB. The
-       buffer itself is three times this size, but because of the  way  it  is
+       buffer  itself  is  three times this size, but because of the way it is
        used for holding "before" lines, the longest line that is guaranteed to
        be processable is the notional buffer size. If a longer line is encoun-
-       tered,  pcre2grep  automatically  expands the buffer, up to a specified
-       maximum size, whose default is 1MiB or the starting size, whichever  is
-       the  larger. You can change the default parameter values by adding, for
+       tered, pcre2grep automatically expands the buffer, up  to  a  specified
+       maximum  size, whose default is 1MiB or the starting size, whichever is
+       the larger. You can change the default parameter values by adding,  for
        example,
 
          --with-pcre2grep-bufsize=51200
          --with-pcre2grep-max-bufsize=2097152
 
-       to the configure command. The caller of pcre2grep  can  override  these
-       values  by  using  --buffer-size  and  --max-buffer-size on the command
+       to  the  configure  command. The caller of pcre2grep can override these
+       values by using --buffer-size  and  --max-buffer-size  on  the  command
        line.
 
 
@@ -4281,26 +4280,26 @@
          --enable-pcre2test-libreadline
          --enable-pcre2test-libedit
 
-       to the configure command, pcre2test is linked with the libreadline  or-
-       libedit  library,  respectively, and when its input is from a terminal,
-       it reads it using the readline() function. This  provides  line-editing
-       and  history  facilities.  Note that libreadline is GPL-licensed, so if
-       you distribute a binary of pcre2test linked in this way, there  may  be
+       to  the configure command, pcre2test is linked with the libreadline or-
+       libedit library, respectively, and when its input is from  a  terminal,
+       it  reads  it using the readline() function. This provides line-editing
+       and history facilities. Note that libreadline is  GPL-licensed,  so  if
+       you  distribute  a binary of pcre2test linked in this way, there may be
        licensing issues. These can be avoided by linking instead with libedit,
        which has a BSD licence.
 
-       Setting --enable-pcre2test-libreadline causes the -lreadline option  to
-       be  added to the pcre2test build. In many operating environments with a
-       sytem-installed readline library this is sufficient. However,  in  some
+       Setting  --enable-pcre2test-libreadline causes the -lreadline option to
+       be added to the pcre2test build. In many operating environments with  a
+       sytem-installed  readline  library this is sufficient. However, in some
        environments (e.g. if an unmodified distribution version of readline is
-       in use), some extra configuration may be necessary.  The  INSTALL  file
+       in  use),  some  extra configuration may be necessary. The INSTALL file
        for libreadline says this:
 
          "Readline uses the termcap functions, but does not link with
          the termcap or curses library itself, allowing applications
          which link with readline the to choose an appropriate library."
 
-       If  your environment has not been set up so that an appropriate library
+       If your environment has not been set up so that an appropriate  library
        is automatically included, you may need to add something like
 
          LIBS="-ncurses"
@@ -4314,7 +4313,7 @@
 
          --enable-debug
 
-       to the configure command, additional debugging code is included in  the
+       to  the configure command, additional debugging code is included in the
        build. This feature is intended for use by the PCRE2 maintainers.
 
 
@@ -4324,14 +4323,14 @@
 
          --enable-valgrind
 
-       to  the  configure command, PCRE2 will use valgrind annotations to mark
-       certain memory regions as unaddressable. This allows it to  detect  in-
+       to the configure command, PCRE2 will use valgrind annotations  to  mark
+       certain  memory  regions as unaddressable. This allows it to detect in-
        valid memory accesses, and is mostly useful for debugging PCRE2 itself.
 
 
 CODE COVERAGE REPORTING
 
-       If  your  C  compiler is gcc, you can build a version of PCRE2 that can
+       If your C compiler is gcc, you can build a version of  PCRE2  that  can
        generate a code coverage report for its test suite. To enable this, you
        must install lcov version 1.6 or above. Then specify
 
@@ -4340,20 +4339,20 @@
        to the configure command and build PCRE2 in the usual way.
 
        Note that using ccache (a caching C compiler) is incompatible with code
-       coverage reporting. If you have configured ccache to run  automatically
+       coverage  reporting. If you have configured ccache to run automatically
        on your system, you must set the environment variable
 
          CCACHE_DISABLE=1
 
        before running make to build PCRE2, so that ccache is not used.
 
-       When  --enable-coverage  is  used,  the  following addition targets are
+       When --enable-coverage is used,  the  following  addition  targets  are
        added to the Makefile:
 
          make coverage
 
-       This creates a fresh coverage report for the PCRE2 test  suite.  It  is
-       equivalent  to running "make coverage-reset", "make coverage-baseline",
+       This  creates  a  fresh coverage report for the PCRE2 test suite. It is
+       equivalent to running "make coverage-reset", "make  coverage-baseline",
        "make check", and then "make coverage-report".
 
          make coverage-reset
@@ -4370,73 +4369,73 @@
 
          make coverage-clean-report
 
-       This removes the generated coverage report without cleaning the  cover-
+       This  removes the generated coverage report without cleaning the cover-
        age data itself.
 
          make coverage-clean-data
 
-       This  removes  the captured coverage data without removing the coverage
+       This removes the captured coverage data without removing  the  coverage
        files created at compile time (*.gcno).
 
          make coverage-clean
 
-       This cleans all coverage data including the generated coverage  report.
-       For  more  information about code coverage, see the gcov and lcov docu-
+       This  cleans all coverage data including the generated coverage report.
+       For more information about code coverage, see the gcov and  lcov  docu-
        mentation.
 
 
 DISABLING THE Z AND T FORMATTING MODIFIERS
 
-       The C99 standard defines formatting modifiers z and t  for  size_t  and
-       ptrdiff_t  values, respectively. By default, PCRE2 uses these modifiers
+       The  C99  standard  defines formatting modifiers z and t for size_t and
+       ptrdiff_t values, respectively. By default, PCRE2 uses these  modifiers
        in environments other than old versions of Microsoft Visual Studio when
-       __STDC_VERSION__  is  defined  and has a value greater than or equal to
-       199901L (indicating support for C99).  However, there is at  least  one
+       __STDC_VERSION__ is defined and has a value greater than  or  equal  to
+       199901L  (indicating  support for C99).  However, there is at least one
        environment that claims to be C99 but does not support these modifiers.
        If
 
          --disable-percent-zt
 
        is specified, no use is made of the z or t modifiers. Instead of %td or
-       %zu,  a  suitable  format is used depending in the size of long for the
+       %zu, a suitable format is used depending in the size of  long  for  the
        platform.
 
 
 SUPPORT FOR FUZZERS
 
-       There is a special option for use by people who  want  to  run  fuzzing
+       There  is  a  special  option for use by people who want to run fuzzing
        tests on PCRE2:
 
          --enable-fuzz-support
 
        At present this applies only to the 8-bit library. If set, it causes an
-       extra library called libpcre2-fuzzsupport.a to be built,  but  not  in-
-       stalled.  This  contains  a single function called LLVMFuzzerTestOneIn-
-       put() whose arguments are a pointer to a string and the length  of  the
-       string.  When  called,  this  function tries to compile the string as a
-       pattern, and if that succeeds, to match it.  This is done both with  no
-       options  and  with some random options bits that are generated from the
+       extra  library  called  libpcre2-fuzzsupport.a to be built, but not in-
+       stalled. This contains a single  function  called  LLVMFuzzerTestOneIn-
+       put()  whose  arguments are a pointer to a string and the length of the
+       string. When called, this function tries to compile  the  string  as  a
+       pattern,  and if that succeeds, to match it.  This is done both with no
+       options and with some random options bits that are generated  from  the
        string.
 
-       Setting --enable-fuzz-support also causes  a  binary  called  pcre2fuz-
-       zcheck  to be created. This is normally run under valgrind or used when
+       Setting  --enable-fuzz-support  also  causes  a binary called pcre2fuz-
+       zcheck to be created. This is normally run under valgrind or used  when
        PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing
-       function  and  outputs  information  about  what it is doing. The input
-       strings are specified by arguments: if an argument starts with "="  the
-       rest  of it is a literal input string. Otherwise, it is assumed to be a
+       function and outputs information about what  it  is  doing.  The  input
+       strings  are specified by arguments: if an argument starts with "=" the
+       rest of it is a literal input string. Otherwise, it is assumed to be  a
        file name, and the contents of the file are the test string.
 
 
 OBSOLETE OPTION
 
-       In versions of PCRE2 prior to 10.30, there were two  ways  of  handling
-       backtracking  in the pcre2_match() function. The default was to use the
+       In  versions  of  PCRE2 prior to 10.30, there were two ways of handling
+       backtracking in the pcre2_match() function. The default was to use  the
        system stack, but if
 
          --disable-stack-for-recursion
 
-       was set, memory on the heap was used. From release 10.30  onwards  this
-       has  changed  (the  stack  is  no longer used) and this option now does
+       was  set,  memory on the heap was used. From release 10.30 onwards this
+       has changed (the stack is no longer used)  and  this  option  now  does
        nothing except give a warning.
 
 
@@ -4448,14 +4447,14 @@
 AUTHOR
 
        Philip Hazel
-       University Computing Service
+       Retired from University Computing Service
        Cambridge, England.
 
 
 REVISION
 
-       Last updated: 08 December 2021
-       Copyright (c) 1997-2021 University of Cambridge.
+       Last updated: 27 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
 ------------------------------------------------------------------------------
 
 
@@ -5594,18 +5593,22 @@
        The maximum length of a string argument to a  callout  is  the  largest
        number a 32-bit unsigned integer can hold.
 
+       The  maximum  amount  of heap memory used for matching is controlled by
+       the heap limit, which can be set in a pattern or in  a  match  context.
+       The default is a very large number, effectively unlimited.
+
 
 AUTHOR
 
        Philip Hazel
-       University Computing Service
+       Retired from University Computing Service
        Cambridge, England.
 
 
 REVISION
 
-       Last updated: 02 February 2019
-       Copyright (c) 1997-2019 University of Cambridge.
+       Last updated: 26 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
 ------------------------------------------------------------------------------
 
 
@@ -9771,152 +9774,169 @@
        sive function calls could use a great deal of  stack,  and  this  could
        cause  problems, but this usage has been eliminated. Backtracking posi-
        tions are now explicitly remembered in memory frames controlled by  the
-       code.  An  initial  20KiB  vector  of frames is allocated on the system
-       stack (enough for about 100 frames for small patterns), but if this  is
-       insufficient,  heap  memory  is  used. The amount of heap memory can be
-       limited; if the limit is set to zero, only the initial stack vector  is
-       used.  Rewriting patterns to be time-efficient, as described below, may
-       also reduce the memory requirements.
+       code.
 
-       In contrast to  pcre2_match(),  pcre2_dfa_match()  does  use  recursive
-       function  calls,  but only for processing atomic groups, lookaround as-
+       The size of each frame depends on the size of pointer variables and the
+       number of capturing parenthesized groups in the pattern being  matched.
+       On a 64-bit system the frame size for a pattern with no captures is 128
+       bytes. For each capturing group the size increases by 16 bytes.
+
+       Until release 10.41, an initial 20KiB frames vector  was  allocated  on
+       the  system  stack,  but this still caused some issues for multi-thread
+       applications where each thread has a very  small  stack.  From  release
+       10.41  backtracking  memory  frames  are always held in heap memory. An
+       initial heap allocation is obtained the first time any match data block
+       is  passed  to  pcre2_match().  This  is remembered with the match data
+       block and re-used if that block is used for another match. It is  freed
+       when the match data block itself is freed.
+
+       The  size  of the initial block is the larger of 20KiB or ten times the
+       pattern's frame size, unless the heap limit is less than this, in which
+       case  the  heap  limit  is  used. If the initial block proves to be too
+       small during matching, it is replaced by a larger block, subject to the
+       heap  limit.  The  heap limit is checked only when a new block is to be
+       allocated. Reducing the heap limit between calls to pcre2_match()  with
+       the same match data block does not affect the saved block.
+
+       In  contrast  to  pcre2_match(),  pcre2_dfa_match()  does use recursive
+       function calls, but only for processing atomic groups,  lookaround  as-
        sertions, and recursion within the pattern. The original version of the
-       code  used  to  allocate  quite large internal workspace vectors on the
-       stack, which caused some problems for  some  patterns  in  environments
-       with  small  stacks.  From release 10.32 the code for pcre2_dfa_match()
-       has been re-factored to use heap memory  when  necessary  for  internal
-       workspace  when  recursing,  though  recursive function calls are still
+       code used to allocate quite large internal  workspace  vectors  on  the
+       stack,  which  caused  some  problems for some patterns in environments
+       with small stacks. From release 10.32 the  code  for  pcre2_dfa_match()
+       has  been  re-factored  to  use heap memory when necessary for internal
+       workspace when recursing, though recursive  function  calls  are  still
        used.
 
-       The "match depth" parameter can be used to limit the depth of  function
-       recursion,  and  the  "match  heap"  parameter  to limit heap memory in
+       The  "match depth" parameter can be used to limit the depth of function
+       recursion, and the "match heap"  parameter  to  limit  heap  memory  in
        pcre2_dfa_match().
 
 
 PROCESSING TIME
 
-       Certain items in regular expression patterns are processed  more  effi-
+       Certain  items  in regular expression patterns are processed more effi-
        ciently than others. It is more efficient to use a character class like
-       [aeiou]  than  a  set  of   single-character   alternatives   such   as
-       (a|e|i|o|u).  In  general,  the simplest construction that provides the
+       [aeiou]   than   a   set   of  single-character  alternatives  such  as
+       (a|e|i|o|u). In general, the simplest construction  that  provides  the
        required behaviour is usually the most efficient. Jeffrey Friedl's book
-       contains  a  lot  of useful general discussion about optimizing regular
+       contains a lot of useful general discussion  about  optimizing  regular
        expressions for efficient performance. This document contains a few ob-
        servations about PCRE2.
 
-       Using  Unicode  character  properties  (the  \p, \P, and \X escapes) is
-       slow, because PCRE2 has to use a multi-stage table lookup  whenever  it
-       needs  a  character's  property. If you can find an alternative pattern
+       Using Unicode character properties (the \p,  \P,  and  \X  escapes)  is
+       slow,  because  PCRE2 has to use a multi-stage table lookup whenever it
+       needs a character's property. If you can find  an  alternative  pattern
        that does not use character properties, it will probably be faster.
 
-       By default, the escape sequences \b, \d, \s,  and  \w,  and  the  POSIX
-       character  classes  such  as  [:alpha:]  do not use Unicode properties,
+       By  default,  the  escape  sequences  \b, \d, \s, and \w, and the POSIX
+       character classes such as [:alpha:]  do  not  use  Unicode  properties,
        partly for backwards compatibility, and partly for performance reasons.
-       However,  you  can  set  the PCRE2_UCP option or start the pattern with
-       (*UCP) if you want Unicode character properties to be  used.  This  can
-       double  the  matching  time  for  items  such  as \d, when matched with
-       pcre2_match(); the performance loss is less with a DFA  matching  func-
+       However, you can set the PCRE2_UCP option or  start  the  pattern  with
+       (*UCP)  if  you  want Unicode character properties to be used. This can
+       double the matching time for  items  such  as  \d,  when  matched  with
+       pcre2_match();  the  performance loss is less with a DFA matching func-
        tion, and in both cases there is not much difference for \b.
 
-       When  a pattern begins with .* not in atomic parentheses, nor in paren-
-       theses that are the subject of a backreference,  and  the  PCRE2_DOTALL
-       option  is  set,  the pattern is implicitly anchored by PCRE2, since it
-       can match only at the start of a subject string.  If  the  pattern  has
+       When a pattern begins with .* not in atomic parentheses, nor in  paren-
+       theses  that  are  the subject of a backreference, and the PCRE2_DOTALL
+       option is set, the pattern is implicitly anchored by  PCRE2,  since  it
+       can  match  only  at  the start of a subject string. If the pattern has
        multiple top-level branches, they must all be anchorable. The optimiza-
-       tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is  au-
+       tion  can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au-
        tomatically disabled if the pattern contains (*PRUNE) or (*SKIP).
 
-       If  PCRE2_DOTALL  is  not set, PCRE2 cannot make this optimization, be-
-       cause the dot metacharacter does not then match a newline, and  if  the
-       subject  string contains newlines, the pattern may match from the char-
+       If PCRE2_DOTALL is not set, PCRE2 cannot make  this  optimization,  be-
+       cause  the  dot metacharacter does not then match a newline, and if the
+       subject string contains newlines, the pattern may match from the  char-
        acter immediately following one of them instead of from the very start.
        For example, the pattern
 
          .*second
 
-       matches  the subject "first\nand second" (where \n stands for a newline
-       character), with the match starting at the seventh character. In  order
-       to  do  this, PCRE2 has to retry the match starting after every newline
+       matches the subject "first\nand second" (where \n stands for a  newline
+       character),  with the match starting at the seventh character. In order
+       to do this, PCRE2 has to retry the match starting after  every  newline
        in the subject.
 
-       If you are using such a pattern with subject strings that do  not  con-
-       tain   newlines,   the   best   performance   is  obtained  by  setting
-       PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate  ex-
-       plicit  anchoring.  That saves PCRE2 from having to scan along the sub-
+       If  you  are using such a pattern with subject strings that do not con-
+       tain  newlines,  the  best   performance   is   obtained   by   setting
+       PCRE2_DOTALL,  or starting the pattern with ^.* or ^.*? to indicate ex-
+       plicit anchoring. That saves PCRE2 from having to scan along  the  sub-
        ject looking for a newline to restart at.
 
-       Beware of patterns that contain nested indefinite  repeats.  These  can
-       take  a  long time to run when applied to a string that does not match.
+       Beware  of  patterns  that contain nested indefinite repeats. These can
+       take a long time to run when applied to a string that does  not  match.
        Consider the pattern fragment
 
          ^(a+)*
 
-       This can match "aaaa" in 16 different ways, and this  number  increases
-       very  rapidly  as the string gets longer. (The * repeat can match 0, 1,
-       2, 3, or 4 times, and for each of those cases other than 0 or 4, the  +
-       repeats  can  match  different numbers of times.) When the remainder of
-       the pattern is such that the entire match is going to fail,  PCRE2  has
-       in  principle to try every possible variation, and this can take an ex-
+       This  can  match "aaaa" in 16 different ways, and this number increases
+       very rapidly as the string gets longer. (The * repeat can match  0,  1,
+       2,  3, or 4 times, and for each of those cases other than 0 or 4, the +
+       repeats can match different numbers of times.) When  the  remainder  of
+       the  pattern  is such that the entire match is going to fail, PCRE2 has
+       in principle to try every possible variation, and this can take an  ex-
        tremely long time, even for relatively short strings.
 
        An optimization catches some of the more simple cases such as
 
          (a+)*b
 
-       where a literal character follows. Before  embarking  on  the  standard
-       matching  procedure, PCRE2 checks that there is a "b" later in the sub-
-       ject string, and if there is not, it fails the match immediately.  How-
-       ever,  when  there  is no following literal this optimization cannot be
+       where  a  literal  character  follows. Before embarking on the standard
+       matching procedure, PCRE2 checks that there is a "b" later in the  sub-
+       ject  string, and if there is not, it fails the match immediately. How-
+       ever, when there is no following literal this  optimization  cannot  be
        used. You can see the difference by comparing the behaviour of
 
          (a+)*\d
 
-       with the pattern above. The former gives  a  failure  almost  instantly
-       when  applied  to  a  whole  line of "a" characters, whereas the latter
+       with  the  pattern  above.  The former gives a failure almost instantly
+       when applied to a whole line of  "a"  characters,  whereas  the  latter
        takes an appreciable time with strings longer than about 20 characters.
 
        In many cases, the solution to this kind of performance issue is to use
-       an  atomic group or a possessive quantifier. This can often reduce mem-
+       an atomic group or a possessive quantifier. This can often reduce  mem-
        ory requirements as well. As another example, consider this pattern:
 
          ([^<]|<(?!inet))+
 
-       It matches from wherever it starts until it encounters "<inet"  or  the
-       end  of  the  data,  and is the kind of pattern that might be used when
+       It  matches  from wherever it starts until it encounters "<inet" or the
+       end of the data, and is the kind of pattern that  might  be  used  when
        processing an XML file. Each iteration of the outer parentheses matches
-       either  one  character that is not "<" or a "<" that is not followed by
-       "inet". However, each time a parenthesis is processed,  a  backtracking
-       position  is  passed,  so this formulation uses a memory frame for each
+       either one character that is not "<" or a "<" that is not  followed  by
+       "inet".  However,  each time a parenthesis is processed, a backtracking
+       position is passed, so this formulation uses a memory  frame  for  each
        matched character. For a long string, a lot of memory is required. Con-
-       sider  now  this  rewritten  pattern,  which  matches  exactly the same
+       sider now this  rewritten  pattern,  which  matches  exactly  the  same
        strings:
 
          ([^<]++|<(?!inet))+
 
        This runs much faster, because sequences of characters that do not con-
        tain "<" are "swallowed" in one item inside the parentheses, and a pos-
-       sessive quantifier is used to stop any backtracking into  the  runs  of
-       non-"<"  characters.  This  version also uses a lot less memory because
-       entry to a new set of parentheses happens only  when  a  "<"  character
-       that  is  not  followed by "inet" is encountered (and we assume this is
+       sessive  quantifier  is  used to stop any backtracking into the runs of
+       non-"<" characters. This version also uses a lot  less  memory  because
+       entry  to  a  new  set of parentheses happens only when a "<" character
+       that is not followed by "inet" is encountered (and we  assume  this  is
        relatively rare).
 
        This example shows that one way of optimizing performance when matching
-       long  subject strings is to write repeated parenthesized subpatterns to
+       long subject strings is to write repeated parenthesized subpatterns  to
        match more than one character whenever possible.
 
    SETTING RESOURCE LIMITS
 
-       You can set limits on the amount of processing that  takes  place  when
-       matching,  and  on  the amount of heap memory that is used. The default
+       You  can  set  limits on the amount of processing that takes place when
+       matching, and on the amount of heap memory that is  used.  The  default
        values of the limits are very large, and unlikely ever to operate. They
-       can  be  changed  when  PCRE2  is  built, and they can also be set when
-       pcre2_match() or pcre2_dfa_match() is called. For details of these  in-
-       terfaces,  see  the  pcre2build  documentation and the section entitled
+       can be changed when PCRE2 is built, and  they  can  also  be  set  when
+       pcre2_match()  or pcre2_dfa_match() is called. For details of these in-
+       terfaces, see the pcre2build documentation  and  the  section  entitled
        "The match context" in the pcre2api documentation.
 
-       The pcre2test test program has a modifier called  "find_limits"  which,
-       if  applied  to  a  subject line, causes it to find the smallest limits
+       The  pcre2test  test program has a modifier called "find_limits" which,
+       if applied to a subject line, causes it to  find  the  smallest  limits
        that allow a pattern to match. This is done by repeatedly matching with
        different limits.
 
@@ -9924,14 +9944,14 @@
 AUTHOR
 
        Philip Hazel
-       University Computing Service
+       Retired from University Computing Service
        Cambridge, England.
 
 
 REVISION
 
-       Last updated: 03 February 2019
-       Copyright (c) 1997-2019 University of Cambridge.
+       Last updated: 27 July 2022
+       Copyright (c) 1997-2022 University of Cambridge.
 ------------------------------------------------------------------------------
 
 
@@ -10434,7 +10454,7 @@
 
          PCRE2_ERROR_BADDATA      the number of patterns is zero or less
          PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-         PCRE2_ERROR_MEMORY       memory allocation failed
+         PCRE2_ERROR_NOMEMORY     memory allocation failed
          PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
          PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 
diff --git a/doc/pcre2_compile.3 b/doc/pcre2_compile.3
index 58a60c1..5a07b8b 100644
--- a/doc/pcre2_compile.3
+++ b/doc/pcre2_compile.3
@@ -1,4 +1,4 @@
-.TH PCRE2_COMPILE 3 "23 May 2019" "PCRE2 10.34"
+.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@@ -80,8 +80,17 @@
 .\"
 function.
 .P
-The yield of this function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected.
+If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
+NULL immediately. Otherwise, the yield of this function is a pointer to a
+private data structure that contains the compiled pattern, or NULL if an error
+was detected. In the error case, a text error message can be obtained by
+passing the value returned via the \fIerrorcode\fP argument to the the
+\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
+error was encountered is returned via the \fIerroroffset\fP argument.
+.P
+If there is no error, the value passed via \fIerrorcode\fP returns the message
+"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
+via \fIerroroffset\fP is zero.
 .P
 There is a complete description of the PCRE2 native API, with more detail on
 each option, in the
diff --git a/doc/pcre2_match_data_create_from_pattern.3 b/doc/pcre2_match_data_create_from_pattern.3
index 37486dd..829bf6c 100644
--- a/doc/pcre2_match_data_create_from_pattern.3
+++ b/doc/pcre2_match_data_create_from_pattern.3
@@ -21,7 +21,7 @@
 vector" (ovector) within the match data block, and are used to identify the
 matched string and any captured substrings when matching with
 \fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the
-outut vector in a different way, you should use \fBpcre2_match_data_create()\fP
+output vector in a different way, you should use \fBpcre2_match_data_create()\fP
 instead of this function.
 .P
 The second argument points to a general context, for custom memory management,
diff --git a/doc/pcre2_serialize_decode.3 b/doc/pcre2_serialize_decode.3
index b67a112..611113f 100644
--- a/doc/pcre2_serialize_decode.3
+++ b/doc/pcre2_serialize_decode.3
@@ -36,7 +36,7 @@
   PCRE2_ERROR_BADDATA   \fInumber_of_codes\fP is zero or less
   PCRE2_ERROR_BADMAGIC  mismatch of id bytes in \fIbytes\fP
   PCRE2_ERROR_BADMODE   mismatch of variable unit size or PCRE version
-  PCRE2_ERROR_MEMORY    memory allocation failed
+  PCRE2_ERROR_NOMEMORY  memory allocation failed
   PCRE2_ERROR_NULL      \fIcodes\fP or \fIbytes\fP is NULL
 .sp
 PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index edde3db..28c6033 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "14 December 2021" "PCRE2 10.40"
+.TH PCRE2API 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@@ -953,7 +953,7 @@
 documentation for more details). If the limit is reached, the negative error
 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
 is built; if it is not, the default is set very large and is essentially
-"unlimited".
+unlimited.
 .P
 A value for the heap limit may also be supplied by an item at the start of a
 pattern of the form
@@ -964,18 +964,18 @@
 less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
 limit is set, less than the default.
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack for recording backtracking points. The more nested backtracking points
-there are (that is, the deeper the search tree), the more memory is needed.
-Heap memory is used only if the initial vector is too small. If the heap limit
-is set to a value less than 21 (in particular, zero) no heap memory will be
-used. In this case, only patterns that do not have a lot of nested backtracking
-can be successfully processed.
+The \fBpcre2_match()\fP function always needs some heap memory, so setting a
+value of zero guarantees a "heap limit exceeded" error. Details of how
+\fBpcre2_match()\fP uses the heap are given in the
+.\" HREF
+\fBpcre2perform\fP
+.\"
+documentation.
 .P
-Similarly, for \fBpcre2_dfa_match()\fP, a vector on the system stack is used
-when processing pattern recursions, lookarounds, or atomic groups, and only if
-this is not big enough is heap memory used. In this case, too, setting a value
-of zero disables the use of the heap.
+For \fBpcre2_dfa_match()\fP, a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
 .sp
 .nf
 .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
@@ -1019,10 +1019,10 @@
 .fi
 .sp
 This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
-Each time a nested backtracking point is passed, a new memory "frame" is used
+Each time a nested backtracking point is passed, a new memory frame is used
 to remember the state of matching at that point. Thus, this parameter
 indirectly limits the amount of memory that is used in a match. However,
-because the size of each memory "frame" depends on the number of capturing
+because the size of each memory frame depends on the number of capturing
 parentheses, the actual memory limit varies from pattern to pattern. This limit
 was more useful in versions before 10.30, where function recursion was used for
 backtracking.
@@ -1323,8 +1323,7 @@
 NULL immediately. Otherwise, the variables to which these point are set to an
 error code and an offset (number of code units) within the pattern,
 respectively, when \fBpcre2_compile()\fP returns NULL because a compilation
-error has occurred. The values are not defined when compilation is successful
-and \fBpcre2_compile()\fP returns a non-NULL value.
+error has occurred.
 .P
 There are nearly 100 positive error codes that \fBpcre2_compile()\fP may return
 if it finds an error in the pattern. There are also some negative error codes
@@ -1343,14 +1342,17 @@
 below)
 .\"
 should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
-for both positive and negative error codes in \fBpcre2.h\fP.
+for both positive and negative error codes in \fBpcre2.h\fP. When compilation
+is successful \fIerrorcode\fP is set to a value that returns the message "no
+error" if passed to \fBpcre2_get_error_message()\fP.
 .P
 The value returned in \fIerroroffset\fP is an indication of where in the
-pattern the error occurred. It is not necessarily the furthest point in the
-pattern that was read. For example, after the error "lookbehind assertion is
-not fixed length", the error offset points to the start of the failing
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
-first code unit of the failing character.
+pattern an error occurred. When there is no error, zero is returned. A non-zero
+value is not necessarily the furthest point in the pattern that was read. For
+example, after the error "lookbehind assertion is not fixed length", the error
+offset points to the start of the failing assertion. For an invalid UTF-8 or
+UTF-16 string, the offset is that of the first code unit of the failing
+character.
 .P
 Some errors are not detected until the whole pattern has been scanned; in these
 cases, the offset passed back is the length of the pattern. Note that the
@@ -2517,7 +2519,9 @@
 A minimum of at least 1 pair is imposed by \fBpcre2_match_data_create()\fP, so
 it is always possible to return the overall matched string in the case of
 \fBpcre2_match()\fP or the longest match in the case of
-\fBpcre2_dfa_match()\fP.
+\fBpcre2_dfa_match()\fP. The maximum number of pairs is 65535; if the the first
+argument of \fBpcre2_match_data_create()\fP is greater than this, 65535 is
+used.
 .P
 The second argument of \fBpcre2_match_data_create()\fP is a pointer to a
 general context, which can specify custom memory management for obtaining the
@@ -3160,11 +3164,11 @@
 .sp
   PCRE2_ERROR_NOMEMORY
 .sp
-If a pattern contains many nested backtracking points, heap memory is used to
-remember them. This error is given when the memory allocation function (default
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
+Heap memory is used to remember backgracking points. This error is given when
+the memory allocation function (default or custom) fails. Note that a different
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
 .sp
   PCRE2_ERROR_NULL
 .sp
@@ -4025,6 +4029,6 @@
 .rs
 .sp
 .nf
-Last updated: 14 December 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
diff --git a/doc/pcre2build.3 b/doc/pcre2build.3
index 5fca3dc..36728c8 100644
--- a/doc/pcre2build.3
+++ b/doc/pcre2build.3
@@ -1,4 +1,4 @@
-.TH PCRE2BUILD 3 "08 December 2021" "PCRE2 10.40"
+.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .
@@ -278,12 +278,11 @@
 \fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
 counting is done differently).
 .P
-The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
-stack to record backtracking points. The more nested backtracking points there
-are (that is, the deeper the search tree), the more memory is needed. If the
-initial vector is not large enough, heap memory is used, up to a certain limit,
-which is specified in kibibytes (units of 1024 bytes). The limit can be changed
-at run time, as described in the
+The \fBpcre2_match()\fP function uses heap memory to record backtracking
+points. The more nested backtracking points there are (that is, the deeper the
+search tree), the more memory is needed. There is an upper limit, specified in
+kibibytes (units of 1024 bytes). This limit can be changed at run time, as
+described in the
 .\" HREF
 \fBpcre2api\fP
 .\"
@@ -625,7 +624,7 @@
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@@ -634,6 +633,6 @@
 .rs
 .sp
 .nf
-Last updated: 08 December 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3
index 57615a5..89c9497 100644
--- a/doc/pcre2demo.3
+++ b/doc/pcre2demo.3
@@ -1,3 +1,6 @@
+.SH NAME
+// - A demonstration C program for PCRE2 - //
+.sp
 .\" Start example.
 .de EX
 .  nr mE \\n(.f
diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
index 1081591..956633d 100644
--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "31 August 2021" "PCRE2 10.38"
+.TH PCRE2GREP 1 "21 November 2022" "PCRE2 10.41"
 .SH NAME
 pcre2grep - a grep with Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -43,13 +43,15 @@
 .sp
   pcre2grep some-pattern file1 - file3
 .sp
-Input files are searched line by line. By default, each line that matches a
+By default, input files are searched line by line. Each line that matches a
 pattern is copied to the standard output, and if there is more than one file,
 the file name is output at the start of each line, followed by a colon.
-However, there are options that can change how \fBpcre2grep\fP behaves. In
-particular, the \fB-M\fP option makes it possible to search for strings that
-span line boundaries. What defines a line boundary is controlled by the
-\fB-N\fP (\fB--newline\fP) option.
+However, there are options that can change how \fBpcre2grep\fP behaves. For
+example, the \fB-M\fP option makes it possible to search for strings that span
+line boundaries. What defines a line boundary is controlled by the \fB-N\fP
+(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
+not file names are shown, and the \fB-Z\fP option changes the file name
+terminator to a zero byte.
 .P
 The amount of memory used for buffering files that are being scanned is
 controlled by parameters that can be set by the \fB--buffer-size\fP and
@@ -74,18 +76,22 @@
 .P
 By default, as soon as one pattern matches a line, no further patterns are
 considered. However, if \fB--colour\fP (or \fB--color\fP) is used to colour the
-matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
-\fB--line-offsets\fP is used to output only the part of the line that matched
-(either shown literally, or as an offset), scanning resumes immediately
-following the match, so that further matches on the same line can be found. If
-there are multiple patterns, they are all tried on the remainder of the line,
-but patterns that follow the one that matched are not tried on the earlier
-matched part of the line.
+matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP,
+\fB--line-offsets\fP, or \fB--output\fP is used to output only the part of the
+line that matched (either shown literally, or as an offset), the behaviour is
+different. In this situation, all the patterns are applied to the line. If
+there is more than one match, the one that begins nearest to the start of the
+subject is processed; if there is more than one match at that position, the one
+with the longest matching substring is processed; if the matching substrings
+are equal, the first match found is processed.
 .P
-This behaviour means that the order in which multiple patterns are specified
-can affect the output when one of the above options is used. This is no longer
-the same behaviour as GNU grep, which now manages to display earlier matches
-for later patterns (as long as there is no overlap).
+Scanning with all the patterns resumes immediately following the match, so that
+later matches on the same line can be found. Note, however, that an overlapping
+match that starts in the middle of another match will not be processed.
+.P
+The above behaviour was changed at release 10.41 to be more compatible with GNU
+grep. In earlier releases, \fBpcre2grep\fP did not recognize matches from
+later patterns that were earlier in the subject.
 .P
 Patterns that can match an empty string are accepted, but empty string
 matches are never recognized. An example is the pattern "(super)?(man)?", in
@@ -101,14 +107,15 @@
 .SH "SUPPORT FOR COMPRESSED FILES"
 .rs
 .sp
-It is possible to compile \fBpcre2grep\fP so that it uses \fBlibz\fP or
-\fBlibbz2\fP to read compressed files whose names end in \fB.gz\fP or
+Compile-time options for \fBpcre2grep\fP can set it up to use \fBlibz\fP or
+\fBlibbz2\fP for reading compressed files whose names end in \fB.gz\fP or
 \fB.bz2\fP, respectively. You can find out whether your \fBpcre2grep\fP binary
 has support for one or both of these file types by running it with the
 \fB--help\fP option. If the appropriate support is not present, all files are
-treated as plain text. The standard input is always so treated. When input is
-from a compressed .gz or .bz2 file, the \fB--line-buffered\fP option is
-ignored.
+treated as plain text. The standard input is always so treated. If a file with
+a \fB.gz\fP or \fB.bz2\fP extension is not in fact compressed, it is read as a
+plain text file. When input is from a compressed .gz or .bz2 file, the
+\fB--line-buffered\fP option is ignored.
 .
 .
 .SH "BINARY FILES"
@@ -149,9 +156,11 @@
 lines are output if the next match or the end of the file is reached, or if the
 processing buffer size has been set too small. If file names and/or line
 numbers are being output, a hyphen separator is used instead of a colon for the
-context lines. A line containing "--" is output between each group of lines,
-unless they are in fact contiguous in the input file. The value of \fInumber\fP
-is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
+context lines (the \fB-Z\fP option can be used to change the file name
+terminator to a zero byte). A line containing "--" is output between each group
+of lines, unless they are in fact contiguous in the input file. The value of
+\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
+\fB-A\fP is ignored.
 .TP
 \fB-a\fP, \fB--text\fP
 Treat binary files as text. This is equivalent to
@@ -167,9 +176,10 @@
 lines are output if the previous match or the start of the file is within
 \fInumber\fP lines, or if the processing buffer size has been set too small. If
 file names and/or line numbers are being output, a hyphen separator is used
-instead of a colon for the context lines. A line containing "--" is output
-between each group of lines, unless they are in fact contiguous in the input
-file. The value of \fInumber\fP is expected to be relatively small. When
+instead of a colon for the context lines (the \fB-Z\fP option can be used to
+change the file name terminator to a zero byte). A line containing "--" is
+output between each group of lines, unless they are in fact contiguous in the
+input file. The value of \fInumber\fP is expected to be relatively small. When
 \fB-c\fP is used, \fB-B\fP is ignored.
 .TP
 \fB--binary-files=\fP\fIword\fP
@@ -215,12 +225,14 @@
 .TP
 \fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
 This option specifies under what circumstances the parts of a line that matched
-a pattern should be coloured in the output. By default, the output is not
-coloured. The value (which is optional, see above) may be "never", "always", or
-"auto". In the latter case, colouring happens only if the standard output is
-connected to a terminal. More resources are used when colouring is enabled,
-because \fBpcre2grep\fP has to search for all possible matches in a line, not
-just one, in order to colour them all.
+a pattern should be coloured in the output. It is ignored if
+\fB--file-offsets\fP, \fB--line-offsets\fP, or \fB--output\fP is set. By
+default, output is not coloured. The value for the \fB--colour\fP option (which
+is optional, see above) may be "never", "always", or "auto". In the latter
+case, colouring happens only if the standard output is connected to a terminal.
+More resources are used when colouring is enabled, because \fBpcre2grep\fP has
+to search for all possible matches in a line, not just one, in order to colour
+them all.
 .sp
 The colour that is used can be specified by setting one of the environment
 variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or
@@ -262,17 +274,11 @@
 single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
 pattern is taken from the command line; all arguments are treated as file
 names. There is no limit to the number of patterns. They are applied to each
-line in the order in which they are defined until one matches.
+line in the order in which they are defined.
 .sp
 If \fB-f\fP is used with \fB-e\fP, the command line patterns are matched first,
 followed by the patterns from the file(s), independent of the order in which
-these options are specified. Note that multiple use of \fB-e\fP is not the same
-as a single pattern with alternatives. For example, X|Y finds the first
-character in a line that is X or Y, whereas if the two patterns are given
-separately, with X first, \fBpcre2grep\fP finds X if it is present, even if it
-follows Y in the line. It finds Y only if there is no X in the line. This
-matters only if you are using \fB-o\fP or \fB--colo(u)r\fP to show the part(s)
-of the line that matched.
+these options are specified.
 .TP
 \fB--exclude\fP=\fIpattern\fP
 Files (but not directories) whose names match the pattern are skipped without
@@ -316,22 +322,19 @@
 \fB--exclude\fP options.
 .TP
 \fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP
-Read patterns from the file, one per line, and match them against each line of
-input. As is the case with patterns on the command line, no delimiters should
-be used. What constitutes a newline when reading the file is the operating
-system's default interpretation of \en. The \fB--newline\fP option has no
-effect on this option. Trailing white space is removed from each line, and
-blank lines are ignored. An empty file contains no patterns and therefore
-matches nothing. Patterns read from a file in this way may contain binary
-zeros, which are treated as ordinary data characters. See also the comments
-about multiple patterns versus a single pattern with alternatives in the
-description of \fB-e\fP above.
+Read patterns from the file, one per line. As is the case with patterns on the
+command line, no delimiters should be used. What constitutes a newline when
+reading the file is the operating system's default interpretation of \en. The
+\fB--newline\fP option has no effect on this option. Trailing white space is
+removed from each line, and blank lines are ignored. An empty file contains no
+patterns and therefore matches nothing. Patterns read from a file in this way
+may contain binary zeros, which are treated as ordinary data characters.
 .sp
 If this option is given more than once, all the specified files are read. A
 data line is output if any of the patterns match it. A file name can be given
 as "-" to refer to the standard input. When \fB-f\fP is used, patterns
 specified on the command line using \fB-e\fP may also be present; they are
-tested before the file's patterns. However, no other pattern is taken from the
+matched before the file's patterns. However, no pattern is taken from the
 command line; all arguments are treated as the names of paths to be searched.
 .TP
 \fB--file-list\fP=\fIfilename\fP
@@ -349,26 +352,28 @@
 \fB--file-offsets\fP
 Instead of showing lines or parts of lines that match, show each match as an
 offset from the start of the file and a length, separated by a comma. In this
-mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
-options are ignored. If there is more than one match in a line, each of them is
-shown separately. This option is mutually exclusive with \fB--output\fP,
-\fB--line-offsets\fP, and \fB--only-matching\fP.
+mode, \fB--colour\fP has no effect, and no context is shown. That is, the
+\fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one
+match in a line, each of them is shown separately. This option is mutually
+exclusive with \fB--output\fP, \fB--line-offsets\fP, and \fB--only-matching\fP.
 .TP
 \fB-H\fP, \fB--with-filename\fP
 Force the inclusion of the file name at the start of output lines when
-searching a single file. By default, the file name is not shown in this case.
-For matching lines, the file name is followed by a colon; for context lines, a
-hyphen separator is used. If a line number is also being output, it follows the
-file name. When the \fB-M\fP option causes a pattern to match more than one
-line, only the first is preceded by the file name. This option overrides any
-previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
+searching a single file. The file name is not normally shown in this case.
+By default, for matching lines, the file name is followed by a colon; for
+context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
+change the terminator to a zero byte. If a line number is also being output,
+it follows the file name. When the \fB-M\fP option causes a pattern to match
+more than one line, only the first is preceded by the file name. This option
+overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
 .TP
 \fB-h\fP, \fB--no-filename\fP
-Suppress the output file names when searching multiple files. By default,
-file names are shown when multiple files are searched. For matching lines, the
-file name is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name. This option
-overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
+Suppress the output file names when searching multiple files. File names are
+normally shown when multiple files are searched. By default, for matching
+lines, the file name is followed by a colon; for context lines, a hyphen
+separator is used. The \fB-Z\fP option can be used to change the terminator to
+a zero byte. If a line number is also being output, it follows the file name.
+This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
 .TP
 \fB--heap-limit\fP=\fInumber\fP
 See \fB--match-limit\fP below.
@@ -417,17 +422,19 @@
 \fB-L\fP, \fB--files-without-match\fP
 Instead of outputting lines from the files, just output the names of the files
 that do not contain any lines that would have been output. Each file name is
-output once, on a separate line. This option overrides any previous \fB-H\fP,
-\fB-h\fP, or \fB-l\fP options.
+output once, on a separate line by default, but if the \fB-Z\fP option is set,
+they are separated by zero bytes instead of newlines. This option overrides any
+previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
 .TP
 \fB-l\fP, \fB--files-with-matches\fP
 Instead of outputting lines from the files, just output the names of the files
 containing lines that would have been output. Each file name is output once, on
-a separate line. Searching normally stops as soon as a matching line is found
-in a file. However, if the \fB-c\fP (count) option is also used, matching
-continues in order to obtain the correct count, and those files that have at
-least one match are listed along with their counts. Using this option with
-\fB-c\fP is a way of suppressing the listing of files with no matches that
+a separate line, but if the \fB-Z\fP option is set, they are separated by zero
+bytes instead of newlines. Searching normally stops as soon as a matching line
+is found in a file. However, if the \fB-c\fP (count) option is also used,
+matching continues in order to obtain the correct count, and those files that
+have at least one match are listed along with their counts. Using this option
+with \fB-c\fP is a way of suppressing the listing of files with no matches that
 occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
 \fB-h\fP, or \fB-L\fP options.
 .TP
@@ -452,11 +459,11 @@
 Instead of showing lines or parts of lines that match, show each match as a
 line number, the offset from the start of the line, and a length. The line
 number is terminated by a colon (as usual; see the \fB-n\fP option), and the
-offset and length are separated by a comma. In this mode, no context is shown.
-That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is
-more than one match in a line, each of them is shown separately. This option is
-mutually exclusive with \fB--output\fP, \fB--file-offsets\fP, and
-\fB--only-matching\fP.
+offset and length are separated by a comma. In this mode, \fB--colour\fP has no
+effect, and no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
+options are ignored. If there is more than one match in a line, each of them is
+shown separately. This option is mutually exclusive with \fB--output\fP,
+\fB--file-offsets\fP, and \fB--only-matching\fP.
 .TP
 \fB--locale\fP=\fIlocale-name\fP
 This option specifies a locale to be used for pattern matching. It overrides
@@ -516,10 +523,7 @@
 value set by \fB--match-limit\fP is reached, an error occurs.
 .sp
 The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
-1024 bytes), the amount of heap memory that may be used for matching. Heap
-memory is needed only if matching the pattern requires a significant number of
-nested backtracking points to be remembered. This parameter can be set to zero
-to forbid the use of heap memory altogether.
+1024 bytes), the maximum amount of heap memory that may be used for matching.
 .sp
 The \fB--depth-limit\fP option limits the depth of nested backtracking points,
 which indirectly limits the amount of memory that is used. The amount of memory
@@ -582,12 +586,12 @@
 \fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
 When there is a match, instead of outputting the line that matched, output just
 the text specified in this option, followed by an operating-system standard
-newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
-and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
-this option, which is mutually exclusive with \fB--only-matching\fP,
-\fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
-\fB--only-matching\fP, if there is more than one match in a line, each of them
-causes a line of output.
+newline. In this mode, \fB--colour\fP has no effect, and no context is shown.
+That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. The
+\fB--newline\fP option has no effect on this option, which is mutually
+exclusive with \fB--only-matching\fP, \fB--file-offsets\fP, and
+\fB--line-offsets\fP. However, like \fB--only-matching\fP, if there is more
+than one match in a line, each of them causes a line of output.
 .sp
 Escape sequences starting with a dollar character may be used to insert the
 contents of the matched part of the line and/or captured substrings into the
@@ -732,6 +736,12 @@
 pattern and ")$" at the end. This option applies only to the patterns that are
 matched against the contents of files; it does not apply to patterns specified
 by any of the \fB--include\fP or \fB--exclude\fP options.
+.TP
+\fB-Z\fP, \fB--null\fP
+Terminate files names in the regular output with a zero byte (the NUL
+character) instead of what would normally appear. This is useful when file
+names contain unusual characters such as colons, hyphens, or even newlines. The
+option does not apply to file names in error messages.
 .
 .
 .SH "ENVIRONMENT VARIABLES"
@@ -960,6 +970,6 @@
 .rs
 .sp
 .nf
-Last updated: 31 August 2021
-Copyright (c) 1997-2021 University of Cambridge.
+Last updated: 21 November 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt
index 8e9c757..adc1d89 100644
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt
@@ -42,13 +42,15 @@
 
          pcre2grep some-pattern file1 - file3
 
-       Input files are searched line by  line.  By  default,  each  line  that
+       By default, input files are searched  line  by  line.  Each  line  that
        matches  a  pattern  is  copied to the standard output, and if there is
        more than one file, the file name is output at the start of each  line,
        followed  by  a  colon.  However, there are options that can change how
-       pcre2grep behaves. In particular, the -M option makes  it  possible  to
+       pcre2grep behaves. For example, the -M  option  makes  it  possible  to
        search  for  strings  that  span  line  boundaries. What defines a line
-       boundary is controlled by the -N (--newline) option.
+       boundary is controlled by the -N (--newline) option. The -h and -H  op-
+       tions  control  whether  or not file names are shown, and the -Z option
+       changes the file name terminator to a zero byte.
 
        The amount of memory used for buffering files that are being scanned is
        controlled  by  parameters  that  can  be  set by the --buffer-size and
@@ -74,107 +76,117 @@
 
        By default, as soon as one pattern matches a line, no further  patterns
        are considered. However, if --colour (or --color) is used to colour the
-       matching substrings, or if --only-matching, --file-offsets, or  --line-
-       offsets  is  used to output only the part of the line that matched (ei-
-       ther shown literally, or as an offset),  scanning  resumes  immediately
-       following  the  match,  so that further matches on the same line can be
-       found. If there are multiple patterns, they are all tried  on  the  re-
-       mainder  of the line, but patterns that follow the one that matched are
-       not tried on the earlier matched part of the line.
+       matching substrings, or if --only-matching, --file-offsets, --line-off-
+       sets,  or  --output  is  used  to output only the part of the line that
+       matched (either shown literally, or as an  offset),  the  behaviour  is
+       different. In this situation, all the patterns are applied to the line.
+       If there is more than one match, the one that  begins  nearest  to  the
+       start  of  the subject is processed; if there is more than one match at
+       that position, the one with the  longest  matching  substring  is  pro-
+       cessed;  if the matching substrings are equal, the first match found is
+       processed.
 
-       This behaviour means that the order  in  which  multiple  patterns  are
-       specified  can affect the output when one of the above options is used.
-       This is no longer the same behaviour as GNU grep, which now manages  to
-       display  earlier  matches  for  later  patterns (as long as there is no
-       overlap).
+       Scanning with all the patterns resumes immediately following the match,
+       so  that  later  matches  on the same line can be found. Note, however,
+       that an overlapping match that starts in the middle  of  another  match
+       will not be processed.
 
-       Patterns that can match an empty string are accepted, but empty  string
-       matches   are  never  recognized.  An  example  is  the  pattern  "(su-
-       per)?(man)?", in which all components are optional. This pattern  finds
-       all  occurrences  of  both  "super"  and "man"; the output differs from
-       matching with "super|man" when only the matching substrings  are  being
+       The  above behaviour was changed at release 10.41 to be more compatible
+       with GNU grep. In earlier releases, pcre2grep did not recognize matches
+       from later patterns that were earlier in the subject.
+
+       Patterns  that can match an empty string are accepted, but empty string
+       matches  are  never  recognized.  An  example  is  the  pattern   "(su-
+       per)?(man)?",  in which all components are optional. This pattern finds
+       all occurrences of both "super" and  "man";  the  output  differs  from
+       matching  with  "super|man" when only the matching substrings are being
        shown.
 
-       If  the  LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses
+       If the LC_ALL or LC_CTYPE environment variable is set,  pcre2grep  uses
        the value to set a locale when calling the PCRE2 library.  The --locale
        option can be used to override this.
 
 
 SUPPORT FOR COMPRESSED FILES
 
-       It  is  possible to compile pcre2grep so that it uses libz or libbz2 to
-       read compressed files whose names end in .gz or .bz2, respectively. You
-       can  find out whether your pcre2grep binary has support for one or both
-       of these file types by running it with the --help option. If the appro-
-       priate support is not present, all files are treated as plain text. The
-       standard input is always so treated. When input is  from  a  compressed
-       .gz or .bz2 file, the --line-buffered option is ignored.
+       Compile-time options for pcre2grep can set it up to use libz or  libbz2
+       for  reading  compressed  files whose names end in .gz or .bz2, respec-
+       tively. You can find out whether your pcre2grep binary has support  for
+       one  or  both of these file types by running it with the --help option.
+       If the appropriate support is not present, all  files  are  treated  as
+       plain  text.  The standard input is always so treated. If a file with a
+       .gz or .bz2 extension is not in fact compressed, it is read as a  plain
+       text  file.  When  input  is  from  a  compressed .gz or .bz2 file, the
+       --line-buffered option is ignored.
 
 
 BINARY FILES
 
-       By  default,  a  file that contains a binary zero byte within the first
-       1024 bytes is identified as a binary file, and is processed  specially.
-       However,  if  the  newline  type is specified as NUL, that is, the line
+       By default, a file that contains a binary zero byte  within  the  first
+       1024  bytes is identified as a binary file, and is processed specially.
+       However, if the newline type is specified as NUL,  that  is,  the  line
        terminator is a binary zero, the test for a binary file is not applied.
-       See  the  --binary-files  option for a means of changing the way binary
+       See the --binary-files option for a means of changing  the  way  binary
        files are handled.
 
 
 BINARY ZEROS IN PATTERNS
 
-       Patterns passed from the command line are strings that  are  terminated
-       by  a  binary zero, so cannot contain internal zeros. However, patterns
+       Patterns  passed  from the command line are strings that are terminated
+       by a binary zero, so cannot contain internal zeros.  However,  patterns
        that are read from a file via the -f option may contain binary zeros.
 
 
 OPTIONS
 
-       The order in which some of the options appear can  affect  the  output.
-       For  example,  both  the  -H and -l options affect the printing of file
-       names. Whichever comes later in the command line will be the  one  that
-       takes  effect.  Similarly,  except  where  noted below, if an option is
-       given twice, the later setting is used. Numerical  values  for  options
-       may  be  followed  by  K  or  M,  to  signify multiplication by 1024 or
+       The  order  in  which some of the options appear can affect the output.
+       For example, both the -H and -l options affect  the  printing  of  file
+       names.  Whichever  comes later in the command line will be the one that
+       takes effect. Similarly, except where noted  below,  if  an  option  is
+       given  twice,  the  later setting is used. Numerical values for options
+       may be followed by K  or  M,  to  signify  multiplication  by  1024  or
        1024*1024 respectively.
 
        --        This terminates the list of options. It is useful if the next
-                 item  on  the command line starts with a hyphen but is not an
-                 option. This allows for the processing of patterns  and  file
+                 item on the command line starts with a hyphen but is  not  an
+                 option.  This  allows for the processing of patterns and file
                  names that start with hyphens.
 
        -A number, --after-context=number
-                 Output  up  to  number  lines  of context after each matching
-                 line. Fewer lines are output if the next match or the end  of
-                 the  file  is  reached,  or if the processing buffer size has
+                 Output up to number lines  of  context  after  each  matching
+                 line.  Fewer lines are output if the next match or the end of
+                 the file is reached, or if the  processing  buffer  size  has
                  been set too small. If file names and/or line numbers are be-
                  ing output, a hyphen separator is used instead of a colon for
-                 the context lines. A line containing "--" is  output  between
-                 each  group  of  lines, unless they are in fact contiguous in
-                 the input file. The value of number is expected to  be  rela-
-                 tively small. When -c is used, -A is ignored.
+                 the  context  lines  (the -Z option can be used to change the
+                 file name terminator to a zero byte). A line containing  "--"
+                 is  output  between  each  group of lines, unless they are in
+                 fact contiguous in the input file. The value of number is ex-
+                 pected  to  be  relatively  small. When -c is used, -A is ig-
+                 nored.
 
        -a, --text
-                 Treat  binary  files as text. This is equivalent to --binary-
+                 Treat binary files as text. This is equivalent  to  --binary-
                  files=text.
 
        --allow-lookaround-bsk
                  PCRE2 now forbids the use of \K in lookarounds by default, in
-                 line  with  Perl.   This  option  causes pcre2grep to set the
-                 PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option, which  enables  this
+                 line with Perl.  This option  causes  pcre2grep  to  set  the
+                 PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK  option,  which enables this
                  somewhat dangerous usage.
 
        -B number, --before-context=number
-                 Output  up  to  number  lines of context before each matching
-                 line. Fewer lines are output if the  previous  match  or  the
-                 start  of the file is within number lines, or if the process-
-                 ing buffer size has been set too small. If file names  and/or
+                 Output up to number lines of  context  before  each  matching
+                 line.  Fewer  lines  are  output if the previous match or the
+                 start of the file is within number lines, or if the  process-
+                 ing  buffer size has been set too small. If file names and/or
                  line numbers are being output, a hyphen separator is used in-
-                 stead of a colon for the context  lines.  A  line  containing
-                 "--"  is  output between each group of lines, unless they are
-                 in fact contiguous in the input file. The value of number  is
-                 expected  to  be relatively small. When -c is used, -B is ig-
-                 nored.
+                 stead  of a colon for the context lines (the -Z option can be
+                 used to change the file name terminator to a  zero  byte).  A
+                 line  containing  "--" is output between each group of lines,
+                 unless they are in fact contiguous in  the  input  file.  The
+                 value  of  number is expected to be relatively small. When -c
+                 is used, -B is ignored.
 
        --binary-files=word
                  Specify how binary files are to be processed. If the word  is
@@ -226,48 +238,49 @@
        --colour=value, --color=value
                  This option specifies under what circumstances the parts of a
                  line that matched a pattern should be coloured in the output.
-                 By default, the output is not coloured. The value  (which  is
-                 optional,  see above) may be "never", "always", or "auto". In
-                 the latter case, colouring happens only if the standard  out-
-                 put  is connected to a terminal. More resources are used when
-                 colouring is enabled, because pcre2grep has to search for all
-                 possible  matches in a line, not just one, in order to colour
-                 them all.
+                 It is ignored if --file-offsets, --line-offsets, or  --output
+                 is set. By default, output is not coloured. The value for the
+                 --colour  option  (which  is  optional,  see  above)  may  be
+                 "never",  "always",  or "auto". In the latter case, colouring
+                 happens only if the standard output is connected to a  termi-
+                 nal.   More resources are used when colouring is enabled, be-
+                 cause pcre2grep has to search for all possible matches  in  a
+                 line, not just one, in order to colour them all.
 
-                 The colour that is used can be specified by  setting  one  of
-                 the  environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR,
+                 The  colour  that  is used can be specified by setting one of
+                 the environment variables PCRE2GREP_COLOUR,  PCRE2GREP_COLOR,
                  PCREGREP_COLOUR, or PCREGREP_COLOR, which are checked in that
                  order.  If  none  of  these  are  set,  pcre2grep  looks  for
-                 GREP_COLORS or GREP_COLOR (in that order). The value  of  the
-                 variable  should  be  a string of two numbers, separated by a
-                 semicolon, except in the  case  of  GREP_COLORS,  which  must
+                 GREP_COLORS  or  GREP_COLOR (in that order). The value of the
+                 variable should be a string of two numbers,  separated  by  a
+                 semicolon,  except  in  the  case  of GREP_COLORS, which must
                  start with "ms=" or "mt=" followed by two semicolon-separated
-                 colours, terminated by the end of the string or by  a  colon.
-                 If  GREP_COLORS  does not start with "ms=" or "mt=" it is ig-
+                 colours,  terminated  by the end of the string or by a colon.
+                 If GREP_COLORS does not start with "ms=" or "mt=" it  is  ig-
                  nored, and GREP_COLOR is checked.
 
-                 If the string obtained from one of the above  variables  con-
+                 If  the  string obtained from one of the above variables con-
                  tains any characters other than semicolon or digits, the set-
                  ting is ignored and the default colour is used. The string is
                  copied directly into the control string for setting colour on
-                 a terminal, so it is your responsibility to ensure  that  the
-                 values  make  sense.  If  no relevant environment variable is
+                 a  terminal,  so it is your responsibility to ensure that the
+                 values make sense. If no  relevant  environment  variable  is
                  set, the default is "1;31", which gives red.
 
        -D action, --devices=action
-                 If an input path is not a regular file or a  directory,  "ac-
-                 tion"  specifies  how it is to be processed. Valid values are
+                 If  an  input path is not a regular file or a directory, "ac-
+                 tion" specifies how it is to be processed. Valid  values  are
                  "read" (the default) or "skip" (silently skip the path).
 
        -d action, --directories=action
                  If an input path is a directory, "action" specifies how it is
-                 to  be  processed.   Valid  values are "read" (the default in
-                 non-Windows environments, for compatibility with  GNU  grep),
-                 "recurse"  (equivalent to the -r option), or "skip" (silently
-                 skip the path, the default in Windows environments).  In  the
-                 "read"  case,  directories  are read as if they were ordinary
-                 files. In some operating systems the effect of reading a  di-
-                 rectory  like  this is an immediate end-of-file; in others it
+                 to be processed.  Valid values are  "read"  (the  default  in
+                 non-Windows  environments,  for compatibility with GNU grep),
+                 "recurse" (equivalent to the -r option), or "skip"  (silently
+                 skip  the  path, the default in Windows environments). In the
+                 "read" case, directories are read as if  they  were  ordinary
+                 files.  In some operating systems the effect of reading a di-
+                 rectory like this is an immediate end-of-file; in  others  it
                  may provoke an error.
 
        --depth-limit=number
@@ -276,91 +289,80 @@
        -e pattern, --regex=pattern, --regexp=pattern
                  Specify a pattern to be matched. This option can be used mul-
                  tiple times in order to specify several patterns. It can also
-                 be used as a way of specifying a single pattern  that  starts
-                 with  a hyphen. When -e is used, no argument pattern is taken
-                 from the command line; all  arguments  are  treated  as  file
-                 names.  There is no limit to the number of patterns. They are
-                 applied to each line in the order in which they  are  defined
-                 until one matches.
+                 be  used  as a way of specifying a single pattern that starts
+                 with a hyphen. When -e is used, no argument pattern is  taken
+                 from  the  command  line;  all  arguments are treated as file
+                 names. There is no limit to the number of patterns. They  are
+                 applied to each line in the order in which they are defined.
 
                  If  -f is used with -e, the command line patterns are matched
                  first, followed by the patterns from the file(s), independent
-                 of  the order in which these options are specified. Note that
-                 multiple use of -e is not the same as a single  pattern  with
-                 alternatives. For example, X|Y finds the first character in a
-                 line that is X or Y, whereas if the two  patterns  are  given
-                 separately, with X first, pcre2grep finds X if it is present,
-                 even if it follows Y in the line. It finds Y only if there is
-                 no  X  in  the line. This matters only if you are using -o or
-                 --colo(u)r to show the part(s) of the line that matched.
+                 of the order in which these options are specified.
 
        --exclude=pattern
                  Files (but not directories) whose names match the pattern are
-                 skipped  without  being processed. This applies to all files,
-                 whether listed on the command  line,  obtained  from  --file-
+                 skipped without being processed. This applies to  all  files,
+                 whether  listed  on  the  command line, obtained from --file-
                  list, or by scanning a directory. The pattern is a PCRE2 reg-
-                 ular expression, and is matched against the  final  component
+                 ular  expression,  and is matched against the final component
                  of the file name, not the entire path. The -F, -w, and -x op-
-                 tions do not apply to this pattern. The option may  be  given
+                 tions  do  not apply to this pattern. The option may be given
                  any number of times in order to specify multiple patterns. If
-                 a file name matches both an --include and an  --exclude  pat-
+                 a  file  name matches both an --include and an --exclude pat-
                  tern, it is excluded. There is no short form for this option.
 
        --exclude-from=filename
-                 Treat  each  non-empty  line  of  the file as the data for an
+                 Treat each non-empty line of the file  as  the  data  for  an
                  --exclude option. What constitutes a newline when reading the
-                 file  is the operating system's default. The --newline option
-                 has no effect on this option. This option may be  given  more
+                 file is the operating system's default. The --newline  option
+                 has  no  effect on this option. This option may be given more
                  than once in order to specify a number of files to read.
 
        --exclude-dir=pattern
                  Directories whose names match the pattern are skipped without
-                 being processed, whatever the setting of the --recursive  op-
-                 tion.  This applies to all directories, whether listed on the
-                 command line, obtained from --file-list,  or  by  scanning  a
-                 parent  directory. The pattern is a PCRE2 regular expression,
-                 and is matched against the final component of  the  directory
-                 name,  not the entire path. The -F, -w, and -x options do not
-                 apply to this pattern. The option may be given any number  of
-                 times  in order to specify more than one pattern. If a direc-
-                 tory matches both --include-dir and --exclude-dir, it is  ex-
+                 being  processed, whatever the setting of the --recursive op-
+                 tion. This applies to all directories, whether listed on  the
+                 command  line,  obtained  from  --file-list, or by scanning a
+                 parent directory. The pattern is a PCRE2 regular  expression,
+                 and  is  matched against the final component of the directory
+                 name, not the entire path. The -F, -w, and -x options do  not
+                 apply  to this pattern. The option may be given any number of
+                 times in order to specify more than one pattern. If a  direc-
+                 tory  matches both --include-dir and --exclude-dir, it is ex-
                  cluded. There is no short form for this option.
 
        -F, --fixed-strings
-                 Interpret  each  data-matching  pattern  as  a  list of fixed
-                 strings, separated by newlines, instead of as a  regular  ex-
+                 Interpret each data-matching  pattern  as  a  list  of  fixed
+                 strings,  separated  by newlines, instead of as a regular ex-
                  pression. What constitutes a newline for this purpose is con-
                  trolled by the --newline option. The -w (match as a word) and
-                 -x  (match whole line) options can be used with -F.  They ap-
-                 ply to each of the fixed strings. A line is selected  if  any
+                 -x (match whole line) options can be used with -F.  They  ap-
+                 ply  to  each of the fixed strings. A line is selected if any
                  of the fixed strings are found in it (subject to -w or -x, if
-                 present). This option applies only to the patterns  that  are
-                 matched  against  the contents of files; it does not apply to
-                 patterns specified by any of the --include or  --exclude  op-
+                 present).  This  option applies only to the patterns that are
+                 matched against the contents of files; it does not  apply  to
+                 patterns  specified  by any of the --include or --exclude op-
                  tions.
 
        -f filename, --file=filename
-                 Read  patterns  from  the  file, one per line, and match them
-                 against each line of input. As is the case with  patterns  on
-                 the  command line, no delimiters should be used. What consti-
-                 tutes a newline when reading the file is the  operating  sys-
-                 tem's  default interpretation of \n. The --newline option has
-                 no effect on this option. Trailing  white  space  is  removed
-                 from  each  line,  and blank lines are ignored. An empty file
-                 contains no patterns and therefore matches nothing.  Patterns
-                 read  from a file in this way may contain binary zeros, which
-                 are treated as ordinary data characters. See  also  the  com-
-                 ments  about  multiple  patterns versus a single pattern with
-                 alternatives in the description of -e above.
+                 Read patterns from the file, one per line.  As  is  the  case
+                 with  patterns  on  the command line, no delimiters should be
+                 used. What constitutes a newline when reading the file is the
+                 operating  system's  default interpretation of \n. The --new-
+                 line option has no effect  on  this  option.  Trailing  white
+                 space is removed from each line, and blank lines are ignored.
+                 An empty file contains  no  patterns  and  therefore  matches
+                 nothing.  Patterns  read  from a file in this way may contain
+                 binary zeros, which are treated as ordinary data characters.
 
                  If this option is given more than  once,  all  the  specified
                  files  are read. A data line is output if any of the patterns
                  match it. A file name can be given as "-"  to  refer  to  the
                  standard  input.  When  -f is used, patterns specified on the
-                 command line using -e may also be present;  they  are  tested
-                 before  the  file's  patterns.  However,  no other pattern is
-                 taken from the command line; all arguments are treated as the
-                 names of paths to be searched.
+                 command line using -e may also be present; they  are  matched
+                 before the file's patterns. However, no pattern is taken from
+                 the command line; all arguments are treated as the  names  of
+                 paths to be searched.
 
        --file-list=filename
                  Read  a  list  of  files  and/or  directories  that are to be
@@ -379,30 +381,34 @@
        --file-offsets
                  Instead of showing lines or parts of lines that  match,  show
                  each  match  as  an  offset  from the start of the file and a
-                 length, separated by a comma. In this  mode,  no  context  is
-                 shown.  That  is,  the -A, -B, and -C options are ignored. If
-                 there is more than one match in a line, each of them is shown
-                 separately.  This option is mutually exclusive with --output,
-                 --line-offsets, and --only-matching.
+                 length, separated by a comma. In this mode, --colour  has  no
+                 effect,  and no context is shown. That is, the -A, -B, and -C
+                 options are ignored. If there is more than  one  match  in  a
+                 line,  each of them is shown separately. This option is mutu-
+                 ally exclusive with  --output,  --line-offsets,  and  --only-
+                 matching.
 
        -H, --with-filename
-                 Force the inclusion of the file name at the start  of  output
-                 lines when searching a single file. By default, the file name
-                 is not shown in this case.  For matching lines, the file name
-                 is followed by a colon; for context lines, a hyphen separator
-                 is used. If a line number is also being  output,  it  follows
-                 the  file  name. When the -M option causes a pattern to match
-                 more than one line, only the first is preceded  by  the  file
-                 name.  This  option  overrides any previous -h, -l, or -L op-
-                 tions.
+                 Force  the  inclusion of the file name at the start of output
+                 lines when searching a single file. The file name is not nor-
+                 mally  shown  in  this case.  By default, for matching lines,
+                 the file name is followed by a colon; for  context  lines,  a
+                 hyphen separator is used. The -Z option can be used to change
+                 the terminator to a zero byte. If a line number is also being
+                 output, it follows the file name. When the -M option causes a
+                 pattern to match more than one line, only the first  is  pre-
+                 ceded  by  the  file name. This option overrides any previous
+                 -h, -l, or -L options.
 
        -h, --no-filename
                  Suppress the output file names when searching multiple files.
-                 By  default,  file  names  are  shown when multiple files are
-                 searched. For matching lines, the file name is followed by  a
-                 colon;  for  context lines, a hyphen separator is used.  If a
-                 line number is also being output, it follows the  file  name.
-                 This option overrides any previous -H, -L, or -l options.
+                 File  names  are  normally  shown  when  multiple  files  are
+                 searched. By default, for matching lines, the  file  name  is
+                 followed by a colon; for context lines, a hyphen separator is
+                 used. The -Z option can be used to change the terminator to a
+                 zero  byte. If a line number is also being output, it follows
+                 the file name.  This option overrides any previous -H, -L, or
+                 -l options.
 
        --heap-limit=number
                  See --match-limit below.
@@ -455,21 +461,23 @@
                  Instead  of  outputting lines from the files, just output the
                  names of the files that do not contain any lines  that  would
                  have  been  output. Each file name is output once, on a sepa-
-                 rate line. This option overrides any previous -H, -h,  or  -l
-                 options.
+                 rate line by default, but if the -Z option is set,  they  are
+                 separated  by  zero  bytes  instead  of newlines. This option
+                 overrides any previous -H, -h, or -l options.
 
        -l, --files-with-matches
-                 Instead  of  outputting lines from the files, just output the
+                 Instead of outputting lines from the files, just  output  the
                  names of the files containing lines that would have been out-
-                 put.  Each  file  name  is  output  once, on a separate line.
-                 Searching normally stops as soon as a matching line is  found
-                 in  a  file.  However, if the -c (count) option is also used,
-                 matching continues in order to obtain the correct count,  and
-                 those  files  that  have  at least one match are listed along
-                 with their counts. Using this option with -c is a way of sup-
-                 pressing  the  listing  of  files with no matches that occurs
-                 with -c on its own. This option overrides  any  previous  -H,
-                 -h, or -L options.
+                 put. Each file name is output once, on a separate  line,  but
+                 if the -Z option is set, they are separated by zero bytes in-
+                 stead of newlines. Searching normally  stops  as  soon  as  a
+                 matching  line is found in a file. However, if the -c (count)
+                 option is also used, matching continues in  order  to  obtain
+                 the  correct  count,  and  those files that have at least one
+                 match are listed along with their counts. Using  this  option
+                 with  -c is a way of suppressing the listing of files with no
+                 matches that occurs with -c on its own. This option overrides
+                 any previous -H, -h, or -L options.
 
        --label=name
                  This option supplies a name to be used for the standard input
@@ -495,11 +503,11 @@
                  each match as a line number, the offset from the start of the
                  line, and a length. The line number is terminated by a  colon
                  (as  usual; see the -n option), and the offset and length are
-                 separated by a comma. In this  mode,  no  context  is  shown.
-                 That  is, the -A, -B, and -C options are ignored. If there is
-                 more than one match in a line, each of them  is  shown  sepa-
-                 rately.  This  option  is  mutually  exclusive with --output,
-                 --file-offsets, and --only-matching.
+                 separated by a comma. In this mode, --colour has  no  effect,
+                 and  no context is shown. That is, the -A, -B, and -C options
+                 are ignored. If there is more than one match in a line,  each
+                 of  them  is shown separately. This option is mutually exclu-
+                 sive with --output, --file-offsets, and --only-matching.
 
        --locale=locale-name
                  This option specifies a locale to be used for pattern  match-
@@ -571,29 +579,26 @@
                  an error occurs.
 
                  The  --heap-limit  option specifies, as a number of kibibytes
-                 (units of 1024 bytes), the amount of heap memory that may  be
-                 used for matching. Heap memory is needed only if matching the
-                 pattern requires a significant number of nested  backtracking
-                 points to be remembered. This parameter can be set to zero to
-                 forbid the use of heap memory altogether.
+                 (units of 1024 bytes), the maximum amount of heap memory that
+                 may be used for matching.
 
-                 The --depth-limit option limits the  depth  of  nested  back-
+                 The  --depth-limit  option  limits  the depth of nested back-
                  tracking points, which indirectly limits the amount of memory
                  that is used. The amount of memory needed for each backtrack-
-                 ing  point  depends on the number of capturing parentheses in
+                 ing point depends on the number of capturing  parentheses  in
                  the pattern, so the amount of memory that is used before this
-                 limit  acts  varies from pattern to pattern. This limit is of
+                 limit acts varies from pattern to pattern. This limit  is  of
                  use only if it is set smaller than --match-limit.
 
-                 There are no short forms for these options. The default  lim-
-                 its  can  be  set when the PCRE2 library is compiled; if they
-                 are not specified, the defaults are very large and so  effec-
+                 There  are no short forms for these options. The default lim-
+                 its can be set when the PCRE2 library is  compiled;  if  they
+                 are  not specified, the defaults are very large and so effec-
                  tively unlimited.
 
        --max-buffer-size=number
-                 This  limits  the  expansion  of the processing buffer, whose
-                 initial size can be set by --buffer-size. The maximum  buffer
-                 size  is  silently  forced to be no smaller than the starting
+                 This limits the expansion of  the  processing  buffer,  whose
+                 initial  size can be set by --buffer-size. The maximum buffer
+                 size is silently forced to be no smaller  than  the  starting
                  buffer size.
 
        -N newline-type, --newline=newline-type
@@ -602,216 +607,223 @@
 
                    pcre2grep -N CRLF 'some pattern' <file>
 
-                 The  newline  type may be specified in upper, lower, or mixed
-                 case. If the newline type is NUL, lines are separated by  bi-
-                 nary  zero characters. The other types are the single-charac-
-                 ter sequences CR (carriage return)  and  LF  (linefeed),  the
-                 two-character  sequence CRLF, an "anycrlf" type, which recog-
-                 nizes any of the preceding three types, and  an  "any"  type,
-                 for  which any Unicode line ending sequence is assumed to end
-                 a line. The Unicode sequences are the three  just  mentioned,
-                 plus  VT  (vertical tab, U+000B), FF (form feed, U+000C), NEL
-                 (next line, U+0085), LS  (line  separator,  U+2028),  and  PS
+                 The newline type may be specified in upper, lower,  or  mixed
+                 case.  If the newline type is NUL, lines are separated by bi-
+                 nary zero characters. The other types are the  single-charac-
+                 ter  sequences  CR  (carriage  return) and LF (linefeed), the
+                 two-character sequence CRLF, an "anycrlf" type, which  recog-
+                 nizes  any  of  the preceding three types, and an "any" type,
+                 for which any Unicode line ending sequence is assumed to  end
+                 a  line.  The Unicode sequences are the three just mentioned,
+                 plus VT (vertical tab, U+000B), FF (form feed,  U+000C),  NEL
+                 (next  line,  U+0085),  LS  (line  separator, U+2028), and PS
                  (paragraph separator, U+2029).
 
-                 When  the  PCRE2  library is built, a default line-ending se-
-                 quence is specified.  This is normally the standard  sequence
-                 for  the operating system. Unless otherwise specified by this
+                 When the PCRE2 library is built, a  default  line-ending  se-
+                 quence  is specified.  This is normally the standard sequence
+                 for the operating system. Unless otherwise specified by  this
                  option, pcre2grep uses the library's default.
 
-                 This option makes it possible to use pcre2grep to scan  files
+                 This  option makes it possible to use pcre2grep to scan files
                  that have come from other environments without having to mod-
-                 ify their line endings. If the data  that  is  being  scanned
-                 does  not  agree  with  the  convention  set  by this option,
-                 pcre2grep may behave in strange ways. Note that  this  option
-                 does  not apply to files specified by the -f, --exclude-from,
-                 or --include-from options, which are expected to use the  op-
+                 ify  their  line  endings.  If the data that is being scanned
+                 does not agree  with  the  convention  set  by  this  option,
+                 pcre2grep  may  behave in strange ways. Note that this option
+                 does not apply to files specified by the -f,  --exclude-from,
+                 or  --include-from options, which are expected to use the op-
                  erating system's standard newline sequence.
 
        -n, --line-number
                  Precede each output line by its line number in the file, fol-
-                 lowed by a colon for matching lines or a hyphen  for  context
+                 lowed  by  a colon for matching lines or a hyphen for context
                  lines. If the file name is also being output, it precedes the
-                 line number. When the -M option causes  a  pattern  to  match
-                 more  than  one  line, only the first is preceded by its line
+                 line  number.  When  the  -M option causes a pattern to match
+                 more than one line, only the first is preceded  by  its  line
                  number. This option is forced if --line-offsets is used.
 
-       --no-jit  If the PCRE2 library is built with support  for  just-in-time
+       --no-jit  If  the  PCRE2 library is built with support for just-in-time
                  compiling (which speeds up matching), pcre2grep automatically
                  makes use of this, unless it was explicitly disabled at build
-                 time.  This  option  can be used to disable the use of JIT at
-                 run time. It is provided for testing and working round  prob-
+                 time. This option can be used to disable the use  of  JIT  at
+                 run  time. It is provided for testing and working round prob-
                  lems.  It should never be needed in normal use.
 
        -O text, --output=text
-                 When  there  is  a match, instead of outputting the line that
-                 matched, output just the text specified in this option,  fol-
-                 lowed  by an operating-system standard newline. In this mode,
-                 no context is shown. That is, the -A, -B, and -C options  are
-                 ignored.  The  --newline option has no effect on this option,
-                 which is mutually exclusive with --only-matching, --file-off-
-                 sets,  and  --line-offsets. However, like --only-matching, if
-                 there is more than one match in a line, each of them causes a
-                 line of output.
+                 When there is a match, instead of outputting  the  line  that
+                 matched,  output just the text specified in this option, fol-
+                 lowed by an operating-system standard newline. In this  mode,
+                 --colour  has  no  effect, and no context is shown.  That is,
+                 the -A, -B, and -C options are ignored. The --newline  option
+                 has  no  effect  on  this option, which is mutually exclusive
+                 with  --only-matching,  --file-offsets,  and  --line-offsets.
+                 However,  like  --only-matching,  if  there  is more than one
+                 match in a line, each of them causes a line of output.
 
                  Escape sequences starting with a dollar character may be used
                  to insert the contents of the matched part of the line and/or
                  captured substrings into the text.
 
-                 $<digits>  or  ${<digits>}  is  replaced by the captured sub-
-                 string of the given  decimal  number;  zero  substitutes  the
+                 $<digits> or ${<digits>} is replaced  by  the  captured  sub-
+                 string  of  the  given  decimal  number; zero substitutes the
                  whole match. If the number is greater than the number of cap-
-                 turing substrings, or if the capture is unset,  the  replace-
+                 turing  substrings,  or if the capture is unset, the replace-
                  ment is empty.
 
-                 $a  is replaced by bell; $b by backspace; $e by escape; $f by
-                 form feed; $n by newline; $r by carriage return; $t  by  tab;
+                 $a is replaced by bell; $b by backspace; $e by escape; $f  by
+                 form  feed;  $n by newline; $r by carriage return; $t by tab;
                  $v by vertical tab.
 
                  $o<digits> or $o{<digits>} is replaced by the character whose
-                 code point is the given octal number. In the first  form,  up
-                 to  three  octal  digits are processed.  When more digits are
-                 needed in Unicode mode to specify a wide character, the  sec-
+                 code  point  is the given octal number. In the first form, up
+                 to three octal digits are processed.  When  more  digits  are
+                 needed  in Unicode mode to specify a wide character, the sec-
                  ond form must be used.
 
-                 $x<digits>  or $x{<digits>} is replaced by the character rep-
-                 resented by the given hexadecimal number. In the first  form,
-                 up  to two hexadecimal digits are processed. When more digits
-                 are needed in Unicode mode to specify a wide  character,  the
+                 $x<digits> or $x{<digits>} is replaced by the character  rep-
+                 resented  by the given hexadecimal number. In the first form,
+                 up to two hexadecimal digits are processed. When more  digits
+                 are  needed  in Unicode mode to specify a wide character, the
                  second form must be used.
 
-                 Any  other character is substituted by itself. In particular,
+                 Any other character is substituted by itself. In  particular,
                  $$ is replaced by a single dollar.
 
        -o, --only-matching
                  Show only the part of the line that matched a pattern instead
-                 of  the  whole  line. In this mode, no context is shown. That
-                 is, the -A, -B, and -C options are ignored. If there is  more
-                 than  one  match in a line, each of them is shown separately,
-                 on a separate line of output. If -o is combined with -v  (in-
-                 vert  the  sense of the match to find non-matching lines), no
-                 output is generated, but the return  code  is  set  appropri-
-                 ately.  If  the matched portion of the line is empty, nothing
-                 is output unless the file  name  or  line  number  are  being
-                 printed,  in  which case they are shown on an otherwise empty
+                 of the whole line. In this mode, no context  is  shown.  That
+                 is,  the -A, -B, and -C options are ignored. If there is more
+                 than one match in a line, each of them is  shown  separately,
+                 on  a separate line of output. If -o is combined with -v (in-
+                 vert the sense of the match to find non-matching  lines),  no
+                 output  is  generated,  but  the return code is set appropri-
+                 ately. If the matched portion of the line is  empty,  nothing
+                 is  output  unless  the  file  name  or line number are being
+                 printed, in which case they are shown on an  otherwise  empty
                  line.  This  option  is  mutually  exclusive  with  --output,
                  --file-offsets and --line-offsets.
 
        -onumber, --only-matching=number
-                 Show  only  the  part  of the line that matched the capturing
+                 Show only the part of the line  that  matched  the  capturing
                  parentheses of the given number. Up to 50 capturing parenthe-
-                 ses  are  supported by default. This limit can be changed via
-                 the --om-capture option. A pattern may contain any number  of
-                 capturing  parentheses, but only those whose number is within
-                 the limit can be accessed by -o. An error occurs if the  num-
+                 ses are supported by default. This limit can be  changed  via
+                 the  --om-capture option. A pattern may contain any number of
+                 capturing parentheses, but only those whose number is  within
+                 the  limit can be accessed by -o. An error occurs if the num-
                  ber specified by -o is greater than the limit.
 
                  -o0 is the same as -o without a number. Because these options
-                 can be given without an argument (see above), if an  argument
-                 is  present, it must be given in the same shell item, for ex-
-                 ample, -o3 or --only-matching=2. The comments given  for  the
-                 non-argument  case  above  also  apply to this option. If the
-                 specified capturing parentheses do not exist in the  pattern,
-                 or  were  not  set in the match, nothing is output unless the
+                 can  be given without an argument (see above), if an argument
+                 is present, it must be given in the same shell item, for  ex-
+                 ample,  -o3  or --only-matching=2. The comments given for the
+                 non-argument case above also apply to  this  option.  If  the
+                 specified  capturing parentheses do not exist in the pattern,
+                 or were not set in the match, nothing is  output  unless  the
                  file name or line number are being output.
 
-                 If this option is given multiple times,  multiple  substrings
-                 are  output  for  each  match,  in  the order the options are
-                 given, and all on one line. For example, -o3 -o1  -o3  causes
-                 the  substrings  matched by capturing parentheses 3 and 1 and
-                 then 3 again to be output. By default, there is no  separator
+                 If  this  option is given multiple times, multiple substrings
+                 are output for each match,  in  the  order  the  options  are
+                 given,  and  all on one line. For example, -o3 -o1 -o3 causes
+                 the substrings matched by capturing parentheses 3 and  1  and
+                 then  3 again to be output. By default, there is no separator
                  (but see the next but one option).
 
        --om-capture=number
-                 Set  the number of capturing parentheses that can be accessed
+                 Set the number of capturing parentheses that can be  accessed
                  by -o. The default is 50.
 
        --om-separator=text
-                 Specify a separating string for multiple occurrences  of  -o.
-                 The  default is an empty string. Separating strings are never
+                 Specify  a  separating string for multiple occurrences of -o.
+                 The default is an empty string. Separating strings are  never
                  coloured.
 
        -q, --quiet
                  Work quietly, that is, display nothing except error messages.
-                 The  exit  status  indicates  whether or not any matches were
+                 The exit status indicates whether or  not  any  matches  were
                  found.
 
        -r, --recursive
-                 If any given path is a directory, recursively scan the  files
-                 it  contains, taking note of any --include and --exclude set-
-                 tings. By default, a directory is read as a normal  file;  in
-                 some  operating  systems this gives an immediate end-of-file.
-                 This option is a shorthand for setting the -d option to  "re-
+                 If  any given path is a directory, recursively scan the files
+                 it contains, taking note of any --include and --exclude  set-
+                 tings.  By  default, a directory is read as a normal file; in
+                 some operating systems this gives an  immediate  end-of-file.
+                 This  option is a shorthand for setting the -d option to "re-
                  curse".
 
        --recursion-limit=number
-                 This  is  an obsolete synonym for --depth-limit. See --match-
+                 This is an obsolete synonym for --depth-limit.  See  --match-
                  limit above for details.
 
        -s, --no-messages
-                 Suppress error  messages  about  non-existent  or  unreadable
-                 files.  Such  files  are quietly skipped. However, the return
+                 Suppress  error  messages  about  non-existent  or unreadable
+                 files. Such files are quietly skipped.  However,  the  return
                  code is still 2, even if matches were found in other files.
 
        -t, --total-count
-                 This option is useful when scanning more than  one  file.  If
-                 used  on its own, -t suppresses all output except for a grand
-                 total number of matching lines (or non-matching lines  if  -v
+                 This  option  is  useful when scanning more than one file. If
+                 used on its own, -t suppresses all output except for a  grand
+                 total  number  of matching lines (or non-matching lines if -v
                  is used) in all the files. If -t is used with -c, a grand to-
-                 tal is output except when the previous  output  is  just  one
-                 line.  In  other words, it is not output when just one file's
-                 count is listed. If file names are being  output,  the  grand
-                 total  is preceded by "TOTAL:". Otherwise, it appears as just
-                 another number. The -t option is ignored when  used  with  -L
-                 (list  files  without matches), because the grand total would
+                 tal  is  output  except  when the previous output is just one
+                 line. In other words, it is not output when just  one  file's
+                 count  is  listed.  If file names are being output, the grand
+                 total is preceded by "TOTAL:". Otherwise, it appears as  just
+                 another  number.  The  -t option is ignored when used with -L
+                 (list files without matches), because the grand  total  would
                  always be zero.
 
        -u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
                  has been compiled with UTF-8 support. All patterns (including
-                 those for any --exclude and --include options) and all  lines
-                 that  are  scanned must be valid strings of UTF-8 characters.
+                 those  for any --exclude and --include options) and all lines
+                 that are scanned must be valid strings of  UTF-8  characters.
                  If an invalid UTF-8 string is encountered, an error occurs.
 
        -U, --utf-allow-invalid
-                 As --utf, but in addition subject lines may  contain  invalid
-                 UTF-8  code  unit sequences. These can never form part of any
-                 pattern match. Patterns themselves, however,  must  still  be
+                 As  --utf,  but in addition subject lines may contain invalid
+                 UTF-8 code unit sequences. These can never form part  of  any
+                 pattern  match.  Patterns  themselves, however, must still be
                  valid UTF-8 strings. This facility allows valid UTF-8 strings
                  to be sought within arbitrary byte sequences in executable or
-                 other  binary  files. For more details about matching in non-
+                 other binary files. For more details about matching  in  non-
                  valid UTF-8 strings, see the pcre2unicode(3) documentation.
 
        -V, --version
-                 Write the version numbers of pcre2grep and the PCRE2  library
-                 to  the  standard  output and then exit. Anything else on the
+                 Write  the version numbers of pcre2grep and the PCRE2 library
+                 to the standard output and then exit. Anything  else  on  the
                  command line is ignored.
 
        -v, --invert-match
-                 Invert the sense of the match, so that  lines  which  do  not
-                 match  any  of the patterns are the ones that are found. When
-                 this option is  set,  options  such  as  --only-matching  and
-                 --output,  which specify parts of a match that are to be out-
+                 Invert  the  sense  of  the match, so that lines which do not
+                 match any of the patterns are the ones that are  found.  When
+                 this  option  is  set,  options  such  as --only-matching and
+                 --output, which specify parts of a match that are to be  out-
                  put, are ignored.
 
        -w, --word-regex, --word-regexp
                  Force the patterns only to match "words". That is, there must
-                 be  a  word  boundary  at  the  start and end of each matched
-                 string. This is equivalent to having "\b(?:" at the start  of
-                 each  pattern, and ")\b" at the end. This option applies only
-                 to the patterns that are  matched  against  the  contents  of
-                 files;  it does not apply to patterns specified by any of the
+                 be a word boundary at the  start  and  end  of  each  matched
+                 string.  This is equivalent to having "\b(?:" at the start of
+                 each pattern, and ")\b" at the end. This option applies  only
+                 to  the  patterns  that  are  matched against the contents of
+                 files; it does not apply to patterns specified by any of  the
                  --include or --exclude options.
 
        -x, --line-regex, --line-regexp
-                 Force the patterns to start matching only at  the  beginnings
-                 of  lines,  and  in  addition,  require  them to match entire
+                 Force  the  patterns to start matching only at the beginnings
+                 of lines, and in  addition,  require  them  to  match  entire
                  lines. In multiline mode the match may be more than one line.
                  This is equivalent to having "^(?:" at the start of each pat-
-                 tern and ")$" at the end. This option  applies  only  to  the
-                 patterns  that  are matched against the contents of files; it
-                 does not apply to patterns specified by any of the  --include
+                 tern  and  ")$"  at  the end. This option applies only to the
+                 patterns that are matched against the contents of  files;  it
+                 does  not apply to patterns specified by any of the --include
                  or --exclude options.
 
+       -Z, --null
+                 Terminate files names in the regular output with a zero  byte
+                 (the  NUL  character)  instead of what would normally appear.
+                 This is useful when file  names  contain  unusual  characters
+                 such  as  colons,  hyphens, or even newlines. The option does
+                 not apply to file names in error messages.
+
 
 ENVIRONMENT VARIABLES
 
@@ -823,137 +835,137 @@
 
 NEWLINES
 
-       The -N (--newline) option allows pcre2grep to scan files  with  newline
-       conventions  that differ from the default. This option affects only the
-       way scanned files are processed. It does not affect the  interpretation
-       of  files  specified  by  the -f, --file-list, --exclude-from, or --in-
+       The  -N  (--newline) option allows pcre2grep to scan files with newline
+       conventions that differ from the default. This option affects only  the
+       way  scanned files are processed. It does not affect the interpretation
+       of files specified by the -f,  --file-list,  --exclude-from,  or  --in-
        clude-from options.
 
-       Any parts of the scanned input files that are written to  the  standard
-       output  are copied with whatever newline sequences they have in the in-
-       put. However, if the final line of a file is output, and  it  does  not
-       end  with  a newline sequence, a newline sequence is added. If the new-
-       line setting is CR, LF, CRLF or NUL, that line ending  is  output;  for
+       Any  parts  of the scanned input files that are written to the standard
+       output are copied with whatever newline sequences they have in the  in-
+       put.  However,  if  the final line of a file is output, and it does not
+       end with a newline sequence, a newline sequence is added. If  the  new-
+       line  setting  is  CR, LF, CRLF or NUL, that line ending is output; for
        the other settings (ANYCRLF or ANY) a single NL is used.
 
-       The  newline  setting does not affect the way in which pcre2grep writes
-       newlines in informational messages to the  standard  output  and  error
-       streams.   Under  Windows,  the standard output is set to be binary, so
-       that "\r\n" at the ends of output lines that are copied from the  input
-       is  not converted to "\r\r\n" by the C I/O library. This means that any
-       messages written to the standard output must end with "\r\n".  For  all
-       other  operating  systems,  and  for all messages to the standard error
+       The newline setting does not affect the way in which  pcre2grep  writes
+       newlines  in  informational  messages  to the standard output and error
+       streams.  Under Windows, the standard output is set to  be  binary,  so
+       that  "\r\n" at the ends of output lines that are copied from the input
+       is not converted to "\r\r\n" by the C I/O library. This means that  any
+       messages  written  to the standard output must end with "\r\n". For all
+       other operating systems, and for all messages  to  the  standard  error
        stream, "\n" is used.
 
 
 OPTIONS COMPATIBILITY
 
        Many of the short and long forms of pcre2grep's options are the same as
-       in  the GNU grep program. Any long option of the form --xxx-regexp (GNU
+       in the GNU grep program. Any long option of the form --xxx-regexp  (GNU
        terminology) is also available as --xxx-regex (PCRE2 terminology). How-
-       ever,  the  --depth-limit,  --file-list,  --file-offsets, --heap-limit,
-       --include-dir, --line-offsets, --locale,  --match-limit,  -M,  --multi-
-       line,  -N,  --newline,  --om-separator,  --output,  -u,  --utf, -U, and
+       ever, the  --depth-limit,  --file-list,  --file-offsets,  --heap-limit,
+       --include-dir,  --line-offsets,  --locale,  --match-limit, -M, --multi-
+       line, -N, --newline,  --om-separator,  --output,  -u,  --utf,  -U,  and
        --utf-allow-invalid options are specific to pcre2grep, as is the use of
        the --only-matching option with a capturing parentheses number.
 
-       Although  most  of the common options work the same way, a few are dif-
-       ferent in pcre2grep. For example, the --include option's argument is  a
-       glob  for GNU grep, but a regular expression for pcre2grep. If both the
-       -c and -l options are given, GNU grep lists only  file  names,  without
+       Although most of the common options work the same way, a few  are  dif-
+       ferent  in pcre2grep. For example, the --include option's argument is a
+       glob for GNU grep, but a regular expression for pcre2grep. If both  the
+       -c  and  -l  options are given, GNU grep lists only file names, without
        counts, but pcre2grep gives the counts as well.
 
 
 OPTIONS WITH DATA
 
        There are four different ways in which an option with data can be spec-
-       ified.  If a short form option is used, the  data  may  follow  immedi-
+       ified.   If  a  short  form option is used, the data may follow immedi-
        ately, or (with one exception) in the next command line item. For exam-
        ple:
 
          -f/some/file
          -f /some/file
 
-       The exception is the -o option, which may appear with or without  data.
-       Because  of this, if data is present, it must follow immediately in the
+       The  exception is the -o option, which may appear with or without data.
+       Because of this, if data is present, it must follow immediately in  the
        same item, for example -o3.
 
-       If a long form option is used, the data may appear in the same  command
-       line  item,  separated by an equals character, or (with two exceptions)
+       If  a long form option is used, the data may appear in the same command
+       line item, separated by an equals character, or (with  two  exceptions)
        it may appear in the next command line item. For example:
 
          --file=/some/file
          --file /some/file
 
-       Note, however, that if you want to supply a file name beginning with  ~
-       as  data  in a shell command, and have the shell expand ~ to a home di-
-       rectory, you must separate the file name from the option,  because  the
+       Note,  however, that if you want to supply a file name beginning with ~
+       as data in a shell command, and have the shell expand ~ to a  home  di-
+       rectory,  you  must separate the file name from the option, because the
        shell does not treat ~ specially unless it is at the start of an item.
 
-       The  exceptions  to the above are the --colour (or --color) and --only-
-       matching options, for which the data is optional. If one of  these  op-
-       tions  does  have  data,  it  must be given in the first form, using an
+       The exceptions to the above are the --colour (or --color)  and  --only-
+       matching  options,  for which the data is optional. If one of these op-
+       tions does have data, it must be given in  the  first  form,  using  an
        equals character. Otherwise pcre2grep will assume that it has no data.
 
 
 USING PCRE2'S CALLOUT FACILITY
 
-       pcre2grep has, by default, support for  calling  external  programs  or
-       scripts  or  echoing  specific strings during matching by making use of
-       PCRE2's callout facility. However, this support can  be  completely  or
-       partially  disabled  when  pcre2grep is built. You can find out whether
-       your binary has support for callouts by running it with the --help  op-
-       tion.  If  callout support is completely disabled, all callouts in pat-
+       pcre2grep  has,  by  default,  support for calling external programs or
+       scripts or echoing specific strings during matching by  making  use  of
+       PCRE2's  callout  facility.  However, this support can be completely or
+       partially disabled when pcre2grep is built. You can  find  out  whether
+       your  binary has support for callouts by running it with the --help op-
+       tion. If callout support is completely disabled, all callouts  in  pat-
        terns are ignored by pcre2grep.  If the facility is partially disabled,
-       calling  external  programs is not supported, and callouts that request
+       calling external programs is not supported, and callouts  that  request
        it are ignored.
 
-       A callout in a PCRE2 pattern is of the form (?C<arg>) where  the  argu-
-       ment  is either a number or a quoted string (see the pcre2callout docu-
-       mentation for details). Numbered callouts  are  ignored  by  pcre2grep;
+       A  callout  in a PCRE2 pattern is of the form (?C<arg>) where the argu-
+       ment is either a number or a quoted string (see the pcre2callout  docu-
+       mentation  for  details).  Numbered  callouts are ignored by pcre2grep;
        only callouts with string arguments are useful.
 
    Echoing a specific string
 
-       Starting  the  callout  string with a pipe character invokes an echoing
+       Starting the callout string with a pipe character  invokes  an  echoing
        facility that avoids calling an external program or script. This facil-
-       ity  is  always  available,  provided that callouts were not completely
-       disabled when pcre2grep was built. The rest of the  callout  string  is
-       processed  as  a zero-terminated string, which means it should not con-
-       tain any internal binary zeros. It is written  to  the  output,  having
-       first  been  passed through the same escape processing as text from the
-       --output (-O) option (see above). However, $0 cannot be used to  insert
-       a  matched  substring  because the match is still in progress. Instead,
-       the single character '0' is inserted. Any syntax errors in  the  string
-       (for  example,  a  dollar not followed by another character) causes the
-       callout to be ignored. No terminator is added to the output string,  so
-       if  you want a newline, you must include it explicitly using the escape
+       ity is always available, provided that  callouts  were  not  completely
+       disabled  when  pcre2grep  was built. The rest of the callout string is
+       processed as a zero-terminated string, which means it should  not  con-
+       tain  any  internal  binary  zeros. It is written to the output, having
+       first been passed through the same escape processing as text  from  the
+       --output  (-O) option (see above). However, $0 cannot be used to insert
+       a matched substring because the match is still  in  progress.  Instead,
+       the  single  character '0' is inserted. Any syntax errors in the string
+       (for example, a dollar not followed by another  character)  causes  the
+       callout  to be ignored. No terminator is added to the output string, so
+       if you want a newline, you must include it explicitly using the  escape
        $n. For example:
 
          pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
 
-       Matching continues normally after the string is output. If you want  to
-       see  only  the  callout output but not any output from an actual match,
+       Matching  continues normally after the string is output. If you want to
+       see only the callout output but not any output from  an  actual  match,
        you should end the pattern with (*FAIL).
 
    Calling external programs or scripts
 
        This facility can be independently disabled when pcre2grep is built. It
-       is  supported for Windows, where a call to _spawnvp() is used, for VMS,
-       where lib$spawn() is used, and  for  any  Unix-like  environment  where
+       is supported for Windows, where a call to _spawnvp() is used, for  VMS,
+       where  lib$spawn()  is  used,  and  for any Unix-like environment where
        fork() and execv() are available.
 
        If the callout string does not start with a pipe (vertical bar) charac-
-       ter, it is parsed into a list of substrings separated by  pipe  charac-
-       ters.  The first substring must be an executable name, with the follow-
+       ter,  it  is parsed into a list of substrings separated by pipe charac-
+       ters. The first substring must be an executable name, with the  follow-
        ing substrings specifying arguments:
 
          executable_name|arg1|arg2|...
 
-       Any substring (including the executable name) may  contain  escape  se-
-       quences  started  by  a dollar character. These are the same as for the
+       Any  substring  (including  the executable name) may contain escape se-
+       quences started by a dollar character. These are the same  as  for  the
        --output (-O) option documented above, except that $0 cannot insert the
-       matched  string  because  the  match is still in progress. Instead, the
+       matched string because the match is still  in  progress.  Instead,  the
        character '0' is inserted. If you need a literal dollar or pipe charac-
        ter in any substring, use $$ or $| respectively. Here is an example:
 
@@ -968,43 +980,43 @@
            Arg1: [1] [234] [4] Arg2: |1| ()
            12345
 
-       The  parameters  for the system call that is used to run the program or
+       The parameters for the system call that is used to run the  program  or
        script are zero-terminated strings. This means that binary zero charac-
-       ters  in the callout argument will cause premature termination of their
-       substrings, and therefore should not be present. Any syntax  errors  in
-       the  string  (for  example, a dollar not followed by another character)
+       ters in the callout argument will cause premature termination of  their
+       substrings,  and  therefore should not be present. Any syntax errors in
+       the string (for example, a dollar not followed  by  another  character)
        causes the callout to be ignored.  If running the program fails for any
-       reason  (including the non-existence of the executable), a local match-
+       reason (including the non-existence of the executable), a local  match-
        ing failure occurs and the matcher backtracks in the normal way.
 
 
 MATCHING ERRORS
 
-       It is possible to supply a regular expression that takes  a  very  long
-       time  to  fail  to  match certain lines. Such patterns normally involve
-       nested indefinite repeats, for example: (a+)*\d when matched against  a
-       line  of a's with no final digit. The PCRE2 matching function has a re-
-       source limit that causes it to abort in these  circumstances.  If  this
-       happens,  pcre2grep  outputs  an error message and the line that caused
-       the problem to the standard error stream. If there  are  more  than  20
+       It  is  possible  to supply a regular expression that takes a very long
+       time to fail to match certain lines.  Such  patterns  normally  involve
+       nested  indefinite repeats, for example: (a+)*\d when matched against a
+       line of a's with no final digit. The PCRE2 matching function has a  re-
+       source  limit  that  causes it to abort in these circumstances. If this
+       happens, pcre2grep outputs an error message and the  line  that  caused
+       the  problem  to  the  standard error stream. If there are more than 20
        such errors, pcre2grep gives up.
 
-       The  --match-limit  option  of pcre2grep can be used to set the overall
-       resource limit. There are also other limits that affect the  amount  of
-       memory  used  during  matching;  see the discussion of --heap-limit and
+       The --match-limit option of pcre2grep can be used to  set  the  overall
+       resource  limit.  There are also other limits that affect the amount of
+       memory used during matching; see the  discussion  of  --heap-limit  and
        --depth-limit above.
 
 
 DIAGNOSTICS
 
        Exit status is 0 if any matches were found, 1 if no matches were found,
-       and  2  for syntax errors, overlong lines, non-existent or inaccessible
-       files (even if matches were found in other files) or too many  matching
+       and 2 for syntax errors, overlong lines, non-existent  or  inaccessible
+       files  (even if matches were found in other files) or too many matching
        errors. Using the -s option to suppress error messages about inaccessi-
        ble files does not affect the return code.
 
-       When  run  under  VMS,  the  return  code  is  placed  in  the   symbol
-       PCRE2GREP_RC  because  VMS  does  not  distinguish  between exit(0) and
+       When   run  under  VMS,  the  return  code  is  placed  in  the  symbol
+       PCRE2GREP_RC because VMS  does  not  distinguish  between  exit(0)  and
        exit(1).
 
 
@@ -1022,5 +1034,5 @@
 
 REVISION
 
-       Last updated: 31 August 2021
-       Copyright (c) 1997-2021 University of Cambridge.
+       Last updated: 21 November 2022
+       Copyright (c) 1997-2022 University of Cambridge.
diff --git a/doc/pcre2limits.3 b/doc/pcre2limits.3
index 9bf3626..9281aa6 100644
--- a/doc/pcre2limits.3
+++ b/doc/pcre2limits.3
@@ -1,4 +1,4 @@
-.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "SIZE AND OTHER LIMITATIONS"
@@ -51,6 +51,10 @@
 .P
 The maximum length of a string argument to a callout is the largest number a
 32-bit unsigned integer can hold.
+.P
+The maximum amount of heap memory used for matching is controlled by the heap
+limit, which can be set in a pattern or in a match context. The default is a
+very large number, effectively unlimited.
 .
 .
 .SH AUTHOR
@@ -58,7 +62,7 @@
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@@ -67,6 +71,6 @@
 .rs
 .sp
 .nf
-Last updated: 02 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 26 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
diff --git a/doc/pcre2perform.3 b/doc/pcre2perform.3
index 040369a..72aa67a 100644
--- a/doc/pcre2perform.3
+++ b/doc/pcre2perform.3
@@ -1,4 +1,4 @@
-.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
+.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 PERFORMANCE"
@@ -69,12 +69,28 @@
 uses very little system stack at run time. In earlier releases recursive
 function calls could use a great deal of stack, and this could cause problems,
 but this usage has been eliminated. Backtracking positions are now explicitly
-remembered in memory frames controlled by the code. An initial 20KiB vector of
-frames is allocated on the system stack (enough for about 100 frames for small
-patterns), but if this is insufficient, heap memory is used. The amount of heap
-memory can be limited; if the limit is set to zero, only the initial stack
-vector is used. Rewriting patterns to be time-efficient, as described below,
-may also reduce the memory requirements.
+remembered in memory frames controlled by the code.
+.P
+The size of each frame depends on the size of pointer variables and the number
+of capturing parenthesized groups in the pattern being matched. On a 64-bit
+system the frame size for a pattern with no captures is 128 bytes. For each
+capturing group the size increases by 16 bytes.
+.P
+Until release 10.41, an initial 20KiB frames vector was allocated on the system
+stack, but this still caused some issues for multi-thread applications where
+each thread has a very small stack. From release 10.41 backtracking memory
+frames are always held in heap memory. An initial heap allocation is obtained
+the first time any match data block is passed to \fBpcre2_match()\fP. This is
+remembered with the match data block and re-used if that block is used for
+another match. It is freed when the match data block itself is freed.
+.P
+The size of the initial block is the larger of 20KiB or ten times the pattern's
+frame size, unless the heap limit is less than this, in which case the heap
+limit is used. If the initial block proves to be too small during matching, it
+is replaced by a larger block, subject to the heap limit. The heap limit is
+checked only when a new block is to be allocated. Reducing the heap limit
+between calls to \fBpcre2_match()\fP with the same match data block does not
+affect the saved block.
 .P
 In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
 function calls, but only for processing atomic groups, lookaround assertions,
@@ -230,7 +246,7 @@
 .sp
 .nf
 Philip Hazel
-University Computing Service
+Retired from University Computing Service
 Cambridge, England.
 .fi
 .
@@ -239,6 +255,6 @@
 .rs
 .sp
 .nf
-Last updated: 03 February 2019
-Copyright (c) 1997-2019 University of Cambridge.
+Last updated: 27 July 2022
+Copyright (c) 1997-2022 University of Cambridge.
 .fi
diff --git a/doc/pcre2serialize.3 b/doc/pcre2serialize.3
index 987bc3a..a94f13b 100644
--- a/doc/pcre2serialize.3
+++ b/doc/pcre2serialize.3
@@ -81,7 +81,7 @@
 .sp
   PCRE2_ERROR_BADDATA      the number of patterns is zero or less
   PCRE2_ERROR_BADMAGIC     mismatch of id bytes in one of the patterns
-  PCRE2_ERROR_MEMORY       memory allocation failed
+  PCRE2_ERROR_NOMEMORY     memory allocation failed
   PCRE2_ERROR_MIXEDTABLES  the patterns do not all use the same tables
   PCRE2_ERROR_NULL         the 1st, 3rd, or 4th argument is NULL
 .sp
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
index d374f3e..3ac42e0 100644
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "12 January 2022" "PCRE 10.40"
+.TH PCRE2TEST 1 "27 July 2022" "PCRE 10.41"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -219,7 +219,7 @@
 -C and any -Lx options are present, whichever is first is recognized.
 .TP 10
 \fB-LS\fP
-List scripts: write a list of recogized Unicode script names to the standard
+List scripts: write a list of recognized Unicode script names to the standard
 output, then exit with zero exit code. All other options are ignored. If both
 -C and any -Lx options are present, whichever is first is recognized.
 .TP 10
@@ -1206,7 +1206,8 @@
       copy=<number or name>      copy captured substring
       depth_limit=<n>            set a depth limit
       dfa                        use \fBpcre2_dfa_match()\fP
-      find_limits                find match and depth limits
+      find_limits                find heap, match and depth limits
+      find_limits_noheap         find match and depth limits
       get=<number or name>       extract captured substring
       getall                     extract all captured substrings
   /g  global                     global matching
@@ -1333,7 +1334,7 @@
 .\" </a>
 below.
 .\"
-Testing callouts from \fBpcre2_substitute()\fP is decribed separately in
+Testing callouts from \fBpcre2_substitute()\fP is described separately in
 "Testing the substitution function"
 .\" HTML <a href="#substitution">
 .\" </a>
@@ -1528,7 +1529,7 @@
 .sp
 The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
 the appropriate limits in the match context. These values are ignored when the
-\fBfind_limits\fP modifier is specified.
+\fBfind_limits\fP or \fBfind_limits_noheap\fP modifier is specified.
 .
 .
 .SS "Finding minimum limits"
@@ -1538,8 +1539,12 @@
 calls the relevant matching function several times, setting different values in
 the match context via \fBpcre2_set_heap_limit()\fP,
 \fBpcre2_set_match_limit()\fP, or \fBpcre2_set_depth_limit()\fP until it finds
-the minimum values for each parameter that allows the match to complete without
-error. If JIT is being used, only the match limit is relevant.
+the smallest value for each parameter that allows the match to complete without
+a "limit exceeded" error. The match itself may succeed or fail. An alternative
+modifier, \fBfind_limits_noheap\fP, omits the heap limit. This is used in the
+standard tests, because the minimum heap limit varies between systems. If JIT
+is being used, only the match limit is relevant, and the other two are
+automatically omitted.
 .P
 When using this modifier, the pattern should not contain any limit settings
 such as (*LIMIT_MATCH=...) within it. If such a setting is present and is
@@ -1563,9 +1568,7 @@
 overall amount of computing resource that is used.
 .P
 For both kinds of matching, the \fIheap_limit\fP number, which is in kibibytes
-(units of 1024 bytes), limits the amount of heap memory used for matching. A
-value of zero disables the use of any heap memory; many simple pattern matches
-can be done without using the heap, so zero is not an unreasonable setting.
+(units of 1024 bytes), limits the amount of heap memory used for matching.
 .
 .
 .SS "Showing MARK names"
@@ -1584,12 +1587,10 @@
 .sp
 The \fBmemory\fP modifier causes \fBpcre2test\fP to log the sizes of all heap
 memory allocation and freeing calls that occur during a call to
-\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. These occur only when a match
-requires a bigger vector than the default for remembering backtracking points
-(\fBpcre2_match()\fP) or for internal workspace (\fBpcre2_dfa_match()\fP). In
-many cases there will be no heap memory used and therefore no additional
-output. No heap memory is allocated during matching with JIT, so in that case
-the \fBmemory\fP modifier never has any effect. For this modifier to work, the
+\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. In the latter case, heap memory
+is used only when a match requires more internal workspace that the default
+allocation on the stack, so in many cases there will be no output. No heap
+memory is allocated during matching with JIT. For this modifier to work, the
 \fBnull_context\fP modifier must not be set on both the pattern and the
 subject, though it can be set on one or the other.
 .
@@ -1649,7 +1650,8 @@
 If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
 testing that the matching and substitution functions behave correctly in this
 case (they use default values). This modifier cannot be used with the
-\fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
+\fBfind_limits\fP, \fBfind_limits_noheap\fP, or \fBsubstitute_callout\fP
+modifiers.
 .P
 Similarly, for testing purposes, if the \fBnull_subject\fP or
 \fBnull_replacement\fP modifier is set, the subject or replacement string
@@ -2091,7 +2093,7 @@
 If \fBjitverify\fP is used with #pop, it does not automatically imply
 \fBjit\fP, which is different behaviour from when it is used on a pattern.
 .P
-The #popcopy command is analagous to the \fBpushcopy\fP modifier in that it
+The #popcopy command is analogous to the \fBpushcopy\fP modifier in that it
 makes current a copy of the topmost stack pattern, leaving the original still
 on the stack.
 .
@@ -2119,6 +2121,6 @@
 .rs
 .sp
 .nf
-Last updated: 12 January 2022
+Last updated: 27 July 2022
 Copyright (c) 1997-2022 University of Cambridge.
 .fi
diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt
index ed7dd20..b03b11f 100644
--- a/doc/pcre2test.txt
+++ b/doc/pcre2test.txt
@@ -205,7 +205,7 @@
                  All other options are ignored. If both -C and any -Lx options
                  are present, whichever is first is recognized.
 
-       -LS       List  scripts: write a list of recogized Unicode script names
+       -LS       List scripts: write a list of recognized Unicode script names
                  to the standard output, then exit with zero  exit  code.  All
                  other options are ignored. If both -C and any -Lx options are
                  present, whichever is first is recognized.
@@ -1111,7 +1111,8 @@
              copy=<number or name>      copy captured substring
              depth_limit=<n>            set a depth limit
              dfa                        use pcre2_dfa_match()
-             find_limits                find match and depth limits
+             find_limits                find heap, match and depth limits
+             find_limits_noheap         find match and depth limits
              get=<number or name>       extract captured substring
              getall                     extract all captured substrings
          /g  global                     global matching
@@ -1230,7 +1231,7 @@
        ing functions, unless callout_none is specified. Its behaviour  can  be
        controlled  by  various  modifiers  listed above whose names begin with
        callout_. Details are given in the section entitled  "Callouts"  below.
-       Testing  callouts  from  pcre2_substitute()  is  decribed separately in
+       Testing  callouts  from  pcre2_substitute()  is described separately in
        "Testing the substitution function" below.
 
    Finding all matches in a string
@@ -1411,7 +1412,7 @@
 
        The heap_limit, match_limit, and depth_limit modifiers set  the  appro-
        priate  limits  in the match context. These values are ignored when the
-       find_limits modifier is specified.
+       find_limits or find_limits_noheap modifier is specified.
 
    Finding minimum limits
 
@@ -1419,8 +1420,12 @@
        calls  the  relevant matching function several times, setting different
        values   in   the    match    context    via    pcre2_set_heap_limit(),
        pcre2_set_match_limit(),  or pcre2_set_depth_limit() until it finds the
-       minimum values for each parameter that allows  the  match  to  complete
-       without error. If JIT is being used, only the match limit is relevant.
+       smallest value for each parameter that allows  the  match  to  complete
+       without a "limit exceeded" error. The match itself may succeed or fail.
+       An alternative modifier, find_limits_noheap, omits the heap limit. This
+       is  used  in  the standard tests, because the minimum heap limit varies
+       between systems. If JIT is being used, only the match  limit  is  rele-
+       vant, and the other two are automatically omitted.
 
        When using this modifier, the pattern should not contain any limit set-
        tings such as (*LIMIT_MATCH=...)  within  it.  If  such  a  setting  is
@@ -1446,9 +1451,7 @@
 
        For  both  kinds  of  matching,  the  heap_limit  number,  which  is in
        kibibytes (units of 1024 bytes), limits the amount of heap memory  used
-       for matching. A value of zero disables the use of any heap memory; many
-       simple pattern matches can be done without using the heap, so  zero  is
-       not an unreasonable setting.
+       for matching.
 
    Showing MARK names
 
@@ -1463,13 +1466,11 @@
 
        The  memory modifier causes pcre2test to log the sizes of all heap mem-
        ory  allocation  and  freeing  calls  that  occur  during  a  call   to
-       pcre2_match()  or  pcre2_dfa_match(). These occur only when a match re-
-       quires a bigger vector than the default  for  remembering  backtracking
-       points  (pcre2_match())  or for internal workspace (pcre2_dfa_match()).
-       In many cases there will be no heap memory used and therefore no  addi-
-       tional output. No heap memory is allocated during matching with JIT, so
-       in that case the memory modifier never has any effect. For  this  modi-
-       fier  to  work,  the  null_context modifier must not be set on both the
+       pcre2_match()  or pcre2_dfa_match(). In the latter case, heap memory is
+       used only when a match requires more internal workspace  that  the  de-
+       fault  allocation  on the stack, so in many cases there will be no out-
+       put. No heap memory is allocated during matching  with  JIT.  For  this
+       modifier to work, the null_context modifier must not be set on both the
        pattern and the subject, though it can be set on one or the other.
 
    Setting a starting offset
@@ -1518,45 +1519,46 @@
        null_context  modifier  is  set,  however,  NULL is passed. This is for
        testing that the matching and substitution functions  behave  correctly
        in  this  case  (they use default values). This modifier cannot be used
-       with the find_limits or substitute_callout modifiers.
+       with the find_limits, find_limits_noheap, or  substitute_callout  modi-
+       fiers.
 
-       Similarly, for testing purposes, if the null_subject  or  null_replace-
-       ment  modifier  is  set, the subject or replacement string pointers are
+       Similarly,  for  testing purposes, if the null_subject or null_replace-
+       ment modifier is set, the subject or replacement  string  pointers  are
        passed as NULL, respectively, to the relevant functions.
 
 
 THE ALTERNATIVE MATCHING FUNCTION
 
-       By default,  pcre2test  uses  the  standard  PCRE2  matching  function,
+       By  default,  pcre2test  uses  the  standard  PCRE2  matching function,
        pcre2_match() to match each subject line. PCRE2 also supports an alter-
-       native matching function, pcre2_dfa_match(), which operates in  a  dif-
-       ferent  way, and has some restrictions. The differences between the two
+       native  matching  function, pcre2_dfa_match(), which operates in a dif-
+       ferent way, and has some restrictions. The differences between the  two
        functions are described in the pcre2matching documentation.
 
-       If the dfa modifier is set, the alternative matching function is  used.
-       This  function  finds all possible matches at a given point in the sub-
-       ject. If, however, the dfa_shortest modifier is set,  processing  stops
-       after  the  first  match is found. This is always the shortest possible
+       If  the dfa modifier is set, the alternative matching function is used.
+       This function finds all possible matches at a given point in  the  sub-
+       ject.  If,  however, the dfa_shortest modifier is set, processing stops
+       after the first match is found. This is always  the  shortest  possible
        match.
 
 
 DEFAULT OUTPUT FROM pcre2test
 
-       This section describes the output when the  normal  matching  function,
+       This  section  describes  the output when the normal matching function,
        pcre2_match(), is being used.
 
-       When  a  match  succeeds,  pcre2test  outputs the list of captured sub-
-       strings, starting with number 0 for the string that matched  the  whole
+       When a match succeeds, pcre2test outputs  the  list  of  captured  sub-
+       strings,  starting  with number 0 for the string that matched the whole
        pattern.  Otherwise, it outputs "No match" when the return is PCRE2_ER-
-       ROR_NOMATCH, or "Partial match:" followed  by  the  partially  matching
-       substring  when  the  return is PCRE2_ERROR_PARTIAL. (Note that this is
-       the entire substring that was inspected during the  partial  match;  it
-       may  include  characters  before the actual match start if a lookbehind
+       ROR_NOMATCH,  or  "Partial  match:"  followed by the partially matching
+       substring when the return is PCRE2_ERROR_PARTIAL. (Note  that  this  is
+       the  entire  substring  that was inspected during the partial match; it
+       may include characters before the actual match start  if  a  lookbehind
        assertion, \K, \b, or \B was involved.)
 
        For any other return, pcre2test outputs the PCRE2 negative error number
-       and  a  short  descriptive  phrase. If the error is a failed UTF string
-       check, the code unit offset of the start of the  failing  character  is
+       and a short descriptive phrase. If the error is  a  failed  UTF  string
+       check,  the  code  unit offset of the start of the failing character is
        also output. Here is an example of an interactive pcre2test run.
 
          $ pcre2test
@@ -1572,8 +1574,8 @@
        Unset capturing substrings that are not followed by one that is set are
        not shown by pcre2test unless the allcaptures modifier is specified. In
        the following example, there are two capturing substrings, but when the
-       first data line is matched, the second, unset substring is  not  shown.
-       An  "internal" unset substring is shown as "<unset>", as for the second
+       first  data  line is matched, the second, unset substring is not shown.
+       An "internal" unset substring is shown as "<unset>", as for the  second
        data line.
 
            re> /(a)|(b)/
@@ -1585,11 +1587,11 @@
           1: <unset>
           2: b
 
-       If the strings contain any non-printing characters, they are output  as
-       \xhh  escapes  if  the  value is less than 256 and UTF mode is not set.
+       If  the strings contain any non-printing characters, they are output as
+       \xhh escapes if the value is less than 256 and UTF  mode  is  not  set.
        Otherwise they are output as \x{hh...} escapes. See below for the defi-
-       nition  of  non-printing  characters. If the aftertext modifier is set,
-       the output for substring 0 is followed by the the rest of  the  subject
+       nition of non-printing characters. If the aftertext  modifier  is  set,
+       the  output  for substring 0 is followed by the the rest of the subject
        string, identified by "0+" like this:
 
            re> /cat/aftertext
@@ -1609,8 +1611,8 @@
           0: ipp
           1: pp
 
-       "No match" is output only if the first match attempt fails. Here is  an
-       example  of  a  failure  message (the offset 4 that is specified by the
+       "No  match" is output only if the first match attempt fails. Here is an
+       example of a failure message (the offset 4 that  is  specified  by  the
        offset modifier is past the end of the subject string):
 
            re> /xyz/
@@ -1618,7 +1620,7 @@
          Error -24 (bad offset value)
 
        Note that whereas patterns can be continued over several lines (a plain
-       ">"  prompt  is used for continuations), subject lines may not. However
+       ">" prompt is used for continuations), subject lines may  not.  However
        newlines can be included in a subject by means of the \n escape (or \r,
        \r\n, etc., depending on the newline sequence setting).
 
@@ -1626,7 +1628,7 @@
 OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
 
        When the alternative matching function, pcre2_dfa_match(), is used, the
-       output consists of a list of all the matches that start  at  the  first
+       output  consists  of  a list of all the matches that start at the first
        point in the subject where there is at least one match. For example:
 
            re> /(tang|tangerine|tan)/
@@ -1635,11 +1637,11 @@
           1: tang
           2: tan
 
-       Using  the normal matching function on this data finds only "tang". The
-       longest matching string is always given first (and numbered zero).  Af-
-       ter  a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
+       Using the normal matching function on this data finds only "tang".  The
+       longest  matching string is always given first (and numbered zero). Af-
+       ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:",  fol-
        lowed by the partially matching substring. Note that this is the entire
-       substring  that  was inspected during the partial match; it may include
+       substring that was inspected during the partial match; it  may  include
        characters before the actual match start if a lookbehind assertion, \b,
        or \B was involved. (\K is not supported for DFA matching.)
 
@@ -1655,16 +1657,16 @@
           1: tan
           0: tan
 
-       The alternative matching function does not support  substring  capture,
-       so  the  modifiers  that are concerned with captured substrings are not
+       The  alternative  matching function does not support substring capture,
+       so the modifiers that are concerned with captured  substrings  are  not
        relevant.
 
 
 RESTARTING AFTER A PARTIAL MATCH
 
-       When the alternative matching function has given  the  PCRE2_ERROR_PAR-
+       When  the  alternative matching function has given the PCRE2_ERROR_PAR-
        TIAL return, indicating that the subject partially matched the pattern,
-       you can restart the match with additional subject data by means of  the
+       you  can restart the match with additional subject data by means of the
        dfa_restart modifier. For example:
 
            re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -1673,37 +1675,37 @@
          data> n05\=dfa,dfa_restart
           0: n05
 
-       For  further  information  about partial matching, see the pcre2partial
+       For further information about partial matching,  see  the  pcre2partial
        documentation.
 
 
 CALLOUTS
 
        If the pattern contains any callout requests, pcre2test's callout func-
-       tion  is  called during matching unless callout_none is specified. This
+       tion is called during matching unless callout_none is  specified.  This
        works with both matching functions, and with JIT, though there are some
-       differences  in behaviour. The output for callouts with numerical argu-
+       differences in behaviour. The output for callouts with numerical  argu-
        ments and those with string arguments is slightly different.
 
    Callouts with numerical arguments
 
        By default, the callout function displays the callout number, the start
-       and  current positions in the subject text at the callout time, and the
+       and current positions in the subject text at the callout time, and  the
        next pattern item to be tested. For example:
 
          --->pqrabcdef
            0    ^  ^     \d
 
-       This output indicates that callout number 0 occurred for  a  match  at-
-       tempt  starting at the fourth character of the subject string, when the
-       pointer was at the seventh character, and when the  next  pattern  item
-       was  \d.  Just  one circumflex is output if the start and current posi-
+       This  output  indicates  that callout number 0 occurred for a match at-
+       tempt starting at the fourth character of the subject string, when  the
+       pointer  was  at  the seventh character, and when the next pattern item
+       was \d. Just one circumflex is output if the start  and  current  posi-
        tions are the same, or if the current position precedes the start posi-
        tion, which can happen if the callout is in a lookbehind assertion.
 
        Callouts numbered 255 are assumed to be automatic callouts, inserted as
        a result of the auto_callout pattern modifier. In this case, instead of
-       showing  the  callout  number, the offset in the pattern, preceded by a
+       showing the callout number, the offset in the pattern,  preceded  by  a
        plus, is output. For example:
 
            re> /\d?[A-E]\*/auto_callout
@@ -1730,17 +1732,17 @@
          +12 ^  ^
           0: abc
 
-       The mark changes between matching "a" and "b", but stays the  same  for
-       the  rest  of  the match, so nothing more is output. If, as a result of
-       backtracking, the mark reverts to being unset, the  text  "<unset>"  is
+       The  mark  changes between matching "a" and "b", but stays the same for
+       the rest of the match, so nothing more is output. If, as  a  result  of
+       backtracking,  the  mark  reverts to being unset, the text "<unset>" is
        output.
 
    Callouts with string arguments
 
        The output for a callout with a string argument is similar, except that
-       instead of outputting a callout number before the position  indicators,
-       the  callout string and its offset in the pattern string are output be-
-       fore the reflection of the subject string, and the  subject  string  is
+       instead  of outputting a callout number before the position indicators,
+       the callout string and its offset in the pattern string are output  be-
+       fore  the  reflection  of the subject string, and the subject string is
        reflected for each callout. For example:
 
            re> /^ab(?C'first')cd(?C"second")ef/
@@ -1756,26 +1758,26 @@
 
    Callout modifiers
 
-       The  callout  function in pcre2test returns zero (carry on matching) by
-       default, but you can use a callout_fail modifier in a subject  line  to
+       The callout function in pcre2test returns zero (carry on  matching)  by
+       default,  but  you can use a callout_fail modifier in a subject line to
        change this and other parameters of the callout (see below).
 
        If the callout_capture modifier is set, the current captured groups are
        output when a callout occurs. This is useful only for non-DFA matching,
-       as  pcre2_dfa_match()  does  not  support capturing, so no captures are
+       as pcre2_dfa_match() does not support capturing,  so  no  captures  are
        ever shown.
 
        The normal callout output, showing the callout number or pattern offset
-       (as  described above) is suppressed if the callout_no_where modifier is
+       (as described above) is suppressed if the callout_no_where modifier  is
        set.
 
-       When using the interpretive  matching  function  pcre2_match()  without
-       JIT,  setting  the callout_extra modifier causes additional output from
-       pcre2test's callout function to be generated. For the first callout  in
-       a  match  attempt at a new starting position in the subject, "New match
-       attempt" is output. If there has been a backtrack since the last  call-
+       When  using  the  interpretive  matching function pcre2_match() without
+       JIT, setting the callout_extra modifier causes additional  output  from
+       pcre2test's  callout function to be generated. For the first callout in
+       a match attempt at a new starting position in the subject,  "New  match
+       attempt"  is output. If there has been a backtrack since the last call-
        out (or start of matching if this is the first callout), "Backtrack" is
-       output, followed by "No other matching paths" if  the  backtrack  ended
+       output,  followed  by  "No other matching paths" if the backtrack ended
        the previous match attempt. For example:
 
           re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
@@ -1812,86 +1814,86 @@
           +1    ^    a+
          No match
 
-       Notice  that  various  optimizations must be turned off if you want all
-       possible matching paths to be  scanned.  If  no_start_optimize  is  not
-       used,  there  is an immediate "no match", without any callouts, because
-       the starting optimization fails to find "b" in the  subject,  which  it
-       knows  must  be  present for any match. If no_auto_possess is not used,
-       the "a+" item is turned into "a++", which reduces the number  of  back-
+       Notice that various optimizations must be turned off if  you  want  all
+       possible  matching  paths  to  be  scanned. If no_start_optimize is not
+       used, there is an immediate "no match", without any  callouts,  because
+       the  starting  optimization  fails to find "b" in the subject, which it
+       knows must be present for any match. If no_auto_possess  is  not  used,
+       the  "a+"  item is turned into "a++", which reduces the number of back-
        tracks.
 
-       The  callout_extra modifier has no effect if used with the DFA matching
+       The callout_extra modifier has no effect if used with the DFA  matching
        function, or with JIT.
 
    Return values from callouts
 
-       The default return from the callout  function  is  zero,  which  allows
+       The  default  return  from  the  callout function is zero, which allows
        matching to continue. The callout_fail modifier can be given one or two
        numbers. If there is only one number, 1 is returned instead of 0 (caus-
        ing matching to backtrack) when a callout of that number is reached. If
-       two numbers (<n>:<m>) are given, 1 is  returned  when  callout  <n>  is
-       reached  and  there  have been at least <m> callouts. The callout_error
+       two  numbers  (<n>:<m>)  are  given,  1 is returned when callout <n> is
+       reached and there have been at least <m>  callouts.  The  callout_error
        modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
-       ing  the entire matching process to be aborted. If both these modifiers
-       are set for the same callout number,  callout_error  takes  precedence.
-       Note  that  callouts  with string arguments are always given the number
+       ing the entire matching process to be aborted. If both these  modifiers
+       are  set  for  the same callout number, callout_error takes precedence.
+       Note that callouts with string arguments are always  given  the  number
        zero.
 
-       The callout_data modifier can be given an unsigned or a  negative  num-
-       ber.   This  is  set  as the "user data" that is passed to the matching
-       function, and passed back when the callout  function  is  invoked.  Any
-       value  other  than  zero  is  used as a return from pcre2test's callout
+       The  callout_data  modifier can be given an unsigned or a negative num-
+       ber.  This is set as the "user data" that is  passed  to  the  matching
+       function,  and  passed  back  when the callout function is invoked. Any
+       value other than zero is used as  a  return  from  pcre2test's  callout
        function.
 
        Inserting callouts can be helpful when using pcre2test to check compli-
-       cated  regular expressions. For further information about callouts, see
+       cated regular expressions. For further information about callouts,  see
        the pcre2callout documentation.
 
 
 NON-PRINTING CHARACTERS
 
        When pcre2test is outputting text in the compiled version of a pattern,
-       bytes  other  than 32-126 are always treated as non-printing characters
+       bytes other than 32-126 are always treated as  non-printing  characters
        and are therefore shown as hex escapes.
 
-       When pcre2test is outputting text that is a matched part of  a  subject
-       string,  it behaves in the same way, unless a different locale has been
-       set for the pattern (using the locale modifier). In this case, the  is-
+       When  pcre2test  is outputting text that is a matched part of a subject
+       string, it behaves in the same way, unless a different locale has  been
+       set  for the pattern (using the locale modifier). In this case, the is-
        print() function is used to distinguish printing and non-printing char-
        acters.
 
 
 SAVING AND RESTORING COMPILED PATTERNS
 
-       It is possible to save compiled patterns  on  disc  or  elsewhere,  and
+       It  is  possible  to  save  compiled patterns on disc or elsewhere, and
        reload them later, subject to a number of restrictions. JIT data cannot
-       be saved. The host on which the patterns are reloaded must  be  running
+       be  saved.  The host on which the patterns are reloaded must be running
        the same version of PCRE2, with the same code unit width, and must also
-       have the same endianness, pointer width  and  PCRE2_SIZE  type.  Before
-       compiled  patterns  can be saved they must be serialized, that is, con-
-       verted to a stream of bytes. A single byte stream may contain any  num-
-       ber  of compiled patterns, but they must all use the same character ta-
-       bles. A single copy of the tables is included in the byte  stream  (its
+       have  the  same  endianness,  pointer width and PCRE2_SIZE type. Before
+       compiled patterns can be saved they must be serialized, that  is,  con-
+       verted  to a stream of bytes. A single byte stream may contain any num-
+       ber of compiled patterns, but they must all use the same character  ta-
+       bles.  A  single copy of the tables is included in the byte stream (its
        size is 1088 bytes).
 
-       The  functions whose names begin with pcre2_serialize_ are used for se-
-       rializing and de-serializing. They are described in the  pcre2serialize
-       documentation.  In  this  section we describe the features of pcre2test
+       The functions whose names begin with pcre2_serialize_ are used for  se-
+       rializing  and de-serializing. They are described in the pcre2serialize
+       documentation. In this section we describe the  features  of  pcre2test
        that can be used to test these functions.
 
-       Note that "serialization" in PCRE2 does not convert  compiled  patterns
-       to  an  abstract  format  like Java or .NET. It just makes a reloadable
+       Note  that  "serialization" in PCRE2 does not convert compiled patterns
+       to an abstract format like Java or .NET. It  just  makes  a  reloadable
        byte code stream.  Hence the restrictions on reloading mentioned above.
 
-       In pcre2test, when a pattern with push modifier  is  successfully  com-
-       piled,  it  is  pushed onto a stack of compiled patterns, and pcre2test
-       expects the next line to contain a new pattern (or command) instead  of
+       In  pcre2test,  when  a pattern with push modifier is successfully com-
+       piled, it is pushed onto a stack of compiled  patterns,  and  pcre2test
+       expects  the next line to contain a new pattern (or command) instead of
        a subject line. By contrast, the pushcopy modifier causes a copy of the
-       compiled pattern to be stacked, leaving the original available for  im-
-       mediate  matching.  By using push and/or pushcopy, a number of patterns
-       can be compiled and retained. These  modifiers  are  incompatible  with
+       compiled  pattern to be stacked, leaving the original available for im-
+       mediate matching. By using push and/or pushcopy, a number  of  patterns
+       can  be  compiled  and  retained. These modifiers are incompatible with
        posix, and control modifiers that act at match time are ignored (with a
-       message) for the stacked patterns. The jitverify modifier applies  only
+       message)  for the stacked patterns. The jitverify modifier applies only
        at compile time.
 
        The command
@@ -1899,21 +1901,21 @@
          #save <filename>
 
        causes all the stacked patterns to be serialized and the result written
-       to the named file. Afterwards, all the stacked patterns are freed.  The
+       to  the named file. Afterwards, all the stacked patterns are freed. The
        command
 
          #load <filename>
 
-       reads  the  data in the file, and then arranges for it to be de-serial-
-       ized, with the resulting compiled patterns added to the pattern  stack.
-       The  pattern  on the top of the stack can be retrieved by the #pop com-
-       mand, which must be followed by  lines  of  subjects  that  are  to  be
-       matched  with  the pattern, terminated as usual by an empty line or end
-       of file. This command may be followed by  a  modifier  list  containing
-       only  control  modifiers that act after a pattern has been compiled. In
-       particular, hex, posix, posix_nosub, push, and  pushcopy  are  not  al-
-       lowed,  nor  are  any option-setting modifiers.  The JIT modifiers are,
-       however permitted. Here is an example that saves and reloads  two  pat-
+       reads the data in the file, and then arranges for it to  be  de-serial-
+       ized,  with the resulting compiled patterns added to the pattern stack.
+       The pattern on the top of the stack can be retrieved by the  #pop  com-
+       mand,  which  must  be  followed  by  lines  of subjects that are to be
+       matched with the pattern, terminated as usual by an empty line  or  end
+       of  file.  This  command  may be followed by a modifier list containing
+       only control modifiers that act after a pattern has been  compiled.  In
+       particular,  hex,  posix,  posix_nosub,  push, and pushcopy are not al-
+       lowed, nor are any option-setting modifiers.  The  JIT  modifiers  are,
+       however  permitted.  Here is an example that saves and reloads two pat-
        terns.
 
          /abc/push
@@ -1926,10 +1928,10 @@
          #pop jit,bincode
          abc
 
-       If  jitverify  is  used with #pop, it does not automatically imply jit,
+       If jitverify is used with #pop, it does not  automatically  imply  jit,
        which is different behaviour from when it is used on a pattern.
 
-       The #popcopy command is analagous to the pushcopy modifier in  that  it
+       The  #popcopy  command is analogous to the pushcopy modifier in that it
        makes current a copy of the topmost stack pattern, leaving the original
        still on the stack.
 
@@ -1949,5 +1951,5 @@
 
 REVISION
 
-       Last updated: 12 January 2022
+       Last updated: 27 July 2022
        Copyright (c) 1997-2022 University of Cambridge.
diff --git a/index.md b/index.md
index 7295ded..d3fff17 100644
--- a/index.md
+++ b/index.md
@@ -14,14 +14,14 @@
 ## Download
 
 As well as downloading from the 
-[GitHub site](https://github.com/PhilipHazel/pcre2), you can download PCRE2 
+[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2 
 or the older, unmaintained PCRE1 library from an 
 [*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
 
 You can check out the PCRE2 source code via Git or Subversion:
 
-    git clone https://github.com/PhilipHazel/pcre2.git
-    svn co    https://github.com/PhilipHazel/pcre2.git
+    git clone https://github.com/PCRE2Project/pcre2.git
+    svn co    https://github.com/PCRE2Project/pcre2.git
 
 ## Contributed Ports
 
@@ -36,7 +36,7 @@
 ## Documentation
 
 You can read the PCRE2 documentation 
-[here](https://philiphazel.github.io/pcre2/doc/html/index.html).
+[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
 
 Comparisons to Perl's regular expression semantics can be found in the
 community authored Wikipedia entry for PCRE.
diff --git a/maint/README b/maint/README
index f21ff87..3d341b8 100644
--- a/maint/README
+++ b/maint/README
@@ -78,9 +78,9 @@
   A short, freestanding C program for converting a Unicode code point into a
   sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a
   hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes.
-  If its argument is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
-  treats them as a UTF-8 character and outputs the equivalent code point in
-  hex. See comments at its head for details.
+  If its argument is a sequence of concatenated UTF-8 bytes (e.g. 12e188b4) it
+  treats them as a UTF-8 string and outputs the equivalent code points in hex.
+  See comments at its head for details.
 
 
 Updating to a new Unicode release
@@ -94,8 +94,9 @@
 Note: Previously, it was necessary to update lists of scripts and their 
 abbreviations by hand before running the Python scripts. This is no longer
 necessary because the scripts have been upgraded to extract this information
-themselves. Also, there used to be explicit lists of script in two of the man
-pages. This is no longer the case.
+themselves. Also, there used to be explicit lists of scripts in two of the man
+pages. This is no longer the case; the pcre2test program can now output a list 
+of supported scripts.
 
 You can give an output file name as an argument to the following scripts, but
 by default:
@@ -129,8 +130,8 @@
 Preparing for a PCRE2 release
 =============================
 
-This section contains a checklist of things that I consult before building a
-distribution for a new release.
+This section contains a checklist of things that I do before building a new
+release.
 
 . Ensure that the version number and version date are correct in configure.ac.
 
@@ -139,17 +140,16 @@
 
 . If new build options or new source files have been added, ensure that they
   are added to the CMake files as well as to the autoconf files. The relevant
-  files are CMakeLists.txt and config-cmake.h.in. After making a release
-  tarball, test it out with CMake if there have been changes here.
+  files are CMakeLists.txt and config-cmake.h.in. After making a release, test
+  it out with CMake if there have been changes here.
 
 . Run ./autogen.sh to ensure everything is up-to-date.
 
 . Compile and test with many different config options, and combinations of
   options. Also, test with valgrind by running "RunTest valgrind" and
-  "RunGrepTest valgrind" (which takes quite a long time). The script
-  maint/ManyConfigTests now encapsulates this testing. It runs tests with
-  different configurations, and it also runs some of them with valgrind, all of
-  which can take quite some time.
+  "RunGrepTest valgrind". The script maint/ManyConfigTests now encapsulates
+  this testing. It runs tests with different configurations, and it also runs
+  some of them with valgrind, all of which can take quite some time.
 
 . Run tests in both 32-bit and 64-bit environments if possible. I can no longer
   run 32-bit tests.
@@ -164,7 +164,8 @@
   -fsanitize=signed-integer-overflow
 
 . Do a test build using CMake. Remove src/config.h first, lest it override the
-  version that CMake creates. Do NOT use parallel make.
+  version that CMake creates. Also do a CMake unity build to check that it 
+  still works: [c]cmake -DCMAKE_UNITY_BUILD=ON sets up a unity build.
 
 . Run perltest.sh on the test data for tests 1 and 4. The output should match
   the PCRE2 test output, apart from the version identification at the start of
@@ -183,11 +184,12 @@
   systems. For example, on Solaris it is helpful to test using Sun's cc
   compiler as a change from gcc. Adding -xarch=v9 to the cc options does a
   64-bit test, but it also needs -S 64 for pcre2test to increase the stack size
-  for test 2. Since I retired I can no longer do much of this, but instead I
-  rely on putting out release candidates for testing by the community.
+  for test 2. Since I retired I can no longer do much of this. There are 
+  automated tests under Ubuntu, Alpine, and Windows that are now set up as 
+  GitHub actions. Check that they are running clean.
 
 . The buildbots at http://buildfarm.opencsw.org/ do some automated testing
-  of PCRE2 and should be checked before putting out a release.
+  of PCRE2 and should also be checked before putting out a release.
 
 
 Updating version info for libtool
@@ -243,10 +245,11 @@
 sources and refreshes the HTML documentation. Update the GitHub repository with
 "git push".
 
-Once PrepareRelease has run clean, run "make distcheck" to create the tarball
+Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
 and the zipball. I then sign these files. Double-check with "git status" that
-the repository is fully up-to-date, then create a new tag on GitHub. Upload the
-tarball, zipball, and the signatures as "assets" of the GitHub release.
+the repository is fully up-to-date, then create a new tag and a release on
+GitHub. Upload the tarballs, zipball, and the signatures as "assets" of the
+GitHub release.
 
 When the new release is out, don't forget to tell webmaster@pcre.org and the
 mailing list.
@@ -365,8 +368,6 @@
 
   See Unicode TR 29. The last two are very much aimed at natural language.
 
-. (?[...]) extended classes: big project.
-
 . Allow a callout to specify a number of characters to skip. This can be done
   compatibly via an extra callout field.
 
@@ -436,13 +437,8 @@
   with lookarounds for \b and \B. Ideally the setting should last till the end
   of the group, which means remembering all previous settings; maybe a fixed
   amount of stack would do - how deep would anyone want to nest these things?
-  See GitHub issue #13 for a compendium of character class issues.
-
-. Recognize the short script names. They are already listed in maint/
-  Multistage2.py because they are needed for scanning the script extensions
-  file.
-
-. Use script extensions for \p?
+  See GitHub issue #13 for a compendium of character class issues, including
+  (?[...]) extended classes.
 
 . A user suggested something like --with-build-info to set a build information
   string that could be retrieved by pcre2_config(). However, there's no
@@ -461,4 +457,4 @@
 Philip Hazel
 Email local part: Philip.Hazel
 Email domain: gmail.com
-Last updated: 10 January 2022
+Last updated: 25 April 2022
diff --git a/maint/ucptest.c b/maint/ucptest.c
index 48475bb..6faead3 100644
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@@ -546,7 +546,6 @@
 int type = -1;
 int gbreak = -1;
 int bidiclass = -1;
-BOOL bidicontrol = FALSE;
 BOOL script_not = FALSE;
 BOOL type_not = FALSE;
 BOOL gbreak_not = FALSE;
@@ -559,12 +558,10 @@
   {
   unsigned int offset = 0;
   BOOL scriptx_not = FALSE;
-  char *value_start;
 
   for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
   *t = 0;
   while (isspace(*s)) s++;
-  value_start = s;
 
   for (t = value; *s != 0 && !isspace(*s); s++) 
     {
diff --git a/maint/ucptestdata/testoutput1 b/maint/ucptestdata/testoutput1
index 36421c2..469be7d 100644
--- a/maint/ucptestdata/testoutput1
+++ b/maint/ucptestdata/testoutput1
@@ -1,139 +1,139 @@
 findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 
-U+0000 BN  Control: Control, common, Control, [ascii]
-U+0001 BN  Control: Control, common, Control, [ascii]
-U+0002 BN  Control: Control, common, Control, [ascii]
-U+0003 BN  Control: Control, common, Control, [ascii]
-U+0004 BN  Control: Control, common, Control, [ascii]
-U+0005 BN  Control: Control, common, Control, [ascii]
-U+0006 BN  Control: Control, common, Control, [ascii]
-U+0007 BN  Control: Control, common, Control, [ascii]
-U+0008 BN  Control: Control, common, Control, [ascii]
-U+0009 S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
-U+000A B   Control: Control, common, LF, [ascii, patternwhitespace, whitespace]
-U+000B S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
-U+000C WS  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
-U+000D B   Control: Control, common, CR, [ascii, patternwhitespace, whitespace]
-U+000E BN  Control: Control, common, Control, [ascii]
-U+000F BN  Control: Control, common, Control, [ascii]
+U+0000 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0001 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0002 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0003 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0004 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0005 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0006 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0007 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0008 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0009 S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000A B   Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000B S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000C WS  Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000D B   Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000E BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+000F BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
 findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 
-U+0010 BN  Control: Control, common, Control, [ascii]
-U+0011 BN  Control: Control, common, Control, [ascii]
-U+0012 BN  Control: Control, common, Control, [ascii]
-U+0013 BN  Control: Control, common, Control, [ascii]
-U+0014 BN  Control: Control, common, Control, [ascii]
-U+0015 BN  Control: Control, common, Control, [ascii]
-U+0016 BN  Control: Control, common, Control, [ascii]
-U+0017 BN  Control: Control, common, Control, [ascii]
-U+0018 BN  Control: Control, common, Control, [ascii]
-U+0019 BN  Control: Control, common, Control, [ascii]
-U+001A BN  Control: Control, common, Control, [ascii]
-U+001B BN  Control: Control, common, Control, [ascii]
-U+001C B   Control: Control, common, Control, [ascii]
-U+001D B   Control: Control, common, Control, [ascii]
-U+001E B   Control: Control, common, Control, [ascii]
-U+001F S   Control: Control, common, Control, [ascii]
+U+0010 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0011 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0012 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0013 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0014 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0015 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0016 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0017 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0018 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0019 BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001A BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001B BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001C B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001D B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001E B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+001F S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
 findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 
-U+0020 WS  Separator: Space separator, common, Other, [ascii, graphemebase, patternwhitespace, whitespace]
-U+0021 ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
-U+0022 ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax, quotationmark]
-U+0023 ET  Punctuation: Other punctuation, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
-U+0024 ET  Symbol: Currency symbol, common, Other, [ascii, graphemebase, patternsyntax]
-U+0025 ET  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax]
-U+0026 ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax]
-U+0027 ON  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
-U+0028 ON  Punctuation: Open punctuation, common, Other, [ascii, bidimirrored, graphemebase, patternsyntax]
-U+0029 ON  Punctuation: Close punctuation, common, Other, [ascii, bidimirrored, graphemebase, patternsyntax]
-U+002A ON  Punctuation: Other punctuation, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
-U+002B ES  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, math, patternsyntax]
-U+002C CS  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax, terminalpunctuation]
-U+002D ES  Punctuation: Dash punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
-U+002E CS  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
-U+002F CS  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax]
+U+0020 WS  Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
+U+0021 ON  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
+U+0022 ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, math, patternsyntax]
+U+0023 ET  Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
+U+0024 ET  Symbol: Currency symbol, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0025 ET  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0026 ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0027 ON  Punctuation: Other punctuation, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
+U+0028 ON  Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+0029 ON  Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+002A ON  Punctuation: Other punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
+U+002B ES  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+002C CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+002D ES  Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+002E CS  Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
+U+002F CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
 findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 
-U+0030 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0031 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0032 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0033 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0034 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0035 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0036 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0037 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0038 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+0039 EN  Number: Decimal number, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
-U+003A CS  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, terminalpunctuation]
-U+003B ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax, terminalpunctuation]
-U+003C ON  Symbol: Mathematical symbol, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
-U+003D ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, math, patternsyntax]
-U+003E ON  Symbol: Mathematical symbol, common, Other, [ascii, bidimirrored, graphemebase, math, patternsyntax]
-U+003F ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+0030 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0031 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0032 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0033 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0034 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0035 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0036 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0037 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0038 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+0039 EN  Number: Decimal number, common, Other, [caseignorable, diacritic, graphemebase]
+U+003A CS  Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+003B ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+003C ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
+U+003D ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+003E ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
+U+003F ON  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, quotationmark]
 findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 
-U+0040 ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax]
-U+0041 L   Letter: Upper case letter, latin, Other, U+0061, [ascii, asciihexdigit, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, hexdigit, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0042 L   Letter: Upper case letter, latin, Other, U+0062, [ascii, asciihexdigit, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, hexdigit, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0043 L   Letter: Upper case letter, latin, Other, U+0063, [ascii, asciihexdigit, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, hexdigit, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0044 L   Letter: Upper case letter, latin, Other, U+0064, [ascii, asciihexdigit, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, hexdigit, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0045 L   Letter: Upper case letter, latin, Other, U+0065, [ascii, asciihexdigit, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, hexdigit, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0046 L   Letter: Upper case letter, latin, Other, U+0066, [ascii, asciihexdigit, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, hexdigit, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0047 L   Letter: Upper case letter, latin, Other, U+0067, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0048 L   Letter: Upper case letter, latin, Other, U+0068, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0049 L   Letter: Upper case letter, latin, Other, U+0069, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+004A L   Letter: Upper case letter, latin, Other, U+006A, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+004B L   Letter: Upper case letter, latin, Other, U+006B, U+212A, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+004C L   Letter: Upper case letter, latin, Other, U+006C, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+004D L   Letter: Upper case letter, latin, Other, U+006D, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+004E L   Letter: Upper case letter, latin, Other, U+006E, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+004F L   Letter: Upper case letter, latin, Other, U+006F, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+0040 ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+0041 L   Letter: Upper case letter, latin, Other, U+0061, [graphemebase]
+U+0042 L   Letter: Upper case letter, latin, Other, U+0062, [graphemebase]
+U+0043 L   Letter: Upper case letter, latin, Other, U+0063, [graphemebase]
+U+0044 L   Letter: Upper case letter, latin, Other, U+0064, [graphemebase]
+U+0045 L   Letter: Upper case letter, latin, Other, U+0065, [graphemebase]
+U+0046 L   Letter: Upper case letter, latin, Other, U+0066, [graphemebase]
+U+0047 L   Letter: Upper case letter, latin, Other, U+0067, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0048 L   Letter: Upper case letter, latin, Other, U+0068, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0049 L   Letter: Upper case letter, latin, Other, U+0069, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004A L   Letter: Upper case letter, latin, Other, U+006A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004B L   Letter: Upper case letter, latin, Other, U+006B, U+212A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004C L   Letter: Upper case letter, latin, Other, U+006C, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004D L   Letter: Upper case letter, latin, Other, U+006D, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004E L   Letter: Upper case letter, latin, Other, U+006E, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+004F L   Letter: Upper case letter, latin, Other, U+006F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
 findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 
-U+0050 L   Letter: Upper case letter, latin, Other, U+0070, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0051 L   Letter: Upper case letter, latin, Other, U+0071, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0052 L   Letter: Upper case letter, latin, Other, U+0072, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0053 L   Letter: Upper case letter, latin, Other, U+0073, U+017F, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0054 L   Letter: Upper case letter, latin, Other, U+0074, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0055 L   Letter: Upper case letter, latin, Other, U+0075, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0056 L   Letter: Upper case letter, latin, Other, U+0076, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0057 L   Letter: Upper case letter, latin, Other, U+0077, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0058 L   Letter: Upper case letter, latin, Other, U+0078, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0059 L   Letter: Upper case letter, latin, Other, U+0079, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+005A L   Letter: Upper case letter, latin, Other, U+007A, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+005B ON  Punctuation: Open punctuation, common, Other, [ascii, bidimirrored, graphemebase, patternsyntax]
-U+005C ON  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax]
-U+005D ON  Punctuation: Close punctuation, common, Other, [ascii, bidimirrored, graphemebase, patternsyntax]
-U+005E ON  Symbol: Modifier symbol, common, Other, [ascii, caseignorable, diacritic, graphemebase, math, patternsyntax]
-U+005F ON  Punctuation: Connector punctuation, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+0050 L   Letter: Upper case letter, latin, Other, U+0070, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0051 L   Letter: Upper case letter, latin, Other, U+0071, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0052 L   Letter: Upper case letter, latin, Other, U+0072, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0053 L   Letter: Upper case letter, latin, Other, U+0073, U+017F, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0054 L   Letter: Upper case letter, latin, Other, U+0074, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0055 L   Letter: Upper case letter, latin, Other, U+0075, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0056 L   Letter: Upper case letter, latin, Other, U+0076, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0057 L   Letter: Upper case letter, latin, Other, U+0077, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0058 L   Letter: Upper case letter, latin, Other, U+0078, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+0059 L   Letter: Upper case letter, latin, Other, U+0079, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+005A L   Letter: Upper case letter, latin, Other, U+007A, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
+U+005B ON  Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+005C ON  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+005D ON  Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+005E ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+005F ON  Punctuation: Connector punctuation, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, deprecated, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
 findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 
-U+0060 ON  Symbol: Modifier symbol, common, Other, [ascii, caseignorable, diacritic, graphemebase, patternsyntax]
-U+0061 L   Letter: Lower case letter, latin, Other, U+0041, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0062 L   Letter: Lower case letter, latin, Other, U+0042, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0063 L   Letter: Lower case letter, latin, Other, U+0043, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0064 L   Letter: Lower case letter, latin, Other, U+0044, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0065 L   Letter: Lower case letter, latin, Other, U+0045, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0066 L   Letter: Lower case letter, latin, Other, U+0046, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0067 L   Letter: Lower case letter, latin, Other, U+0047, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0068 L   Letter: Lower case letter, latin, Other, U+0048, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0069 L   Letter: Lower case letter, latin, Other, U+0049, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
-U+006A L   Letter: Lower case letter, latin, Other, U+004A, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
-U+006B L   Letter: Lower case letter, latin, Other, U+004B, U+212A, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+006C L   Letter: Lower case letter, latin, Other, U+004C, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+006D L   Letter: Lower case letter, latin, Other, U+004D, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+006E L   Letter: Lower case letter, latin, Other, U+004E, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+006F L   Letter: Lower case letter, latin, Other, U+004F, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0060 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+0061 L   Letter: Lower case letter, latin, Other, U+0041, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0062 L   Letter: Lower case letter, latin, Other, U+0042, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0063 L   Letter: Lower case letter, latin, Other, U+0043, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0064 L   Letter: Lower case letter, latin, Other, U+0044, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0065 L   Letter: Lower case letter, latin, Other, U+0045, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0066 L   Letter: Lower case letter, latin, Other, U+0046, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0067 L   Letter: Lower case letter, latin, Other, U+0047, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0068 L   Letter: Lower case letter, latin, Other, U+0048, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0069 L   Letter: Lower case letter, latin, Other, U+0049, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+006A L   Letter: Lower case letter, latin, Other, U+004A, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+006B L   Letter: Lower case letter, latin, Other, U+004B, U+212A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006C L   Letter: Lower case letter, latin, Other, U+004C, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006D L   Letter: Lower case letter, latin, Other, U+004D, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006E L   Letter: Lower case letter, latin, Other, U+004E, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+006F L   Letter: Lower case letter, latin, Other, U+004F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 
-U+0070 L   Letter: Lower case letter, latin, Other, U+0050, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0071 L   Letter: Lower case letter, latin, Other, U+0051, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0072 L   Letter: Lower case letter, latin, Other, U+0052, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0073 L   Letter: Lower case letter, latin, Other, U+0053, U+017F, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0074 L   Letter: Lower case letter, latin, Other, U+0054, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0075 L   Letter: Lower case letter, latin, Other, U+0055, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0076 L   Letter: Lower case letter, latin, Other, U+0056, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0077 L   Letter: Lower case letter, latin, Other, U+0057, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0078 L   Letter: Lower case letter, latin, Other, U+0058, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0079 L   Letter: Lower case letter, latin, Other, U+0059, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+007A L   Letter: Lower case letter, latin, Other, U+005A, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+007B ON  Punctuation: Open punctuation, common, Other, [ascii, bidimirrored, graphemebase, patternsyntax]
-U+007C ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, math, patternsyntax]
-U+007D ON  Punctuation: Close punctuation, common, Other, [ascii, bidimirrored, graphemebase, patternsyntax]
-U+007E ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, math, patternsyntax]
-U+007F BN  Control: Control, common, Control, [ascii]
+U+0070 L   Letter: Lower case letter, latin, Other, U+0050, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0071 L   Letter: Lower case letter, latin, Other, U+0051, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0072 L   Letter: Lower case letter, latin, Other, U+0052, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0073 L   Letter: Lower case letter, latin, Other, U+0053, U+017F, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0074 L   Letter: Lower case letter, latin, Other, U+0054, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0075 L   Letter: Lower case letter, latin, Other, U+0055, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0076 L   Letter: Lower case letter, latin, Other, U+0056, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0077 L   Letter: Lower case letter, latin, Other, U+0057, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0078 L   Letter: Lower case letter, latin, Other, U+0058, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+0079 L   Letter: Lower case letter, latin, Other, U+0059, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+007A L   Letter: Lower case letter, latin, Other, U+005A, [alphabetic, caseignorable, diacritic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+007B ON  Punctuation: Open punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+007C ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+007D ON  Punctuation: Close punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+007E ON  Symbol: Mathematical symbol, common, Other, [ascii, graphemebase, idcontinue, xidcontinue]
+U+007F BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
 
 findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 
 U+0080 BN  Control: Control, common, Control
@@ -141,7 +141,7 @@
 U+0082 BN  Control: Control, common, Control
 U+0083 BN  Control: Control, common, Control
 U+0084 BN  Control: Control, common, Control
-U+0085 B   Control: Control, common, Control, [patternwhitespace, whitespace]
+U+0085 B   Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
 U+0086 BN  Control: Control, common, Control
 U+0087 BN  Control: Control, common, Control
 U+0088 BN  Control: Control, common, Control
@@ -170,240 +170,240 @@
 U+009E BN  Control: Control, common, Control
 U+009F BN  Control: Control, common, Control
 findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af 
-U+00A0 CS  Separator: Space separator, common, Other, [graphemebase, whitespace]
-U+00A1 ON  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax]
-U+00A2 ET  Symbol: Currency symbol, common, Other, [graphemebase, patternsyntax]
-U+00A3 ET  Symbol: Currency symbol, common, Other, [graphemebase, patternsyntax]
-U+00A4 ET  Symbol: Currency symbol, common, Other, [graphemebase, patternsyntax]
-U+00A5 ET  Symbol: Currency symbol, common, Other, [graphemebase, patternsyntax]
-U+00A6 ON  Symbol: Other symbol, common, Other, [graphemebase, patternsyntax]
-U+00A7 ON  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax]
-U+00A8 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+00A9 ON  Symbol: Other symbol, common, Extended Pictographic, [emoji, extendedpictographic, graphemebase, patternsyntax]
-U+00AA L   Letter: Other letter, latin, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00AB ON  Punctuation: Initial punctuation, common, Other, [bidimirrored, graphemebase, patternsyntax, quotationmark]
-U+00AC ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
-U+00AD BN  Control: Format, common, Control, [caseignorable, defaultignorablecodepoint]
-U+00AE ON  Symbol: Other symbol, common, Extended Pictographic, [emoji, extendedpictographic, graphemebase, patternsyntax]
-U+00AF ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
+U+00A0 CS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+00A1 ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A2 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A3 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A4 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A5 ET  Symbol: Currency symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A6 ON  Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A7 ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00A8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00A9 ON  Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00AA L   Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
+U+00AB ON  Punctuation: Initial punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
+U+00AC ON  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00AD BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+00AE ON  Symbol: Other symbol, common, Extended Pictographic, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00AF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
 findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf 
-U+00B0 ET  Symbol: Other symbol, common, Other, [graphemebase, patternsyntax]
-U+00B1 ET  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
-U+00B2 EN  Number: Other number, common, Other, [graphemebase]
-U+00B3 EN  Number: Other number, common, Other, [graphemebase]
-U+00B4 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+00B5 L   Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00B6 ON  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax]
-U+00B7 ON  Punctuation: Other punctuation, common, Other, [caseignorable, diacritic, extender, graphemebase, idcontinue, xidcontinue]
-U+00B8 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+00B9 EN  Number: Other number, common, Other, [graphemebase]
-U+00BA L   Letter: Other letter, latin, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00BB ON  Punctuation: Final punctuation, common, Other, [bidimirrored, graphemebase, patternsyntax, quotationmark]
-U+00BC ON  Number: Other number, common, Other, [graphemebase]
-U+00BD ON  Number: Other number, common, Other, [graphemebase]
-U+00BE ON  Number: Other number, common, Other, [graphemebase]
-U+00BF ON  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax]
+U+00B0 ET  Symbol: Other symbol, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00B1 ET  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00B2 EN  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00B3 EN  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00B4 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B5 L   Letter: Lower case letter, common, Other, U+03BC, U+039C, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00B6 ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
+U+00B7 ON  Punctuation: Other punctuation, common, Other, [alphabetic, graphemebase, idcontinue, xidcontinue]
+U+00B8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B9 EN  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BA L   Letter: Other letter, latin, Other, [caseignorable, graphemeextend]
+U+00BB ON  Punctuation: Final punctuation, common, Other, [graphemebase, sentenceterminal, terminalpunctuation]
+U+00BC ON  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BD ON  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BE ON  Number: Other number, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+00BF ON  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, idcontinue, terminalpunctuation, xidcontinue]
 findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf 
-U+00C0 L   Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C1 L   Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C2 L   Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C3 L   Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C4 L   Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C5 L   Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C6 L   Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C7 L   Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C8 L   Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00C9 L   Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00CA L   Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00CB L   Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00CC L   Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00CD L   Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00CE L   Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00CF L   Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+00C0 L   Letter: Upper case letter, latin, Other, U+00E0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C1 L   Letter: Upper case letter, latin, Other, U+00E1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C2 L   Letter: Upper case letter, latin, Other, U+00E2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C3 L   Letter: Upper case letter, latin, Other, U+00E3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C4 L   Letter: Upper case letter, latin, Other, U+00E4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C5 L   Letter: Upper case letter, latin, Other, U+00E5, U+212B, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C6 L   Letter: Upper case letter, latin, Other, U+00E6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C7 L   Letter: Upper case letter, latin, Other, U+00E7, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C8 L   Letter: Upper case letter, latin, Other, U+00E8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00C9 L   Letter: Upper case letter, latin, Other, U+00E9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CA L   Letter: Upper case letter, latin, Other, U+00EA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CB L   Letter: Upper case letter, latin, Other, U+00EB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CC L   Letter: Upper case letter, latin, Other, U+00EC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CD L   Letter: Upper case letter, latin, Other, U+00ED, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CE L   Letter: Upper case letter, latin, Other, U+00EE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00CF L   Letter: Upper case letter, latin, Other, U+00EF, [alphabetic, graphemeextend, idcontinue, xidcontinue]
 findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df 
-U+00D0 L   Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D1 L   Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D2 L   Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D3 L   Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D4 L   Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D5 L   Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D6 L   Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D7 ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
-U+00D8 L   Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00D9 L   Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00DA L   Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00DB L   Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00DC L   Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00DD L   Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00DE L   Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+00DF L   Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+00D0 L   Letter: Upper case letter, latin, Other, U+00F0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D1 L   Letter: Upper case letter, latin, Other, U+00F1, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D2 L   Letter: Upper case letter, latin, Other, U+00F2, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D3 L   Letter: Upper case letter, latin, Other, U+00F3, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D4 L   Letter: Upper case letter, latin, Other, U+00F4, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D5 L   Letter: Upper case letter, latin, Other, U+00F5, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D6 L   Letter: Upper case letter, latin, Other, U+00F6, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D7 ON  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00D8 L   Letter: Upper case letter, latin, Other, U+00F8, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00D9 L   Letter: Upper case letter, latin, Other, U+00F9, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DA L   Letter: Upper case letter, latin, Other, U+00FA, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DB L   Letter: Upper case letter, latin, Other, U+00FB, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DC L   Letter: Upper case letter, latin, Other, U+00FC, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DD L   Letter: Upper case letter, latin, Other, U+00FD, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DE L   Letter: Upper case letter, latin, Other, U+00FE, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+00DF L   Letter: Lower case letter, latin, Other, U+1E9E, [alphabetic, deprecated, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef 
-U+00E0 L   Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E1 L   Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E2 L   Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E3 L   Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E4 L   Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E5 L   Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E6 L   Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E7 L   Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E8 L   Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00E9 L   Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00EA L   Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00EB L   Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00EC L   Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00ED L   Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00EE L   Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00EF L   Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+00E0 L   Letter: Lower case letter, latin, Other, U+00C0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E1 L   Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E2 L   Letter: Lower case letter, latin, Other, U+00C2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E3 L   Letter: Lower case letter, latin, Other, U+00C3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E4 L   Letter: Lower case letter, latin, Other, U+00C4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E5 L   Letter: Lower case letter, latin, Other, U+00C5, U+212B, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E6 L   Letter: Lower case letter, latin, Other, U+00C6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E7 L   Letter: Lower case letter, latin, Other, U+00C7, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E8 L   Letter: Lower case letter, latin, Other, U+00C8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00E9 L   Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EA L   Letter: Lower case letter, latin, Other, U+00CA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EB L   Letter: Lower case letter, latin, Other, U+00CB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EC L   Letter: Lower case letter, latin, Other, U+00CC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00ED L   Letter: Lower case letter, latin, Other, U+00CD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EE L   Letter: Lower case letter, latin, Other, U+00CE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00EF L   Letter: Lower case letter, latin, Other, U+00CF, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
 findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff 
-U+00F0 L   Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F1 L   Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F2 L   Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F3 L   Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F4 L   Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F5 L   Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F6 L   Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F7 ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
-U+00F8 L   Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00F9 L   Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00FA L   Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00FB L   Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00FC L   Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00FD L   Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00FE L   Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+00FF L   Letter: Lower case letter, latin, Other, U+0178, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+00F0 L   Letter: Lower case letter, latin, Other, U+00D0, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F1 L   Letter: Lower case letter, latin, Other, U+00D1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F2 L   Letter: Lower case letter, latin, Other, U+00D2, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F3 L   Letter: Lower case letter, latin, Other, U+00D3, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F4 L   Letter: Lower case letter, latin, Other, U+00D4, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F5 L   Letter: Lower case letter, latin, Other, U+00D5, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F6 L   Letter: Lower case letter, latin, Other, U+00D6, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F7 ON  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+00F8 L   Letter: Lower case letter, latin, Other, U+00D8, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00F9 L   Letter: Lower case letter, latin, Other, U+00D9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FA L   Letter: Lower case letter, latin, Other, U+00DA, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FB L   Letter: Lower case letter, latin, Other, U+00DB, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FC L   Letter: Lower case letter, latin, Other, U+00DC, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FD L   Letter: Lower case letter, latin, Other, U+00DD, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FE L   Letter: Lower case letter, latin, Other, U+00DE, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+00FF L   Letter: Lower case letter, latin, Other, U+0178, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
 
 findprop 0100 0101 0102 0103 0104 0105 0106
-U+0100 L   Letter: Upper case letter, latin, Other, U+0101, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0101 L   Letter: Lower case letter, latin, Other, U+0100, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0102 L   Letter: Upper case letter, latin, Other, U+0103, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0103 L   Letter: Lower case letter, latin, Other, U+0102, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0104 L   Letter: Upper case letter, latin, Other, U+0105, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+0105 L   Letter: Lower case letter, latin, Other, U+0104, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+0106 L   Letter: Upper case letter, latin, Other, U+0107, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+0100 L   Letter: Upper case letter, latin, Other, U+0101, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+0101 L   Letter: Lower case letter, latin, Other, U+0100, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+0102 L   Letter: Upper case letter, latin, Other, U+0103, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+0103 L   Letter: Lower case letter, latin, Other, U+0102, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+0104 L   Letter: Upper case letter, latin, Other, U+0105, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+0105 L   Letter: Lower case letter, latin, Other, U+0104, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+0106 L   Letter: Upper case letter, latin, Other, U+0107, [alphabetic, graphemeextend, idcontinue, xidcontinue]
 
 findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7 
-U+FFE0 ET  Symbol: Currency symbol, common, Other, [graphemebase]
-U+FFE1 ET  Symbol: Currency symbol, common, Other, [graphemebase]
-U+FFE2 ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math]
-U+FFE3 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+FFE4 ON  Symbol: Other symbol, common, Other, [graphemebase]
-U+FFE5 ET  Symbol: Currency symbol, common, Other, [graphemebase]
-U+FFE6 ET  Symbol: Currency symbol, common, Other, [graphemebase]
+U+FFE0 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE1 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE2 ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFE3 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+FFE4 ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE5 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE6 ET  Symbol: Currency symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 U+FFE7 L   Control: Unassigned, unknown, Other
 findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
-U+FFE8 ON  Symbol: Other symbol, common, Other, [graphemebase]
-U+FFE9 ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math]
-U+FFEA ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math]
-U+FFEB ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math]
-U+FFEC ON  Symbol: Mathematical symbol, common, Other, [graphemebase, math]
-U+FFED ON  Symbol: Other symbol, common, Other, [graphemebase]
-U+FFEE ON  Symbol: Other symbol, common, Other, [graphemebase]
+U+FFE8 ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFE9 ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFEA ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFEB ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFEC ON  Symbol: Mathematical symbol, common, Other, [emoji, extendedpictographic, graphemebase]
+U+FFED ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFEE ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 U+FFEF L   Control: Unassigned, unknown, Other
 findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
-U+FFF8 BN  Control: Unassigned, unknown, Control, [defaultignorablecodepoint]
-U+FFF9 ON  Control: Format, common, Control, [caseignorable]
-U+FFFA ON  Control: Format, common, Control, [caseignorable]
-U+FFFB ON  Control: Format, common, Control, [caseignorable]
-U+FFFC ON  Symbol: Other symbol, common, Other, [graphemebase]
-U+FFFD ON  Symbol: Other symbol, common, Other, [graphemebase]
-U+FFFE BN  Control: Unassigned, unknown, Other, [noncharactercodepoint]
-U+FFFF BN  Control: Unassigned, unknown, Other, [noncharactercodepoint]
+U+FFF8 BN  Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
+U+FFF9 ON  Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
+U+FFFA ON  Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
+U+FFFB ON  Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
+U+FFFC ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFFD ON  Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FFFE BN  Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FFFF BN  Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
 findprop 10000 10001 e01ef f0000 100000
-U+10000 L   Letter: Other letter, linearb, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10001 L   Letter: Other letter, linearb, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, variationselector, xidcontinue]
+U+10000 L   Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10001 L   Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
 U+F0000 L   Control: Private use, unknown, Other
 U+100000 L   Control: Private use, unknown, Other
 
 findprop 1b00 12000 7c0 a840 10900
-U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, caseignorable, graphemeextend, idcontinue, xidcontinue]
-U+12000 L   Letter: Other letter, cuneiform, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+07C0 R   Number: Decimal number, nko, Other, [graphemebase, idcontinue, xidcontinue]
-U+A840 L   Letter: Other letter, phagspa, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10900 R   Letter: Other letter, phoenician, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+1B00 NSM Mark: Non-spacing mark, balinese, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
+U+12000 L   Letter: Other letter, cuneiform, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+07C0 R   Number: Decimal number, nko, Other, [graphemebase, patternsyntax, terminalpunctuation]
+U+A840 L   Letter: Other letter, phagspa, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10900 R   Letter: Other letter, phoenician, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 findprop 1d79 a77d
-U+1D79 L   Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
-U+A77D L   Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+1D79 L   Letter: Lower case letter, latin, Other, U+A77D, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
+U+A77D L   Letter: Upper case letter, latin, Other, U+1D79, [alphabetic, graphemeextend, idcontinue, xidcontinue]
 
 findprop  0800  083e  a4d0  a4f7  aa80  aadf
-U+0800 R   Letter: Other letter, samaritan, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+083E R   Punctuation: Other punctuation, samaritan, Other, [graphemebase, sentenceterminal, terminalpunctuation]
-U+A4D0 L   Letter: Other letter, lisu, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+A4F7 L   Letter: Other letter, lisu, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AA80 L   Letter: Other letter, taiviet, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AADF L   Punctuation: Other punctuation, taiviet, Other, [graphemebase, terminalpunctuation]
+U+0800 R   Letter: Other letter, samaritan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+083E R   Punctuation: Other punctuation, samaritan, Other, [bidimirrored, graphemebase, math, patternsyntax]
+U+A4D0 L   Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+A4F7 L   Letter: Other letter, lisu, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AA80 L   Letter: Other letter, taiviet, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AADF L   Punctuation: Other punctuation, taiviet, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
 findprop 10b00 10b35 13000 1342e 10840 10855
-U+10B00 R   Letter: Other letter, avestan, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10B35 R   Letter: Other letter, avestan, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+13000 L   Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+1342E L   Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10840 R   Letter: Other letter, imperialaramaic, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10855 R   Letter: Other letter, imperialaramaic, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+10B00 R   Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10B35 R   Letter: Other letter, avestan, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+13000 L   Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1342E L   Letter: Other letter, egyptianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10840 R   Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10855 R   Letter: Other letter, imperialaramaic, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 
 findprop 11100 1113c 11680 116c0
-U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, caseignorable, graphemeextend, idcontinue, xidcontinue]
-U+1113C L   Number: Decimal number, chakma, Other, [graphemebase, idcontinue, xidcontinue]
-U+11680 L   Letter: Other letter, takri, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+116C0 L   Number: Decimal number, takri, Other, [graphemebase, idcontinue, xidcontinue]
+U+11100 NSM Mark: Non-spacing mark, chakma, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
+U+1113C L   Number: Decimal number, chakma, Other, [graphemebase, patternsyntax, terminalpunctuation]
+U+11680 L   Letter: Other letter, takri, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+116C0 L   Number: Decimal number, takri, Other, [graphemebase, patternsyntax, terminalpunctuation]
 
 findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
-U+000D B   Control: Control, common, CR, [ascii, patternwhitespace, whitespace]
-U+000A B   Control: Control, common, LF, [ascii, patternwhitespace, whitespace]
-U+000E BN  Control: Control, common, Control, [ascii]
-U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, caseignorable, graphemeextend, idcontinue, xidcontinue]
-U+1B04 L   Mark: Spacing mark, balinese, SpacingMark, [alphabetic, graphemebase, idcontinue, xidcontinue]
-U+1111 L   Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+1169 L   Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+11FE L   Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AE4C L   Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AD89 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+000D B   Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000A B   Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000E BN  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+U+0711 NSM Mark: Non-spacing mark, syriac, Extend, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, uppercase]
+U+1B04 L   Mark: Spacing mark, balinese, SpacingMark, [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
+U+1111 L   Letter: Other letter, hangul, Hangul syllable type L, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1169 L   Letter: Other letter, hangul, Hangul syllable type V, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+11FE L   Letter: Other letter, hangul, Hangul syllable type T, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE4C L   Letter: Other letter, hangul, Hangul syllable type LV, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD89 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 
 findprop 118a0 11ac7 16ad0
-U+118A0 L   Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+11AC7 L   Letter: Other letter, paucinhau, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+16AD0 L   Letter: Other letter, bassavah, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+118A0 L   Letter: Upper case letter, warangciti, Other, U+118C0, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+11AC7 L   Letter: Other letter, paucinhau, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+16AD0 L   Letter: Other letter, bassavah, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 
 findprop 11700 14400 108e0 11280 1d800
-U+11700 L   Letter: Other letter, ahom, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+14400 L   Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+108E0 R   Letter: Other letter, hatran, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+11280 L   Letter: Other letter, multani, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+1D800 L   Symbol: Other symbol, signwriting, Other, [graphemebase]
+U+11700 L   Letter: Other letter, ahom, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+14400 L   Letter: Other letter, anatolianhieroglyphs, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+108E0 R   Letter: Other letter, hatran, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+11280 L   Letter: Other letter, multani, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1D800 L   Symbol: Other symbol, signwriting, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 
 findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
-U+11800 L   Letter: Other letter, dogra, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+1E903 R   Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+11DA9 L   Number: Decimal number, gunjalagondi, Other, [graphemebase, idcontinue, xidcontinue]
-U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
-U+11EE0 L   Letter: Other letter, makasar, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+16E48 L   Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
-U+10F27 R   Letter: Other letter, oldsogdian, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10F30 AL  Letter: Other letter, sogdian, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+11800 L   Letter: Other letter, dogra, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+1E903 R   Letter: Upper case letter, adlam, Other, U+1E925, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+11DA9 L   Number: Decimal number, gunjalagondi, Other, [graphemebase, patternsyntax, terminalpunctuation]
+U+10D27 NSM Mark: Non-spacing mark, hanifirohingya, Extend, [extendedpictographic, graphemebase, patternsyntax]
+U+11EE0 L   Letter: Other letter, makasar, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+16E48 L   Letter: Upper case letter, medefaidrin, Other, U+16E68, [alphabetic, graphemeextend, idcontinue, xidcontinue]
+U+10F27 R   Letter: Other letter, oldsogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10F30 AL  Letter: Other letter, sogdian, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 
 findprop  a836  a833  1cf4  20f0  1cd0
-U+A836 L   Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [graphemebase]
-U+A833 L   Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [graphemebase]
-U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
-U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemeextend, idcontinue, xidcontinue]
-U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+A836 L   Symbol: Other symbol, common, Other, [devanagari, gurmukhi, gujarati, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+A833 L   Number: Other number, common, Other, [devanagari, gurmukhi, gujarati, kannada, kaithi, takri, khojki, mahajani, modi, khudawadi, tirhuta, dogra, nandinagari], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+1CF4 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+20F0 NSM Mark: Non-spacing mark, inherited, Extend, [latin, devanagari, grantha], [caseignorable, graphemebase, patternsyntax, quotationmark]
+U+1CD0 NSM Mark: Non-spacing mark, inherited, Extend, [devanagari, bengali, kannada, grantha], [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
 
 findprop 32ff
-U+32FF L   Symbol: Other symbol, common, Other, [han], [graphemebase]
+U+32FF L   Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
 
 findprop 1f16d
-U+1F16D ON  Symbol: Other symbol, common, Extended Pictographic, [extendedpictographic, graphemebase]
+U+1F16D ON  Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
 
 findprop U+10e93 U+10eaa
-U+10E93 R   Letter: Other letter, yezidi, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+10E93 R   Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 U+10EAA R   Control: Unassigned, unknown, Other
 
 findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
-U+0602 AN  Control: Format, arabic, Prepend, [caseignorable, prependedconcatenationmark]
-U+202A LRE Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+202B RLE Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+202C PDF Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+2068 FSI Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+2069 PDI Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+202D LRO Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+202E RLO Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
-U+2067 RLI Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
+U+0602 AN  Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
+U+202A LRE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202B RLE Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202D LRO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
+U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
diff --git a/maint/ucptestdata/testoutput2 b/maint/ucptestdata/testoutput2
index 9eba2ae..088e802 100644
--- a/maint/ucptestdata/testoutput2
+++ b/maint/ucptestdata/testoutput2
@@ -1,278 +1,298 @@
 find script Han
-U+2E80..U+2E99 ON  Symbol: Other symbol, han, Other, [graphemebase, radical]
-U+2E9B..U+2EF3 ON  Symbol: Other symbol, han, Other, [graphemebase, radical]
-U+2F00..U+2FD5 ON  Symbol: Other symbol, han, Other, [graphemebase, radical]
-        U+3005 L   Letter: Modifier letter, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-        U+3007 L   Number: Letter number, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+3021..U+3029 L   Number: Letter number, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+3038..U+303A L   Number: Letter number, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-        U+303B L   Letter: Modifier letter, han, Other, [alphabetic, caseignorable, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+3400..U+4DBF L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+4E00..U+9FFF L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+F900..U+FA0D L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+FA0E..U+FA0F L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-        U+FA10 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-        U+FA11 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-        U+FA12 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+FA13..U+FA14 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+FA15..U+FA1E L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-        U+FA1F L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-        U+FA20 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-        U+FA21 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-        U+FA22 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+FA23..U+FA24 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+FA25..U+FA26 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+FA27..U+FA29 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+FA2A..U+FA6D L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+FA70..U+FAD9 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-        U+16FE2 ON  Punctuation: Other punctuation, han, Other, [graphemebase]
-       U+16FE3 L   Letter: Modifier letter, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+16FF0..U+16FF1 L   Mark: Spacing mark, han, SpacingMark, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
-U+20000..U+2A6DF L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+2A700..U+2B738 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+2B740..U+2B81D L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+2B820..U+2CEA1 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+2CEB0..U+2EBE0 L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
-U+2F800..U+2FA1D L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
-U+30000..U+3134A L   Letter: Other letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, unifiedideograph, xidcontinue, xidstart]
+U+2E80..U+2E99 ON  Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
+U+2E9B..U+2EF3 ON  Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
+U+2F00..U+2FD5 ON  Symbol: Other symbol, han, Other, [ascii, sentenceterminal, unifiedideograph, whitespace, xidstart]
+        U+3005 L   Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+3007 L   Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+3021..U+3029 L   Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+3038..U+303A L   Number: Letter number, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+303B L   Letter: Modifier letter, han, Other, [alphabetic, graphemebase, idcontinue, idstart, ideographic, xidcontinue, xidstart]
+U+3400..U+4DBF L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+4E00..U+9FFF L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+F900..U+FA0D L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA0E..U+FA0F L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA10 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+FA11 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA12 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA13..U+FA14 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FA15..U+FA1E L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+FA1F L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA20 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+FA21 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+        U+FA22 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA23..U+FA24 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FA25..U+FA26 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA27..U+FA29 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FA2A..U+FA6D L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+FA70..U+FAD9 L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+        U+16FE2 ON  Punctuation: Other punctuation, han, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+       U+16FE3 L   Letter: Modifier letter, han, Other, [emoji, emojimodifierbase, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+16FF0..U+16FF1 L   Mark: Spacing mark, han, SpacingMark, [caseignorable, graphemeextend, idcontinue, ideographic, xidcontinue]
+U+20000..U+2A6DF L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2A700..U+2B738 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2B740..U+2B81D L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2B820..U+2CEA1 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2CEB0..U+2EBE0 L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+2F800..U+2FA1D L   Letter: Other letter, han, Other, [sentenceterminal, unifiedideograph, xidcontinue, xidstart]
+U+30000..U+3134A L   Letter: Other letter, han, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
 find type Pe script Common scriptx Hangul
-U+3009 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+300B ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+300D ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax, quotationmark]
-U+300F ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax, quotationmark]
-U+3011 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+3015 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+3017 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+3019 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+301B ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, patternsyntax]
-U+301E..U+301F ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [graphemebase, patternsyntax, quotationmark]
-        U+FF63 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [bidimirrored, graphemebase, quotationmark]
+U+3009 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+300B ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+300D ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
+U+300F ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [graphemebase, sentenceterminal, terminalpunctuation]
+U+3011 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+3015 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+3017 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+3019 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+301B ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, dash, emojimodifier, emojimodifierbase]
+U+301E..U+301F ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [softdotted, terminalpunctuation, unifiedideograph, xidcontinue, xidstart]
+        U+FF63 ON  Punctuation: Close punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han, yiii], [changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, emojimodifier, emojimodifierbase]
 find type Sk
-U+005E ON  Symbol: Modifier symbol, common, Other, [ascii, caseignorable, diacritic, graphemebase, math, patternsyntax]
-U+0060 ON  Symbol: Modifier symbol, common, Other, [ascii, caseignorable, diacritic, graphemebase, patternsyntax]
-U+00A8 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+00AF ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+00B4 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+00B8 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+02C2..U+02C5 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+02D2..U+02DF ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+02E5..U+02E9 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+02EA..U+02EB ON  Symbol: Modifier symbol, bopomofo, Other, [caseignorable, diacritic, graphemebase]
-        U+02ED ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+02EF..U+02FF ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-        U+0375 ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-        U+0384 ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-        U+0385 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-        U+0888 AL  Symbol: Modifier symbol, arabic, Other, [caseignorable, graphemebase]
-        U+1FBD ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-U+1FBF..U+1FC1 ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-U+1FCD..U+1FCF ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-U+1FDD..U+1FDF ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-U+1FED..U+1FEF ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-U+1FFD..U+1FFE ON  Symbol: Modifier symbol, greek, Other, [caseignorable, diacritic, graphemebase]
-U+309B..U+309C ON  Symbol: Modifier symbol, common, Other, [hiragana, katakana], [caseignorable, diacritic, graphemebase, idcontinue, idstart]
-U+A700..U+A707 ON  Symbol: Modifier symbol, common, Other, [latin, han], [caseignorable, diacritic, graphemebase]
-U+A708..U+A716 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+A720..U+A721 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+A789..U+A78A L   Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-        U+AB5B L   Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+AB6A..U+AB6B ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+FBB2..U+FBC2 AL  Symbol: Modifier symbol, arabic, Other, [caseignorable, graphemebase]
-        U+FF3E ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase, math]
-        U+FF40 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-        U+FFE3 ON  Symbol: Modifier symbol, common, Other, [caseignorable, diacritic, graphemebase]
-U+1F3FB..U+1F3FF ON  Symbol: Modifier symbol, common, Extend, [caseignorable, emoji, emojicomponent, emojimodifier, emojipresentation, graphemebase]
+U+005E ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+0060 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, changeswhencasefolded, changeswhencasemapped, changeswhenlowercased, changeswhentitlecased, graphemebase, idcontinue, idstart, uppercase, xidcontinue, xidstart]
+U+00A8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00AF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B4 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+00B8 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02C2..U+02C5 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02D2..U+02DF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02E5..U+02E9 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02EA..U+02EB ON  Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+02ED ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+02EF..U+02FF ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0375 ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0384 ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0385 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+0888 AL  Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
+        U+1FBD ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FBF..U+1FC1 ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FCD..U+1FCF ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FDD..U+1FDF ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FED..U+1FEF ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1FFD..U+1FFE ON  Symbol: Modifier symbol, greek, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+309B..U+309C ON  Symbol: Modifier symbol, common, Other, [hiragana, katakana], [alphabetic, bidimirrored, caseignorable, cased, changeswhencasefolded, changeswhenlowercased, changeswhentitlecased, changeswhenuppercased, dash, defaultignorablecodepoint, deprecated, diacritic, emoji, emojicomponent, emojimodifier, emojimodifierbase, emojipresentation, extendedpictographic, extender, graphemebase, graphemeextend, graphemelink, hexdigit, idsbinaryoperator, idstrinaryoperator, idcontinue, idstart, ideographic, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
+U+A700..U+A707 ON  Symbol: Modifier symbol, common, Other, [latin, han], [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+A708..U+A716 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+A720..U+A721 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+A789..U+A78A L   Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+AB5B L   Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+AB6A..U+AB6B ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+FBB2..U+FBC2 AL  Symbol: Modifier symbol, arabic, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, math, softdotted, xidcontinue, xidstart]
+        U+FF3E ON  Symbol: Modifier symbol, common, Other, [asciihexdigit, bidicontrol, bidimirrored, cased, changeswhencasefolded, sentenceterminal, unifiedideograph, whitespace, xidstart]
+        U+FF40 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+        U+FFE3 ON  Symbol: Modifier symbol, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+1F3FB..U+1F3FF ON  Symbol: Modifier symbol, common, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternsyntax, radical, sentenceterminal, terminalpunctuation]
 find type Pd
-U+002D ES  Punctuation: Dash punctuation, common, Other, [ascii, dash, graphemebase, patternsyntax]
-U+058A ON  Punctuation: Dash punctuation, armenian, Other, [dash, graphemebase]
-U+05BE R   Punctuation: Dash punctuation, hebrew, Other, [dash, graphemebase]
-U+1400 ON  Punctuation: Dash punctuation, canadianaboriginal, Other, [dash, graphemebase]
-U+1806 ON  Punctuation: Dash punctuation, mongolian, Other, [dash, graphemebase]
-U+2010..U+2015 ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, patternsyntax]
-        U+2E17 ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, patternsyntax]
-        U+2E1A ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, patternsyntax]
-U+2E3A..U+2E3B ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, patternsyntax]
-        U+2E40 ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, patternsyntax]
-        U+2E5D ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, patternsyntax]
-        U+301C ON  Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, graphemebase, patternsyntax]
-        U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [dash, emoji, extendedpictographic, graphemebase, patternsyntax]
-        U+30A0 ON  Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [dash, graphemebase]
-U+FE31..U+FE32 ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase]
-        U+FE58 ON  Punctuation: Dash punctuation, common, Other, [dash, graphemebase]
-        U+FE63 ES  Punctuation: Dash punctuation, common, Other, [dash, graphemebase, math]
-        U+FF0D ES  Punctuation: Dash punctuation, common, Other, [dash, graphemebase]
-        U+10EAD R   Punctuation: Dash punctuation, yezidi, Other, [dash, graphemebase]
+U+002D ES  Punctuation: Dash punctuation, common, Other, [ascii, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+058A ON  Punctuation: Dash punctuation, armenian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+05BE R   Punctuation: Dash punctuation, hebrew, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+1400 ON  Punctuation: Dash punctuation, canadianaboriginal, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+1806 ON  Punctuation: Dash punctuation, mongolian, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+2010..U+2015 ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E17 ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E1A ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+U+2E3A..U+2E3B ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E40 ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+2E5D ON  Punctuation: Dash punctuation, common, Other, [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+301C ON  Punctuation: Dash punctuation, common, Other, [hangul, hiragana, katakana, bopomofo, han], [dash, defaultignorablecodepoint, deprecated, emojipresentation, joincontrol, lowercase, patternwhitespace, radical, regionalindicator, softdotted, xidcontinue, xidstart]
+        U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+        U+30A0 ON  Punctuation: Dash punctuation, common, Other, [hiragana, katakana], [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+U+FE31..U+FE32 ON  Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+FE58 ON  Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+FE63 ES  Punctuation: Dash punctuation, common, Other, [caseignorable, sentenceterminal, unifiedideograph, xidcontinue]
+        U+FF0D ES  Punctuation: Dash punctuation, common, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
+        U+10EAD R   Punctuation: Dash punctuation, yezidi, Other, [emoji, emojipresentation, extendedpictographic, graphemebase, patternsyntax]
 find gbreak LVT
-U+AC01..U+AC1B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AC1D..U+AC37 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AC39..U+AC53 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AC55..U+AC6F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AC71..U+AC8B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AC8D..U+ACA7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ACA9..U+ACC3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ACC5..U+ACDF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ACE1..U+ACFB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ACFD..U+AD17 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AD19..U+AD33 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AD35..U+AD4F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AD51..U+AD6B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AD6D..U+AD87 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AD89..U+ADA3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ADA5..U+ADBF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ADC1..U+ADDB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ADDD..U+ADF7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+ADF9..U+AE13 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AE15..U+AE2F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AE31..U+AE4B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AE4D..U+AE67 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AE69..U+AE83 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AE85..U+AE9F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AEA1..U+AEBB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AEBD..U+AED7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AED9..U+AEF3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AEF5..U+AF0F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AF11..U+AF2B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AF2D..U+AF47 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AF49..U+AF63 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AF65..U+AF7F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AF81..U+AF9B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AF9D..U+AFB7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AFB9..U+AFD3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AFD5..U+AFEF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+AFF1..U+B00B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B00D..U+B027 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B029..U+B043 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B045..U+B05F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B061..U+B07B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B07D..U+B097 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B099..U+B0B3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B0B5..U+B0CF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B0D1..U+B0EB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B0ED..U+B107 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B109..U+B123 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B125..U+B13F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B141..U+B15B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B15D..U+B177 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B179..U+B193 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B195..U+B1AF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B1B1..U+B1CB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B1CD..U+B1E7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B1E9..U+B203 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B205..U+B21F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B221..U+B23B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B23D..U+B257 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B259..U+B273 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B275..U+B28F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B291..U+B2AB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B2AD..U+B2C7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B2C9..U+B2E3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B2E5..U+B2FF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B301..U+B31B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B31D..U+B337 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B339..U+B353 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B355..U+B36F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B371..U+B38B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B38D..U+B3A7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B3A9..U+B3C3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B3C5..U+B3DF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B3E1..U+B3FB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B3FD..U+B417 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B419..U+B433 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B435..U+B44F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B451..U+B46B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B46D..U+B487 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B489..U+B4A3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B4A5..U+B4BF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B4C1..U+B4DB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B4DD..U+B4F7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B4F9..U+B513 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B515..U+B52F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B531..U+B54B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B54D..U+B567 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B569..U+B583 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B585..U+B59F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B5A1..U+B5BB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B5BD..U+B5D7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B5D9..U+B5F3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B5F5..U+B60F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B611..U+B62B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B62D..U+B647 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B649..U+B663 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B665..U+B67F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B681..U+B69B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B69D..U+B6B7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B6B9..U+B6D3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+B6D5..U+B6EF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+AC01..U+AC1B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC1D..U+AC37 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC39..U+AC53 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC55..U+AC6F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC71..U+AC8B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AC8D..U+ACA7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACA9..U+ACC3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACC5..U+ACDF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACE1..U+ACFB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ACFD..U+AD17 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD19..U+AD33 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD35..U+AD4F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD51..U+AD6B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD6D..U+AD87 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AD89..U+ADA3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADA5..U+ADBF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADC1..U+ADDB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADDD..U+ADF7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+ADF9..U+AE13 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE15..U+AE2F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE31..U+AE4B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE4D..U+AE67 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE69..U+AE83 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AE85..U+AE9F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AEA1..U+AEBB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AEBD..U+AED7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AED9..U+AEF3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AEF5..U+AF0F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF11..U+AF2B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF2D..U+AF47 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF49..U+AF63 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF65..U+AF7F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF81..U+AF9B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AF9D..U+AFB7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AFB9..U+AFD3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AFD5..U+AFEF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+AFF1..U+B00B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B00D..U+B027 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B029..U+B043 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B045..U+B05F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B061..U+B07B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B07D..U+B097 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B099..U+B0B3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B0B5..U+B0CF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B0D1..U+B0EB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B0ED..U+B107 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B109..U+B123 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B125..U+B13F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B141..U+B15B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B15D..U+B177 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B179..U+B193 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B195..U+B1AF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B1B1..U+B1CB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B1CD..U+B1E7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B1E9..U+B203 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B205..U+B21F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B221..U+B23B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B23D..U+B257 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B259..U+B273 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B275..U+B28F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B291..U+B2AB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B2AD..U+B2C7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B2C9..U+B2E3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B2E5..U+B2FF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B301..U+B31B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B31D..U+B337 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B339..U+B353 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B355..U+B36F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B371..U+B38B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B38D..U+B3A7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3A9..U+B3C3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3C5..U+B3DF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3E1..U+B3FB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B3FD..U+B417 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B419..U+B433 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B435..U+B44F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B451..U+B46B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B46D..U+B487 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B489..U+B4A3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4A5..U+B4BF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4C1..U+B4DB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4DD..U+B4F7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B4F9..U+B513 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B515..U+B52F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B531..U+B54B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B54D..U+B567 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B569..U+B583 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B585..U+B59F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5A1..U+B5BB L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5BD..U+B5D7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5D9..U+B5F3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B5F5..U+B60F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B611..U+B62B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B62D..U+B647 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B649..U+B663 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B665..U+B67F L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B681..U+B69B L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B69D..U+B6B7 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B6B9..U+B6D3 L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+B6D5..U+B6EF L   Letter: Other letter, hangul, Hangul syllable type LVT, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 ...
 find script Old_Uyghur
-U+10F70..U+10F81 R   Letter: Other letter, olduyghur, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
-U+10F86..U+10F89 R   Punctuation: Other punctuation, olduyghur, Other, [graphemebase, sentenceterminal, terminalpunctuation]
+U+10F70..U+10F81 R   Letter: Other letter, olduyghur, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+10F82..U+10F85 NSM Mark: Non-spacing mark, olduyghur, Extend, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, softdotted, xidcontinue, xidstart]
+U+10F86..U+10F89 R   Punctuation: Other punctuation, olduyghur, Other, [bidimirrored, graphemebase, math, patternsyntax]
 find bidi PDF
-U+202C PDF Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
+U+202C PDF Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
 find bidi CS
-U+002C CS  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax, terminalpunctuation]
-U+002E CS  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
-U+002F CS  Punctuation: Other punctuation, common, Other, [ascii, graphemebase, patternsyntax]
-U+003A CS  Punctuation: Other punctuation, common, Other, [ascii, caseignorable, graphemebase, patternsyntax, terminalpunctuation]
-U+00A0 CS  Separator: Space separator, common, Other, [graphemebase, whitespace]
-U+060C CS  Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, terminalpunctuation]
-U+202F CS  Separator: Space separator, common, Other, [latin, mongolian], [graphemebase, whitespace]
-U+2044 CS  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
-U+FE50 CS  Punctuation: Other punctuation, common, Other, [graphemebase, terminalpunctuation]
-U+FE52 CS  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, sentenceterminal, terminalpunctuation]
-U+FE55 CS  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, terminalpunctuation]
-U+FF0C CS  Punctuation: Other punctuation, common, Other, [graphemebase, terminalpunctuation]
-U+FF0E CS  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, sentenceterminal, terminalpunctuation]
-U+FF0F CS  Punctuation: Other punctuation, common, Other, [graphemebase]
-U+FF1A CS  Punctuation: Other punctuation, common, Other, [caseignorable, graphemebase, terminalpunctuation]
+U+002C CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, hexdigit, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+002E CS  Punctuation: Other punctuation, common, Other, [graphemebase, whitespace]
+U+002F CS  Punctuation: Other punctuation, common, Other, [ascii, asciihexdigit, emoji, emojicomponent, graphemebase, hexdigit, idcontinue, xidcontinue]
+U+003A CS  Punctuation: Other punctuation, common, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, lowercase, xidcontinue, xidstart]
+U+00A0 CS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+060C CS  Punctuation: Other punctuation, common, Other, [arabic, syriac, thaana, nko, hanifirohingya, yezidi], [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+202F CS  Separator: Space separator, common, Other, [latin, mongolian], [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+2044 CS  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
+U+FE50 CS  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+FE52 CS  Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FE55 CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FF0C CS  Punctuation: Other punctuation, common, Other, [graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+FF0E CS  Punctuation: Other punctuation, common, Other, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
+U+FF0F CS  Punctuation: Other punctuation, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+FF1A CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
 find bidi CS type Sm
-U+2044 CS  Symbol: Mathematical symbol, common, Other, [graphemebase, math, patternsyntax]
+U+2044 CS  Symbol: Mathematical symbol, common, Other, [alphabetic, caseignorable, diacritic, graphemeextend, idcontinue, xidcontinue]
 find bidi B
-U+000A B   Control: Control, common, LF, [ascii, patternwhitespace, whitespace]
-U+000D B   Control: Control, common, CR, [ascii, patternwhitespace, whitespace]
-U+001C..U+001E B   Control: Control, common, Control, [ascii]
-        U+0085 B   Control: Control, common, Control, [patternwhitespace, whitespace]
-        U+2029 B   Separator: Paragraph separator, common, Control, [patternwhitespace, whitespace]
+U+000A B   Control: Control, common, LF, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000D B   Control: Control, common, CR, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+001C..U+001E B   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
+        U+0085 B   Control: Control, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
+        U+2029 B   Separator: Paragraph separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
 find bidi FSI
-U+2068 FSI Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
+U+2068 FSI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
 find bidi PDI
-U+2069 PDI Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
+U+2069 PDI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
 find bidi RLI
-U+2067 RLI Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
+U+2067 RLI Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
 find bidi RLO
-U+202E RLO Control: Format, common, Control, [bidicontrol, caseignorable, defaultignorablecodepoint]
+U+202E RLO Control: Format, common, Control, [extendedpictographic, graphemebase, math, patternsyntax]
 find bidi S
-U+0009 S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
-U+000B S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
-U+001F S   Control: Control, common, Control, [ascii]
+U+0009 S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+000B S   Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+001F S   Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
 find bidi WS
-U+000C WS  Control: Control, common, Control, [ascii, patternwhitespace, whitespace]
-U+0020 WS  Separator: Space separator, common, Other, [ascii, graphemebase, patternwhitespace, whitespace]
-U+1680 WS  Separator: Space separator, ogham, Other, [graphemebase, whitespace]
-U+2000..U+200A WS  Separator: Space separator, common, Other, [graphemebase, whitespace]
-        U+2028 WS  Separator: Line separator, common, Control, [patternwhitespace, whitespace]
-        U+205F WS  Separator: Space separator, common, Other, [graphemebase, whitespace]
-        U+3000 WS  Separator: Space separator, common, Other, [graphemebase, whitespace]
+U+000C WS  Control: Control, common, Control, [ascii, graphemebase, patternsyntax, sentenceterminal, terminalpunctuation]
+U+0020 WS  Separator: Space separator, common, Other, [ascii, emoji, emojicomponent, graphemebase, patternsyntax]
+U+1680 WS  Separator: Space separator, ogham, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+U+2000..U+200A WS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+        U+2028 WS  Separator: Line separator, common, Control, [caseignorable, defaultignorablecodepoint, graphemeextend, idcontinue, xidcontinue]
+        U+205F WS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
+        U+3000 WS  Separator: Space separator, common, Other, [alphabetic, caseignorable, cased, diacritic, graphemebase, idcontinue, idstart, lowercase]
 find script bopo
-U+02EA..U+02EB ON  Symbol: Modifier symbol, bopomofo, Other, [caseignorable, diacritic, graphemebase]
-U+3105..U+312F L   Letter: Other letter, bopomofo, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
-U+31A0..U+31BF L   Letter: Other letter, bopomofo, Other, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
+U+02EA..U+02EB ON  Symbol: Modifier symbol, bopomofo, Other, [alphabetic, cased, graphemebase, idcontinue, idstart, math, uppercase, xidcontinue, xidstart]
+U+3105..U+312F L   Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
+U+31A0..U+31BF L   Letter: Other letter, bopomofo, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
 find bool prependedconcatenationmark
-U+0600..U+0604 AN  Control: Format, arabic, Prepend, [caseignorable, prependedconcatenationmark]
-        U+0605 AN  Control: Format, common, Prepend, [caseignorable, prependedconcatenationmark]
-        U+06DD AN  Control: Format, common, Prepend, [caseignorable, prependedconcatenationmark]
-        U+070F AL  Control: Format, syriac, Prepend, [caseignorable, prependedconcatenationmark]
-U+0890..U+0891 AN  Control: Format, arabic, Prepend, [caseignorable, prependedconcatenationmark]
-        U+08E2 AN  Control: Format, common, Prepend, [caseignorable, prependedconcatenationmark]
-        U+110BD L   Control: Format, kaithi, Prepend, [caseignorable, prependedconcatenationmark]
-       U+110CD L   Control: Format, kaithi, Prepend, [caseignorable, prependedconcatenationmark]
+U+00AD BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+180E BN  Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
+U+200B BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2060 BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2118 ON  Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+AAC0 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+AAC2 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
+U+FE55 CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FEFF BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+FF1A CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FF21..U+FF26 L   Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+10D22..U+10D23 AL  Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+       U+1135D L   Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+1BCA0..U+1BCA3 BN  Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
+U+1D173..U+1D17A BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+1F1E6..U+1F1FF L   Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
 find bool pcm
-U+0600..U+0604 AN  Control: Format, arabic, Prepend, [caseignorable, prependedconcatenationmark]
-        U+0605 AN  Control: Format, common, Prepend, [caseignorable, prependedconcatenationmark]
-        U+06DD AN  Control: Format, common, Prepend, [caseignorable, prependedconcatenationmark]
-        U+070F AL  Control: Format, syriac, Prepend, [caseignorable, prependedconcatenationmark]
-U+0890..U+0891 AN  Control: Format, arabic, Prepend, [caseignorable, prependedconcatenationmark]
-        U+08E2 AN  Control: Format, common, Prepend, [caseignorable, prependedconcatenationmark]
-        U+110BD L   Control: Format, kaithi, Prepend, [caseignorable, prependedconcatenationmark]
-       U+110CD L   Control: Format, kaithi, Prepend, [caseignorable, prependedconcatenationmark]
+U+00AD BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+180E BN  Control: Format, mongolian, Control, [caseignorable, prependedconcatenationmark]
+U+200B BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2060 BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+2118 ON  Symbol: Mathematical symbol, common, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+3030 ON  Punctuation: Dash punctuation, common, Extended Pictographic, [hangul, hiragana, katakana, bopomofo, han], [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, lowercase, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+AAC0 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+AAC2 L   Letter: Other letter, taiviet, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+U+FE0F NSM Mark: Non-spacing mark, inherited, Extend, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
+U+FE55 CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FEFF BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+FF1A CS  Punctuation: Other punctuation, common, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+FF21..U+FF26 L   Letter: Upper case letter, latin, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, noncharactercodepoint, patternwhitespace, prependedconcatenationmark]
+U+10D22..U+10D23 AL  Letter: Other letter, hanifirohingya, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, logicalorderexception, lowercase, math, patternwhitespace, prependedconcatenationmark]
+       U+1135D L   Letter: Other letter, grantha, Other, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
+U+1BCA0..U+1BCA3 BN  Control: Format, common, Control, [duployan], [caseignorable, prependedconcatenationmark]
+U+1D173..U+1D17A BN  Control: Format, common, Control, [caseignorable, prependedconcatenationmark]
+U+1F1E6..U+1F1FF L   Symbol: Other symbol, common, Regional Indicator, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, math, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
diff --git a/perltest.sh b/perltest.sh
index 31406c5..aa1aa37 100755
--- a/perltest.sh
+++ b/perltest.sh
@@ -218,7 +218,7 @@
 
   $showrest = ($mod =~ s/aftertext,?//);
 
-  # The "subject_literal" modifer disables escapes in subjects.
+  # The "subject_literal" modifier disables escapes in subjects.
 
   $subject_literal = ($mod =~ s/subject_literal,?//);
 
diff --git a/src/config.h.generic b/src/config.h.generic
index 76dc586..5548d18 100644
--- a/src/config.h.generic
+++ b/src/config.h.generic
@@ -236,7 +236,7 @@
 #define PACKAGE_NAME "PCRE2"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "PCRE2 10.40"
+#define PACKAGE_STRING "PCRE2 10.42"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "pcre2"
@@ -245,7 +245,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "10.40"
+#define PACKAGE_VERSION "10.42"
 
 /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
    parentheses (of any kind) in a pattern. This limits the amount of system
@@ -438,7 +438,13 @@
 #endif
 
 /* Version number of package */
-#define VERSION "10.40"
+#define VERSION "10.42"
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
 
 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
diff --git a/src/config.h.in b/src/config.h.in
index beb9142..c48e8ea 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -428,6 +428,12 @@
 /* Version number of package */
 #undef VERSION
 
+/* Number of bits in a file offset, on hosts where this is settable. */
+#undef _FILE_OFFSET_BITS
+
+/* Define for large files, on AIX-style hosts. */
+#undef _LARGE_FILES
+
 /* Define to empty if `const' does not conform to ANSI C. */
 #undef const
 
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
index 8adcede..1cbecd0 100644
--- a/src/pcre2.h.generic
+++ b/src/pcre2.h.generic
@@ -42,9 +42,9 @@
 /* The current PCRE version information. */
 
 #define PCRE2_MAJOR           10
-#define PCRE2_MINOR           40
+#define PCRE2_MINOR           42
 #define PCRE2_PRERELEASE      
-#define PCRE2_DATE            2022-04-14
+#define PCRE2_DATE            2022-12-11
 
 /* When an application links to a PCRE DLL in Windows, the symbols that are
 imported have to be identified as such. When building PCRE2, the appropriate
@@ -572,19 +572,19 @@
 /* Functions for manipulating contexts. */
 
 #define PCRE2_GENERAL_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \
-  *pcre2_general_context_copy(pcre2_general_context *); \
-PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \
-  *pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
+PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
+  pcre2_general_context_copy(pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
+  pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
     void (*)(void *, void *), void *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_general_context_free(pcre2_general_context *);
 
 #define PCRE2_COMPILE_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \
-  *pcre2_compile_context_copy(pcre2_compile_context *); \
-PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \
-  *pcre2_compile_context_create(pcre2_general_context *);\
+PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \
+  pcre2_compile_context_copy(pcre2_compile_context *); \
+PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \
+  pcre2_compile_context_create(pcre2_general_context *);\
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_compile_context_free(pcre2_compile_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@@ -604,10 +604,10 @@
     int (*)(uint32_t, void *), void *);
 
 #define PCRE2_MATCH_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \
-  *pcre2_match_context_copy(pcre2_match_context *); \
-PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \
-  *pcre2_match_context_create(pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \
+  pcre2_match_context_copy(pcre2_match_context *); \
+PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \
+  pcre2_match_context_create(pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_match_context_free(pcre2_match_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@@ -631,10 +631,10 @@
     void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *);
 
 #define PCRE2_CONVERT_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \
-  *pcre2_convert_context_copy(pcre2_convert_context *); \
-PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \
-  *pcre2_convert_context_create(pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
+  pcre2_convert_context_copy(pcre2_convert_context *); \
+PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
+  pcre2_convert_context_create(pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_convert_context_free(pcre2_convert_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@@ -646,15 +646,15 @@
 /* Functions concerned with compiling a pattern to PCRE internal code. */
 
 #define PCRE2_COMPILE_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
-  *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \
+PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
+  pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \
     pcre2_compile_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_code_free(pcre2_code *); \
-PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
-  *pcre2_code_copy(const pcre2_code *); \
-PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
-  *pcre2_code_copy_with_tables(const pcre2_code *);
+PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
+  pcre2_code_copy(const pcre2_code *); \
+PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
+  pcre2_code_copy_with_tables(const pcre2_code *);
 
 
 /* Functions that give information about a compiled pattern. */
@@ -670,10 +670,10 @@
 /* Functions for running a match and inspecting the result. */
 
 #define PCRE2_MATCH_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \
-  *pcre2_match_data_create(uint32_t, pcre2_general_context *); \
-PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \
-  *pcre2_match_data_create_from_pattern(const pcre2_code *, \
+PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
+  pcre2_match_data_create(uint32_t, pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
+  pcre2_match_data_create_from_pattern(const pcre2_code *, \
     pcre2_general_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
   pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
@@ -689,8 +689,8 @@
   pcre2_get_match_data_size(pcre2_match_data *); \
 PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \
   pcre2_get_ovector_count(pcre2_match_data *); \
-PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
-  *pcre2_get_ovector_pointer(pcre2_match_data *); \
+PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \
+  pcre2_get_ovector_pointer(pcre2_match_data *); \
 PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
   pcre2_get_startchar(pcre2_match_data *);
 
@@ -770,8 +770,8 @@
     uint32_t, pcre2_match_data *, pcre2_match_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_jit_free_unused_memory(pcre2_general_context *); \
-PCRE2_EXP_DECL pcre2_jit_stack PCRE2_CALL_CONVENTION \
-  *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_jit_stack *PCRE2_CALL_CONVENTION \
+  pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
@@ -783,8 +783,8 @@
 #define PCRE2_OTHER_FUNCTIONS \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
   pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \
-PCRE2_EXP_DECL const uint8_t PCRE2_CALL_CONVENTION \
-  *pcre2_maketables(pcre2_general_context *); \
+PCRE2_EXP_DECL const uint8_t *PCRE2_CALL_CONVENTION \
+  pcre2_maketables(pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_maketables_free(pcre2_general_context *, const uint8_t *);
 
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
index 19bd29e..7b8818d 100644
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@@ -572,19 +572,19 @@
 /* Functions for manipulating contexts. */
 
 #define PCRE2_GENERAL_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \
-  *pcre2_general_context_copy(pcre2_general_context *); \
-PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \
-  *pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
+PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
+  pcre2_general_context_copy(pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
+  pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
     void (*)(void *, void *), void *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_general_context_free(pcre2_general_context *);
 
 #define PCRE2_COMPILE_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \
-  *pcre2_compile_context_copy(pcre2_compile_context *); \
-PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \
-  *pcre2_compile_context_create(pcre2_general_context *);\
+PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \
+  pcre2_compile_context_copy(pcre2_compile_context *); \
+PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \
+  pcre2_compile_context_create(pcre2_general_context *);\
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_compile_context_free(pcre2_compile_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@@ -604,10 +604,10 @@
     int (*)(uint32_t, void *), void *);
 
 #define PCRE2_MATCH_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \
-  *pcre2_match_context_copy(pcre2_match_context *); \
-PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \
-  *pcre2_match_context_create(pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \
+  pcre2_match_context_copy(pcre2_match_context *); \
+PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \
+  pcre2_match_context_create(pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_match_context_free(pcre2_match_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@@ -631,10 +631,10 @@
     void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *);
 
 #define PCRE2_CONVERT_CONTEXT_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \
-  *pcre2_convert_context_copy(pcre2_convert_context *); \
-PCRE2_EXP_DECL pcre2_convert_context PCRE2_CALL_CONVENTION \
-  *pcre2_convert_context_create(pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
+  pcre2_convert_context_copy(pcre2_convert_context *); \
+PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
+  pcre2_convert_context_create(pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_convert_context_free(pcre2_convert_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@@ -646,15 +646,15 @@
 /* Functions concerned with compiling a pattern to PCRE internal code. */
 
 #define PCRE2_COMPILE_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
-  *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \
+PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
+  pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \
     pcre2_compile_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_code_free(pcre2_code *); \
-PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
-  *pcre2_code_copy(const pcre2_code *); \
-PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
-  *pcre2_code_copy_with_tables(const pcre2_code *);
+PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
+  pcre2_code_copy(const pcre2_code *); \
+PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
+  pcre2_code_copy_with_tables(const pcre2_code *);
 
 
 /* Functions that give information about a compiled pattern. */
@@ -670,10 +670,10 @@
 /* Functions for running a match and inspecting the result. */
 
 #define PCRE2_MATCH_FUNCTIONS \
-PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \
-  *pcre2_match_data_create(uint32_t, pcre2_general_context *); \
-PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \
-  *pcre2_match_data_create_from_pattern(const pcre2_code *, \
+PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
+  pcre2_match_data_create(uint32_t, pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
+  pcre2_match_data_create_from_pattern(const pcre2_code *, \
     pcre2_general_context *); \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
   pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
@@ -689,8 +689,8 @@
   pcre2_get_match_data_size(pcre2_match_data *); \
 PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \
   pcre2_get_ovector_count(pcre2_match_data *); \
-PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
-  *pcre2_get_ovector_pointer(pcre2_match_data *); \
+PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \
+  pcre2_get_ovector_pointer(pcre2_match_data *); \
 PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
   pcre2_get_startchar(pcre2_match_data *);
 
@@ -770,8 +770,8 @@
     uint32_t, pcre2_match_data *, pcre2_match_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_jit_free_unused_memory(pcre2_general_context *); \
-PCRE2_EXP_DECL pcre2_jit_stack PCRE2_CALL_CONVENTION \
-  *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
+PCRE2_EXP_DECL pcre2_jit_stack *PCRE2_CALL_CONVENTION \
+  pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
@@ -783,8 +783,8 @@
 #define PCRE2_OTHER_FUNCTIONS \
 PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
   pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \
-PCRE2_EXP_DECL const uint8_t PCRE2_CALL_CONVENTION \
-  *pcre2_maketables(pcre2_general_context *); \
+PCRE2_EXP_DECL const uint8_t *PCRE2_CALL_CONVENTION \
+  pcre2_maketables(pcre2_general_context *); \
 PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
   pcre2_maketables_free(pcre2_general_context *, const uint8_t *);
 
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index de259c9..edf7e82 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -1266,8 +1266,10 @@
 
 if (code != NULL)
   {
+#ifdef SUPPORT_JIT
   if (code->executable_jit != NULL)
     PRIV(jit_free)(code->executable_jit, &code->memctl);
+#endif
 
   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
     {
@@ -2687,7 +2689,7 @@
 while (ptr < ptrend)
   {
   int prev_expect_cond_assert;
-  uint32_t min_repeat, max_repeat;
+  uint32_t min_repeat = 0, max_repeat = 0;
   uint32_t set, unset, *optset;
   uint32_t terminator;
   uint32_t prev_meta_quantifier;
@@ -8552,7 +8554,7 @@
             op == OP_SCBRA || op == OP_SCBRAPOS)
      {
      int n = GET2(scode, 1+LINK_SIZE);
-     int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
+     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
      }
 
@@ -10620,4 +10622,10 @@
 goto EXIT;
 }
 
+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND   /* Field containing processed string end */
+
 /* End of pcre2_compile.c */
diff --git a/src/pcre2_context.c b/src/pcre2_context.c
index f904a49..8e05ede 100644
--- a/src/pcre2_context.c
+++ b/src/pcre2_context.c
@@ -7,7 +7,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -443,8 +443,11 @@
 return 0;
 }
 
-/* This function became obsolete at release 10.30. It is kept as a synonym for
-backwards compatibility. */
+/* These functions became obsolete at release 10.30. The first is kept as a
+synonym for backwards compatibility. The second now does nothing. Exclude both
+from coverage reports. */
+
+/* LCOV_EXCL_START */
 
 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t limit)
@@ -464,6 +467,9 @@
 return 0;
 }
 
+/* LCOV_EXCL_STOP */
+
+
 /* ------------ Convert context ------------ */
 
 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c
index d45b6fe..36466e4 100644
--- a/src/pcre2_convert.c
+++ b/src/pcre2_convert.c
@@ -7,7 +7,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -65,9 +65,8 @@
 #define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
 #define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS
 
-/* States for range and POSIX processing */
+/* States for POSIX processing */
 
-enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED };
 enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,
        POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };
 
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index d29130f..b16e594 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -350,7 +350,7 @@
 */
 
 static int
-do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
+do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
   PCRE2_SIZE *lengthptr)
 {
@@ -2799,7 +2799,7 @@
             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
           {
           PCRE2_SIZE callout_length;
-          rrc = do_callout(code, offsets, current_subject, ptr, mb,
+          rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
             1 + LINK_SIZE, &callout_length);
           if (rrc < 0) return rrc;                 /* Abandon */
           if (rrc > 0) break;                      /* Fail this thread */
@@ -3196,7 +3196,7 @@
       case OP_CALLOUT_STR:
         {
         PCRE2_SIZE callout_length;
-        rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
+        rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
           &callout_length);
         if (rrc < 0) return rrc;   /* Abandon */
         if (rrc == 0)
@@ -4057,4 +4057,10 @@
 return rc;
 }
 
+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND   /* Field containing processed string end */
+
 /* End of pcre2_dfa_match.c */
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index fe7a0e0..92dd313 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -220,18 +220,17 @@
 
 #define COMPILE_ERROR_BASE 100
 
-/* The initial frames vector for remembering backtracking points in
-pcre2_match() is allocated on the system stack, of this size (bytes). The size
-must be a multiple of sizeof(PCRE2_SPTR) in all environments, so making it a
-multiple of 8 is best. Typical frame sizes are a few hundred bytes (it depends
-on the number of capturing parentheses) so 20KiB handles quite a few frames. A
-larger vector on the heap is obtained for patterns that need more frames. The
-maximum size of this can be limited. */
+/* The initial frames vector for remembering pcre2_match() backtracking points
+is allocated on the heap, of this size (bytes) or ten times the frame size if
+larger, unless the heap limit is smaller. Typical frame sizes are a few hundred
+bytes (it depends on the number of capturing parentheses) so 20KiB handles
+quite a few frames. A larger vector on the heap is obtained for matches that
+need more frames, subject to the heap limit. */
 
 #define START_FRAMES_SIZE 20480
 
-/* Similarly, for DFA matching, an initial internal workspace vector is
-allocated on the stack. */
+/* For DFA matching, an initial internal workspace vector is allocated on the
+stack. The heap is used only if this turns out to be too small. */
 
 #define DFA_START_RWS_SIZE 30720
 
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
index f8a3d25..390e737 100644
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@@ -7,7 +7,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -649,19 +649,23 @@
 subpatterns is 65535 we must allow for 65536 strings to include the overall
 match. (See also the heapframe structure below.) */
 
+struct heapframe;  /* Forward reference */
+
 typedef struct pcre2_real_match_data {
-  pcre2_memctl     memctl;
-  const pcre2_real_code *code;    /* The pattern used for the match */
-  PCRE2_SPTR       subject;       /* The subject that was matched */
-  PCRE2_SPTR       mark;          /* Pointer to last mark */
-  PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
-  PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
-  PCRE2_SIZE       startchar;     /* Offset to starting code unit */
-  uint8_t          matchedby;     /* Type of match (normal, JIT, DFA) */
-  uint8_t          flags;         /* Various flags */
-  uint16_t         oveccount;     /* Number of pairs */
-  int              rc;            /* The return code from the match */
-  PCRE2_SIZE       ovector[131072]; /* Must be last in the structure */
+  pcre2_memctl     memctl;           /* Memory control fields */
+  const pcre2_real_code *code;       /* The pattern used for the match */
+  PCRE2_SPTR       subject;          /* The subject that was matched */
+  PCRE2_SPTR       mark;             /* Pointer to last mark */
+  struct heapframe *heapframes;      /* Backtracking frames heap memory */
+  PCRE2_SIZE       heapframes_size;  /* Malloc-ed size */
+  PCRE2_SIZE       leftchar;         /* Offset to leftmost code unit */
+  PCRE2_SIZE       rightchar;        /* Offset to rightmost code unit */
+  PCRE2_SIZE       startchar;        /* Offset to starting code unit */
+  uint8_t          matchedby;        /* Type of match (normal, JIT, DFA) */
+  uint8_t          flags;            /* Various flags */
+  uint16_t         oveccount;        /* Number of pairs */
+  int              rc;               /* The return code from the match */
+  PCRE2_SIZE       ovector[131072];  /* Must be last in the structure */
 } pcre2_real_match_data;
 
 
@@ -854,10 +858,6 @@
 
 typedef struct match_block {
   pcre2_memctl memctl;            /* For general use */
-  PCRE2_SIZE frame_vector_size;   /* Size of a backtracking frame */
-  heapframe *match_frames;        /* Points to vector of frames */
-  heapframe *match_frames_top;    /* Points after the end of the vector */
-  heapframe *stack_frames;        /* The original vector on the stack */
   PCRE2_SIZE heap_limit;          /* As it says */
   uint32_t match_limit;           /* As it says */
   uint32_t match_limit_depth;     /* As it says */
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index d726c3c..0afd27c 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -542,7 +542,7 @@
 #undef CMP
 
 /* Used for accessing the elements of the stack. */
-#define STACK(i)      ((i) * (int)sizeof(sljit_sw))
+#define STACK(i)      ((i) * SSIZE_OF(sw))
 
 #ifdef SLJIT_PREF_SHIFT_REG
 #if SLJIT_PREF_SHIFT_REG == SLJIT_R2
@@ -590,8 +590,8 @@
 group contains the start / end character pointers, and the second is
 the start pointers when the end of the capturing group has not yet reached. */
 #define OVECTOR_START    (common->ovector_start)
-#define OVECTOR(i)       (OVECTOR_START + (i) * (sljit_sw)sizeof(sljit_sw))
-#define OVECTOR_PRIV(i)  (common->cbra_ptr + (i) * (sljit_sw)sizeof(sljit_sw))
+#define OVECTOR(i)       (OVECTOR_START + (i) * SSIZE_OF(sw))
+#define OVECTOR_PRIV(i)  (common->cbra_ptr + (i) * SSIZE_OF(sw))
 #define PRIVATE_DATA(cc) (common->private_data_ptrs[(cc) - common->start])
 
 #if PCRE2_CODE_UNIT_WIDTH == 8
@@ -2151,9 +2151,9 @@
       {
       OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -OVECTOR(0));
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       setsom_found = TRUE;
       }
     cc += 1;
@@ -2168,9 +2168,9 @@
       {
       OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->mark_ptr);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       setmark_found = TRUE;
       }
     cc += 1 + 2 + cc[1];
@@ -2181,27 +2181,27 @@
       {
       OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -OVECTOR(0));
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       setsom_found = TRUE;
       }
     if (common->mark_ptr != 0 && !setmark_found)
       {
       OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->mark_ptr);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       setmark_found = TRUE;
       }
     if (common->capture_last_ptr != 0 && !capture_last_found)
       {
       OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->capture_last_ptr);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       capture_last_found = TRUE;
       }
     cc += 1 + LINK_SIZE;
@@ -2215,20 +2215,20 @@
       {
       OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, -common->capture_last_ptr);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-      stackpos -= (int)sizeof(sljit_sw);
+      stackpos -= SSIZE_OF(sw);
       capture_last_found = TRUE;
       }
     offset = (GET2(cc, 1 + LINK_SIZE)) << 1;
     OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, SLJIT_IMM, OVECTOR(offset));
-    stackpos -= (int)sizeof(sljit_sw);
+    stackpos -= SSIZE_OF(sw);
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset));
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1));
     OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP1, 0);
-    stackpos -= (int)sizeof(sljit_sw);
+    stackpos -= SSIZE_OF(sw);
     OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), stackpos, TMP2, 0);
-    stackpos -= (int)sizeof(sljit_sw);
+    stackpos -= SSIZE_OF(sw);
 
     cc += 1 + LINK_SIZE + IMM2_SIZE;
     break;
@@ -3144,7 +3144,7 @@
 DEFINE_COMPILER;
 
 SLJIT_ASSERT(size > 0);
-OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw));
+OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * SSIZE_OF(sw));
 #ifdef DESTROY_REGISTERS
 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 12345);
 OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
@@ -3160,7 +3160,7 @@
 DEFINE_COMPILER;
 
 SLJIT_ASSERT(size > 0);
-OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw));
+OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * SSIZE_OF(sw));
 }
 
 static sljit_uw * allocate_read_only_data(compiler_common *common, sljit_uw size)
@@ -3200,12 +3200,12 @@
   }
 else
   {
-  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS)
+  if (sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw)) == SLJIT_SUCCESS)
     {
     GET_LOCAL_BASE(SLJIT_R1, 0, OVECTOR_START);
     OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, length - 1);
     loop = LABEL();
-    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));
+    sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, SLJIT_R0, SLJIT_MEM1(SLJIT_R1), sizeof(sljit_sw));
     OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, 1);
     JUMPTO(SLJIT_NOT_ZERO, loop);
     }
@@ -3261,8 +3261,8 @@
 loop = LABEL();
 OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), 0, src, 0);
 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 3 * sizeof(sljit_sw));
-OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), -2 * (sljit_sw)sizeof(sljit_sw), src, 0);
-OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), -1 * (sljit_sw)sizeof(sljit_sw), src, 0);
+OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), -2 * SSIZE_OF(sw), src, 0);
+OP1(SLJIT_MOV, SLJIT_MEM1(TMP1), -1 * SSIZE_OF(sw), src, 0);
 CMPTO(SLJIT_LESS, TMP1, 0, TMP2, 0, loop);
 
 if (uncleared_size >= sizeof(sljit_sw))
@@ -3289,12 +3289,12 @@
   }
 else
   {
-  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw)) == SLJIT_SUCCESS)
+  if (sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw)) == SLJIT_SUCCESS)
     {
     GET_LOCAL_BASE(TMP2, 0, OVECTOR_START + sizeof(sljit_sw));
     OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_IMM, length - 2);
     loop = LABEL();
-    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw));
+    sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_STORE | SLJIT_MEM_PRE, TMP1, SLJIT_MEM1(TMP2), sizeof(sljit_sw));
     OP2(SLJIT_SUB | SLJIT_SET_Z, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 1);
     JUMPTO(SLJIT_NOT_ZERO, loop);
     }
@@ -3386,7 +3386,7 @@
   OP2(SLJIT_ADD, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_IMM, SLJIT_OFFSETOF(pcre2_match_data, ovector) - sizeof(PCRE2_SIZE));
   }
 
-has_pre = sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw)) == SLJIT_SUCCESS;
+has_pre = sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw)) == SLJIT_SUCCESS;
 
 GET_LOCAL_BASE(SLJIT_S0, 0, OVECTOR_START - (has_pre ? sizeof(sljit_sw) : 0));
 OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(HAS_VIRTUAL_REGISTERS ? SLJIT_R0 : ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, begin));
@@ -3394,7 +3394,7 @@
 loop = LABEL();
 
 if (has_pre)
-  sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
+  sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_S1, SLJIT_MEM1(SLJIT_S0), sizeof(sljit_sw));
 else
   {
   OP1(SLJIT_MOV, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_S0), 0);
@@ -3417,14 +3417,14 @@
 /* Calculate the return value, which is the maximum ovector value. */
 if (topbracket > 1)
   {
-  if (sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw))) == SLJIT_SUCCESS)
+  if (sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * SSIZE_OF(sw))) == SLJIT_SUCCESS)
     {
     GET_LOCAL_BASE(SLJIT_R0, 0, OVECTOR_START + topbracket * 2 * sizeof(sljit_sw));
     OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, topbracket + 1);
 
     /* OVECTOR(0) is never equal to SLJIT_S2. */
     loop = LABEL();
-    sljit_emit_mem(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * (sljit_sw)sizeof(sljit_sw)));
+    sljit_emit_mem_update(compiler, SLJIT_MOV | SLJIT_MEM_PRE, SLJIT_R2, SLJIT_MEM1(SLJIT_R0), -(2 * SSIZE_OF(sw)));
     OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
     CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
     OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
@@ -3437,7 +3437,7 @@
     /* OVECTOR(0) is never equal to SLJIT_S2. */
     loop = LABEL();
     OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_R0), 0);
-    OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2 * (sljit_sw)sizeof(sljit_sw));
+    OP2(SLJIT_SUB, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 2 * SSIZE_OF(sw));
     OP2(SLJIT_SUB, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1);
     CMPTO(SLJIT_EQUAL, SLJIT_R2, 0, SLJIT_S2, 0, loop);
     OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_R1, 0);
@@ -4652,8 +4652,8 @@
   /* All newlines are ascii, just skip intermediate octets. */
   jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
   loop = LABEL();
-  if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, TMP2, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)) == SLJIT_SUCCESS)
-    sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, TMP2, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+  if (sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, TMP2, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)) == SLJIT_SUCCESS)
+    sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_POST, TMP2, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
   else
     {
     OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
@@ -5886,7 +5886,7 @@
       while (j < i)
         {
         b_pri = chars[j].last_count;
-        if (b_pri > 2 && a_pri + b_pri >= max_pri)
+        if (b_pri > 2 && (sljit_u32)a_pri + (sljit_u32)b_pri >= max_pri)
           {
           b1 = chars[j].chars[0];
           b2 = chars[j].chars[1];
@@ -6572,21 +6572,21 @@
 
 /* Drop frames until we reach STACK_TOP. */
 mainloop = LABEL();
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), -sizeof(sljit_sw));
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(STACK_TOP), -SSIZE_OF(sw));
 jump = CMP(SLJIT_SIG_LESS_EQUAL, TMP2, 0, SLJIT_IMM, 0);
 
 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
 if (HAS_VIRTUAL_REGISTERS)
   {
-  OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
-  OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), SLJIT_MEM1(STACK_TOP), -(3 * sizeof(sljit_sw)));
-  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * sizeof(sljit_sw));
+  OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * SSIZE_OF(sw)));
+  OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), SLJIT_MEM1(STACK_TOP), -(3 * SSIZE_OF(sw)));
+  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * SSIZE_OF(sw));
   }
 else
   {
-  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
-  OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(3 * sizeof(sljit_sw)));
-  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * sizeof(sljit_sw));
+  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), -(2 * SSIZE_OF(sw)));
+  OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(3 * SSIZE_OF(sw)));
+  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 3 * SSIZE_OF(sw));
   OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP1, 0);
   GET_LOCAL_BASE(TMP1, 0, 0);
   OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), sizeof(sljit_sw), TMP3, 0);
@@ -6603,13 +6603,13 @@
 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
 if (HAS_VIRTUAL_REGISTERS)
   {
-  OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
-  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
+  OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, SLJIT_MEM1(STACK_TOP), -(2 * SSIZE_OF(sw)));
+  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * SSIZE_OF(sw));
   }
 else
   {
-  OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(2 * sizeof(sljit_sw)));
-  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * sizeof(sljit_sw));
+  OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(STACK_TOP), -(2 * SSIZE_OF(sw)));
+  OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2 * SSIZE_OF(sw));
   OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), 0, TMP3, 0);
   }
 JUMPTO(SLJIT_JUMP, mainloop);
@@ -7159,11 +7159,11 @@
   OP1(SLJIT_MOV, RETURN_ADDR, 0, char2_reg, 0);
   }
 
-if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+if (sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
   {
   label = LABEL();
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
   jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0);
   OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
   JUMPTO(SLJIT_NOT_ZERO, label);
@@ -7171,14 +7171,14 @@
   JUMPHERE(jump);
   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
   }
-else if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+else if (sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
   {
   OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
   OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 
   label = LABEL();
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
   jump = CMP(SLJIT_NOT_EQUAL, char1_reg, 0, char2_reg, 0);
   OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
   JUMPTO(SLJIT_NOT_ZERO, label);
@@ -7232,9 +7232,9 @@
   lcc_table = TMP3;
   }
 
-if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+if (sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
   opt_type = 1;
-else if (sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
+else if (sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_SUPP | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1)) == SLJIT_SUCCESS)
   opt_type = 2;
 
 sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0);
@@ -7253,8 +7253,8 @@
 if (opt_type == 1)
   {
   label = LABEL();
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_POST, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_POST, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
   }
 else if (opt_type == 2)
   {
@@ -7262,8 +7262,8 @@
   OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 
   label = LABEL();
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
-  sljit_emit_mem(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char1_reg, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+  sljit_emit_mem_update(compiler, MOV_UCHAR | SLJIT_MEM_PRE, char2_reg, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
   }
 else
   {
@@ -9689,7 +9689,7 @@
 return cc + 1 + LINK_SIZE;
 }
 
-static sljit_s32 SLJIT_FUNC do_callout(struct jit_arguments *arguments, pcre2_callout_block *callout_block, PCRE2_SPTR *jit_ovector)
+static sljit_s32 SLJIT_FUNC do_callout_jit(struct jit_arguments *arguments, pcre2_callout_block *callout_block, PCRE2_SPTR *jit_ovector)
 {
 PCRE2_SPTR begin;
 PCRE2_SIZE *ovector;
@@ -9756,7 +9756,7 @@
 sljit_sw value1;
 sljit_sw value2;
 sljit_sw value3;
-sljit_uw callout_arg_size = (common->re->top_bracket + 1) * 2 * sizeof(sljit_sw);
+sljit_uw callout_arg_size = (common->re->top_bracket + 1) * 2 * SSIZE_OF(sw);
 
 PUSH_BACKTRACK(sizeof(backtrack_common), cc, NULL);
 
@@ -9806,7 +9806,7 @@
 /* SLJIT_R0 = arguments */
 OP1(SLJIT_MOV, SLJIT_R1, 0, STACK_TOP, 0);
 GET_LOCAL_BASE(SLJIT_R2, 0, OVECTOR_START);
-sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(32, W, W, W), SLJIT_IMM, SLJIT_FUNC_ADDR(do_callout));
+sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3(32, W, W, W), SLJIT_IMM, SLJIT_FUNC_ADDR(do_callout_jit));
 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
 free_stack(common, callout_arg_size);
 
@@ -11451,7 +11451,7 @@
 int private_data_ptr = PRIVATE_DATA(cc);
 int base = (private_data_ptr == 0) ? SLJIT_MEM1(STACK_TOP) : SLJIT_MEM1(SLJIT_SP);
 int offset0 = (private_data_ptr == 0) ? STACK(0) : private_data_ptr;
-int offset1 = (private_data_ptr == 0) ? STACK(1) : private_data_ptr + (int)sizeof(sljit_sw);
+int offset1 = (private_data_ptr == 0) ? STACK(1) : private_data_ptr + SSIZE_OF(sw);
 int tmp_base, tmp_offset;
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
 BOOL use_tmp;
@@ -11517,19 +11517,19 @@
     }
   }
 else if (exact == 1)
-  {
   compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, TRUE);
 
-  if (early_fail_type == type_fail_range)
-    {
-    OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr);
-    OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr + (int)sizeof(sljit_sw));
-    OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, TMP2, 0);
-    OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, TMP2, 0);
-    add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_LESS_EQUAL, TMP2, 0, TMP1, 0));
+if (early_fail_type == type_fail_range)
+  {
+  /* Range end first, followed by range start. */
+  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr);
+  OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), early_fail_ptr + SSIZE_OF(sw));
+  OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, TMP2, 0);
+  OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, TMP2, 0);
+  add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_LESS_EQUAL, TMP2, 0, TMP1, 0));
 
-    OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr + (int)sizeof(sljit_sw), STR_PTR, 0);
-    }
+  OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr, STR_PTR, 0);
+  OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), early_fail_ptr + SSIZE_OF(sw), STR_PTR, 0);
   }
 
 switch(opcode)
@@ -12428,7 +12428,7 @@
 int private_data_ptr = PRIVATE_DATA(cc);
 int base = (private_data_ptr == 0) ? SLJIT_MEM1(STACK_TOP) : SLJIT_MEM1(SLJIT_SP);
 int offset0 = (private_data_ptr == 0) ? STACK(0) : private_data_ptr;
-int offset1 = (private_data_ptr == 0) ? STACK(1) : private_data_ptr + (int)sizeof(sljit_sw);
+int offset1 = (private_data_ptr == 0) ? STACK(1) : private_data_ptr + SSIZE_OF(sw);
 
 cc = get_iterator_parameters(common, cc, &opcode, &type, &max, &exact, &end);
 
@@ -14148,7 +14148,7 @@
 if (common->currententry != NULL)
   {
   /* A free bit for each private data. */
-  common->recurse_bitset_size = ((private_data_size / (int)sizeof(sljit_sw)) + 7) >> 3;
+  common->recurse_bitset_size = ((private_data_size / SSIZE_OF(sw)) + 7) >> 3;
   SLJIT_ASSERT(common->recurse_bitset_size > 0);
   common->recurse_bitset = (sljit_u8*)SLJIT_MALLOC(common->recurse_bitset_size, allocator_data);;
 
@@ -14384,7 +14384,7 @@
 pcre2_real_code *re = (pcre2_real_code *)code;
 #ifdef SUPPORT_JIT
 executable_functions *functions;
-static int executable_allocator_is_working = 0;
+static int executable_allocator_is_working = -1;
 #endif
 
 if (code == NULL)
@@ -14447,23 +14447,21 @@
 
 if ((re->flags & PCRE2_NOJIT) != 0) return 0;
 
-if (executable_allocator_is_working == 0)
+if (executable_allocator_is_working == -1)
   {
   /* Checks whether the executable allocator is working. This check
      might run multiple times in multi-threaded environments, but the
      result should not be affected by it. */
   void *ptr = SLJIT_MALLOC_EXEC(32, NULL);
-
-  executable_allocator_is_working = -1;
-
   if (ptr != NULL)
     {
     SLJIT_FREE_EXEC(((sljit_u8*)(ptr)) + SLJIT_EXEC_OFFSET(ptr), NULL);
     executable_allocator_is_working = 1;
     }
+  else executable_allocator_is_working = 0;
   }
 
-if (executable_allocator_is_working < 0)
+if (!executable_allocator_is_working)
   return PCRE2_ERROR_NOMEMORY;
 
 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
diff --git a/src/pcre2_jit_misc.c b/src/pcre2_jit_misc.c
index e57afad..bb6a558 100644
--- a/src/pcre2_jit_misc.c
+++ b/src/pcre2_jit_misc.c
@@ -110,8 +110,10 @@
 (void)gcontext;     /* Suppress warning */
 #else  /* SUPPORT_JIT */
 SLJIT_UNUSED_ARG(gcontext);
+#if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR)
 sljit_free_unused_memory_exec();
-#endif  /* SUPPORT_JIT */
+#endif /* SLJIT_EXECUTABLE_ALLOCATOR */
+#endif /* SUPPORT_JIT */
 }
 
 
diff --git a/src/pcre2_jit_neon_inc.h b/src/pcre2_jit_neon_inc.h
index 150da29..165602e 100644
--- a/src/pcre2_jit_neon_inc.h
+++ b/src/pcre2_jit_neon_inc.h
@@ -183,6 +183,8 @@
 #endif
 
 #if defined(FFCPS)
+if (str_ptr >= str_end)
+  return NULL;
 sljit_u8 *p1 = str_ptr - diff;
 #endif
 sljit_s32 align_offset = ((uint64_t)str_ptr & 0xf);
@@ -327,7 +329,7 @@
     return NULL;
 
 #if defined(FF_UTF)
-  if (utf_continue(str_ptr + IN_UCHARS(-offs1)))
+  if (utf_continue((PCRE2_SPTR)str_ptr - offs1))
     {
     /* Not a match. */
     str_ptr += IN_UCHARS(1);
diff --git a/src/pcre2_jit_simd_inc.h b/src/pcre2_jit_simd_inc.h
index d99cfc5..1a5ce4e 100644
--- a/src/pcre2_jit_simd_inc.h
+++ b/src/pcre2_jit_simd_inc.h
@@ -776,7 +776,7 @@
 } int_char;
 
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
-static SLJIT_INLINE int utf_continue(sljit_u8 *s)
+static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
 {
 #if PCRE2_CODE_UNIT_WIDTH == 8
 return (*s & 0xc0) == 0x80;
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
index bb141a0..81c9582 100644
--- a/src/pcre2_jit_test.c
+++ b/src/pcre2_jit_test.c
@@ -354,6 +354,7 @@
 	{ MU, A, 0, 0, "_[ab]+_*a", "_aa" },
 	{ MU, A, 0, 0, "#(A+)#\\d+", "#A#A#0" },
 	{ MU, A, 0, 0, "(?P<size>\\d+)m|M", "4M" },
+	{ M, PCRE2_NEWLINE_CRLF, 0, 0, "\\n?.+#", "\n,\n,#" },
 
 	/* Bracket repeats with limit. */
 	{ MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 6354e1b..168b9fa 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -204,6 +204,7 @@
   P           a previous frame of interest
   frame_size  the frame size
   mb          points to the match block
+  match_data  points to the match data block
   s           identification text
 
 Returns:    nothing
@@ -211,7 +212,7 @@
 
 static void
 display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
-  match_block *mb, const char *s, ...)
+  match_block *mb, pcre2_match_data *match_data, const char *s, ...)
 {
 uint32_t i;
 heapframe *Q;
@@ -223,10 +224,10 @@
 va_end(ap);
 
 if (P != NULL) fprintf(f, " P=%lu",
-  ((char *)P - (char *)(mb->match_frames))/frame_size);
+  ((char *)P - (char *)(match_data->heapframes))/frame_size);
 fprintf(f, "\n");
 
-for (i = 0, Q = mb->match_frames;
+for (i = 0, Q = match_data->heapframes;
      Q <= F;
      i++, Q = (heapframe *)((char *)Q + frame_size))
   {
@@ -490,10 +491,16 @@
 match() recursively, but this ran substantially slower. The current version is
 a refactoring that uses a vector of frames to remember backtracking points.
 This runs no slower, and possibly even a bit faster than the original recursive
-implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
-50 frames) is allocated on the system stack. If this is not big enough, the
-heap is used for a larger vector.
+implementation.
 
+At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50
+frames) was allocated on the system stack. If this was not big enough, the heap
+was used for a larger vector. However, it turns out that there are environments
+where taking as little as 20KiB from the system stack is an embarrassment.
+After another refactoring, the heap is used exclusively, but a pointer the
+frames vector and its size are cached in the match_data block, so that there is
+no new memory allocation if the same match_data block is used for multiple
+matches (unless the frames vector has to be extended).
 *******************************************************************************
 ******************************************************************************/
 
@@ -566,10 +573,9 @@
 Arguments:
    start_eptr   starting character in subject
    start_ecode  starting position in compiled code
-   ovector      pointer to the final output vector
-   oveccount    number of pairs in ovector
    top_bracket  number of capturing parentheses in the pattern
    frame_size   size of each backtracking frame
+   match_data   pointer to the match_data block
    mb           pointer to "static" variables block
 
 Returns:        MATCH_MATCH if matched            )  these values are >= 0
@@ -580,17 +586,19 @@
 */
 
 static int
-match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
-  uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
-  match_block *mb)
+match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,
+  PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
 {
 /* Frame-handling variables */
 
 heapframe *F;           /* Current frame pointer */
 heapframe *N = NULL;    /* Temporary frame pointers */
 heapframe *P = NULL;
+
+heapframe *frames_top;  /* End of frames vector */
 heapframe *assert_accept_frame = NULL;  /* For passing back a frame with captures */
-PCRE2_SIZE frame_copy_size;     /* Amount to copy when creating a new frame */
+PCRE2_SIZE heapframes_size;   /* Usable size of frames vector */
+PCRE2_SIZE frame_copy_size;   /* Amount to copy when creating a new frame */
 
 /* Local variables that do not need to be preserved over calls to RRMATCH(). */
 
@@ -627,10 +635,14 @@
 
 frame_copy_size = frame_size - offsetof(heapframe, eptr);
 
-/* Set up the first current frame at the start of the vector, and initialize
-fields that are not reset for new frames. */
+/* Set up the first frame and the end of the frames vector. We set the local
+heapframes_size to the usuable amount of the vector, that is, a whole number of
+frames. */
 
-F = mb->match_frames;
+F = match_data->heapframes;
+heapframes_size = (match_data->heapframes_size / frame_size) * frame_size;
+frames_top = (heapframe *)((char *)F + heapframes_size);
+
 Frdepth = 0;                        /* "Recursion" depth */
 Fcapture_last = 0;                  /* Number of most recent capture */
 Fcurrent_recurse = RECURSE_UNSET;   /* Not pattern recursing. */
@@ -646,34 +658,35 @@
 
 MATCH_RECURSE:
 
-/* Set up a new backtracking frame. If the vector is full, get a new one
-on the heap, doubling the size, but constrained by the heap limit. */
+/* Set up a new backtracking frame. If the vector is full, get a new one,
+doubling the size, but constrained by the heap limit (which is in KiB). */
 
 N = (heapframe *)((char *)F + frame_size);
-if (N >= mb->match_frames_top)
+if (N >= frames_top)
   {
-  PCRE2_SIZE newsize = mb->frame_vector_size * 2;
   heapframe *new;
+  PCRE2_SIZE newsize = match_data->heapframes_size * 2;
 
-  if ((newsize / 1024) > mb->heap_limit)
+  if (newsize > mb->heap_limit)
     {
-    PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
-    if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
+    PCRE2_SIZE maxsize = (mb->heap_limit/frame_size) * frame_size;
+    if (match_data->heapframes_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
     newsize = maxsize;
     }
 
-  new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
+  new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
-  memcpy(new, mb->match_frames, mb->frame_vector_size);
+  memcpy(new, match_data->heapframes, heapframes_size);
 
-  F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
+  F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes));
   N = (heapframe *)((char *)F + frame_size);
 
-  if (mb->match_frames != mb->stack_frames)
-    mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
-  mb->match_frames = new;
-  mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
-  mb->frame_vector_size = newsize;
+  match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
+  match_data->heapframes = new;
+  match_data->heapframes_size = newsize;
+
+  heapframes_size = (newsize / frame_size) * frame_size;
+  frames_top = (heapframe *)((char *)new + heapframes_size);
   }
 
 #ifdef DEBUG_SHOW_RMATCH
@@ -731,7 +744,7 @@
 
 if (group_frame_type != 0)
   {
-  Flast_group_offset = (char *)F - (char *)mb->match_frames;
+  Flast_group_offset = (char *)F - (char *)match_data->heapframes;
   if (GF_IDMASK(group_frame_type) == GF_RECURSE)
     Fcurrent_recurse = GF_DATAMASK(group_frame_type);
   group_frame_type = 0;
@@ -773,7 +786,7 @@
       for(;;)
         {
         if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
-        N = (heapframe *)((char *)mb->match_frames + offset);
+        N = (heapframe *)((char *)match_data->heapframes + offset);
         P = (heapframe *)((char *)N - frame_size);
         if (N->group_frame_type == (GF_CAPTURE | number)) break;
         offset = P->last_group_offset;
@@ -811,7 +824,7 @@
       for(;;)
         {
         if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
-        N = (heapframe *)((char *)mb->match_frames + offset);
+        N = (heapframe *)((char *)match_data->heapframes + offset);
         P = (heapframe *)((char *)N - frame_size);
         if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
         offset = P->last_group_offset;
@@ -864,14 +877,15 @@
     mb->mark = Fmark;                    /* and the last success mark */
     if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
 
-    ovector[0] = Fstart_match - mb->start_subject;
-    ovector[1] = Feptr - mb->start_subject;
+    match_data->ovector[0] = Fstart_match - mb->start_subject;
+    match_data->ovector[1] = Feptr - mb->start_subject;
 
     /* Set i to the smaller of the sizes of the external and frame ovectors. */
 
-    i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
-    memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
-    while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
+    i = 2 * ((top_bracket + 1 > match_data->oveccount)?
+      match_data->oveccount : top_bracket + 1);
+    memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
+    while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;
     return MATCH_MATCH;  /* Note: NOT RRETURN */
 
 
@@ -5328,7 +5342,7 @@
       offset = Flast_group_offset;
       while (offset != PCRE2_UNSET)
         {
-        N = (heapframe *)((char *)mb->match_frames + offset);
+        N = (heapframe *)((char *)match_data->heapframes + offset);
         P = (heapframe *)((char *)N - frame_size);
         if (N->group_frame_type == (GF_RECURSE | number))
           {
@@ -5729,7 +5743,7 @@
 
     if (*bracode != OP_BRA && *bracode != OP_COND)
       {
-      N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
+      N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);
       P = (heapframe *)((char *)N - frame_size);
       Flast_group_offset = P->last_group_offset;
 
@@ -6346,6 +6360,7 @@
 #endif  /* SUPPORT_UNICODE */
 
 PCRE2_SIZE frame_size;
+PCRE2_SIZE heapframes_size;
 
 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE
 macro is used below, and it expects NLBLOCK to be defined as a pointer. */
@@ -6354,15 +6369,6 @@
 match_block actual_match_block;
 match_block *mb = &actual_match_block;
 
-/* Allocate an initial vector of backtracking frames on the stack. If this
-proves to be too small, it is replaced by a larger one on the heap. To get a
-vector of the size required that is aligned for pointers, allocate it as a
-vector of pointers. */
-
-PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
-    PCRE2_KEEP_UNINITIALIZED;
-mb->stack_frames = (heapframe *)stack_frames_vector;
-
 /* Recognize NULL, length 0 as an empty string. */
 
 if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
@@ -6793,15 +6799,11 @@
 vector at the end, whose size depends on the number of capturing parentheses in
 the pattern. It is not used at all if there are no capturing parentheses.
 
-  frame_size             is the total size of each frame
-  mb->frame_vector_size  is the total usable size of the vector (rounded down
-                           to a whole number of frames)
+  frame_size                   is the total size of each frame
+  match_data->heapframes       is the pointer to the frames vector
+  match_data->heapframes_size  is the total size of the vector
 
-The last of these is changed within the match() function if the frame vector
-has to be expanded. We therefore put it into the match block so that it is
-correct when calling match() more than once for non-anchored patterns.
-
-We must also pad frame_size for alignment to ensure subsequent frames are as
+We must pad the frame_size for alignment to ensure subsequent frames are as
 aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
 array, that does not guarantee it is suitably aligned for pointers, as some
 architectures have pointers that are larger than a size_t. */
@@ -6813,8 +6815,8 @@
 /* Limits set in the pattern override the match context only if they are
 smaller. */
 
-mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
-  mcontext->heap_limit : re->limit_heap;
+mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
+  mcontext->heap_limit : re->limit_heap) * 1024;
 
 mb->match_limit = (mcontext->match_limit < re->limit_match)?
   mcontext->match_limit : re->limit_match;
@@ -6823,35 +6825,40 @@
   mcontext->depth_limit : re->limit_depth;
 
 /* If a pattern has very many capturing parentheses, the frame size may be very
-large. Ensure that there are at least 10 available frames by getting an initial
-vector on the heap if necessary, except when the heap limit prevents this. Get
-fewer if possible. (The heap limit is in kibibytes.) */
+large. Set the initial frame vector size to ensure that there are at least 10
+available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
+greater than the heap limit, get as large a vector as possible. Always round
+the size to a multiple of the frame size. */
 
-if (frame_size <= START_FRAMES_SIZE/10)
+heapframes_size = frame_size * 10;
+if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
+if (heapframes_size > mb->heap_limit)
   {
-  mb->match_frames = mb->stack_frames;   /* Initial frame vector on the stack */
-  mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
+  if (frame_size > mb->heap_limit ) return PCRE2_ERROR_HEAPLIMIT;
+  heapframes_size = mb->heap_limit;
   }
-else
+
+/* If an existing frame vector in the match_data block is large enough, we can
+use it.Otherwise, free any pre-existing vector and get a new one. */
+
+if (match_data->heapframes_size < heapframes_size)
   {
-  mb->frame_vector_size = frame_size * 10;
-  if ((mb->frame_vector_size / 1024) > mb->heap_limit)
+  match_data->memctl.free(match_data->heapframes,
+    match_data->memctl.memory_data);
+  match_data->heapframes = match_data->memctl.malloc(heapframes_size,
+    match_data->memctl.memory_data);
+  if (match_data->heapframes == NULL)
     {
-    if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
-    mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
+    match_data->heapframes_size = 0;
+    return PCRE2_ERROR_NOMEMORY;
     }
-  mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
-    mb->memctl.memory_data);
-  if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
+  match_data->heapframes_size = heapframes_size;
   }
 
-mb->match_frames_top =
-  (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
-
 /* Write to the ovector within the first frame to mark every capture unset and
 to avoid uninitialized memory read errors when it is copied to a new frame. */
 
-memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
+memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,
   frame_size - offsetof(heapframe, ovector));
 
 /* Pointers to the individual character tables */
@@ -7279,8 +7286,8 @@
   mb->end_offset_top = 0;
   mb->skip_arg_count = 0;
 
-  rc = match(start_match, mb->start_code, match_data->ovector,
-    match_data->oveccount, re->top_bracket, frame_size, mb);
+  rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
+    match_data, mb);
 
   if (mb->hitend && start_partial == NULL)
     {
@@ -7463,11 +7470,6 @@
   }
 #endif  /* SUPPORT_UNICODE */
 
-/* Release an enlarged frame vector that is on the heap. */
-
-if (mb->match_frames != mb->stack_frames)
-  mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
-
 /* Fill in fields that are always returned in the match data. */
 
 match_data->code = re;
@@ -7533,4 +7535,10 @@
 return match_data->rc;
 }
 
+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND   /* Field containing processed string end */
+
 /* End of pcre2_match.c */
diff --git a/src/pcre2_match_data.c b/src/pcre2_match_data.c
index 53e4698..fa129b8 100644
--- a/src/pcre2_match_data.c
+++ b/src/pcre2_match_data.c
@@ -7,7 +7,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -51,19 +51,23 @@
 *  Create a match data block given ovector size  *
 *************************************************/
 
-/* A minimum of 1 is imposed on the number of ovector pairs. */
+/* A minimum of 1 is imposed on the number of ovector pairs. A maximum is also
+imposed because the oveccount field in a match data block is uintt6_t. */
 
 PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION
 pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext)
 {
 pcre2_match_data *yield;
 if (oveccount < 1) oveccount = 1;
+if (oveccount > UINT16_MAX) oveccount = UINT16_MAX;
 yield = PRIV(memctl_malloc)(
   offsetof(pcre2_match_data, ovector) + 2*oveccount*sizeof(PCRE2_SIZE),
   (pcre2_memctl *)gcontext);
 if (yield == NULL) return NULL;
 yield->oveccount = oveccount;
 yield->flags = 0;
+yield->heapframes = NULL;
+yield->heapframes_size = 0;
 return yield;
 }
 
@@ -95,6 +99,9 @@
 {
 if (match_data != NULL)
   {
+  if (match_data->heapframes != NULL)
+    match_data->memctl.free(match_data->heapframes,
+      match_data->memctl.memory_data);
   if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
     match_data->memctl.free((void *)match_data->subject,
       match_data->memctl.memory_data);
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
index 8b2c369..edbb78c 100644
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@@ -7,7 +7,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2021 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -259,16 +259,16 @@
 
 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
   return PCRE2_ERROR_BADOPTION;
-  
-/* Validate length and find the end of the replacement. A NULL replacement of 
+
+/* Validate length and find the end of the replacement. A NULL replacement of
 zero length is interpreted as an empty string. */
 
-if (replacement == NULL) 
+if (replacement == NULL)
   {
   if (rlength != 0) return PCRE2_ERROR_NULL;
-  replacement = (PCRE2_SPTR)""; 
-  } 
-   
+  replacement = (PCRE2_SPTR)"";
+  }
+
 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
 repend = replacement + rlength;
 
@@ -282,8 +282,9 @@
 match data block. We create an internal match_data block in two cases: (a) an
 external one is not supplied (and we are not starting from an existing match);
 (b) an existing match is to be used for the first substitution. In the latter
-case, we copy the existing match into the internal block. This ensures that no
-changes are made to the existing match data block. */
+case, we copy the existing match into the internal block, except for any cached
+heap frame size and pointer. This ensures that no changes are made to the
+external match data block. */
 
 if (match_data == NULL)
   {
@@ -309,6 +310,8 @@
   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
   memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
     + 2*pairs*sizeof(PCRE2_SIZE));
+  internal_match_data->heapframes = NULL;
+  internal_match_data->heapframes_size = 0;
   match_data = internal_match_data;
   }
 
@@ -328,9 +331,9 @@
 
 if (subject == NULL)
   {
-  if (length != 0) return PCRE2_ERROR_NULL; 
+  if (length != 0) return PCRE2_ERROR_NULL;
   subject = (PCRE2_SPTR)"";
-  } 
+  }
 
 /* Find length of zero-terminated subject */
 
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index 2335d0d..e7eb311 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -13,7 +13,7 @@
 The header can be found in the special z/OS distribution, which is available
 from www.zaconsultants.net or from www.cbttape.org.
 
-           Copyright (c) 1997-2020 University of Cambridge
+           Copyright (c) 1997-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -205,9 +205,6 @@
 *               Global variables                 *
 *************************************************/
 
-/* Jeffrey Friedl has some debugging requirements that are not part of the
-regular code. */
-
 static const char *colour_string = "1;31";
 static const char *colour_option = NULL;
 static const char *dee_option = NULL;
@@ -220,19 +217,24 @@
 
 static char *main_buffer = NULL;
 
+static const char *printname_nl = STDOUT_NL;  /* Changed to NULL for -Z */
+static int printname_colon = ':';             /* Changed to 0 for -Z */
+static int printname_hyphen = '-';            /* Changed to 0 for -Z */
+
 static int after_context = 0;
 static int before_context = 0;
 static int binary_files = BIN_BINARY;
 static int both_context = 0;
-static int bufthird = PCRE2GREP_BUFSIZE;
-static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
-static int bufsize = 3*PCRE2GREP_BUFSIZE;
 static int endlinetype;
 
 static int count_limit = -1;  /* Not long, so that it works with OP_NUMBER */
 static unsigned long int counts_printed = 0;
 static unsigned long int total_count = 0;
 
+static PCRE2_SIZE bufthird = PCRE2GREP_BUFSIZE;
+static PCRE2_SIZE max_bufthird = PCRE2GREP_MAX_BUFSIZE;
+static PCRE2_SIZE bufsize = 3*PCRE2GREP_BUFSIZE;
+
 #ifdef WIN32
 static int dee_action = dee_SKIP;
 #else
@@ -259,11 +261,13 @@
 
 static pcre2_compile_context *compile_context;
 static pcre2_match_context *match_context;
-static pcre2_match_data *match_data;
-static PCRE2_SIZE *offsets;
+static pcre2_match_data *match_data, *match_data_pair[2];
+static PCRE2_SIZE *offsets, *offsets_pair[2];
+static int match_data_toggle;
 static uint32_t offset_size;
 static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
 
+static BOOL all_matches = FALSE;
 static BOOL count_only = FALSE;
 static BOOL do_colour = FALSE;
 #ifdef WIN32
@@ -425,8 +429,8 @@
   { OP_NODATA,     'a',      NULL,              "text",          "treat binary files as text" },
   { OP_NUMBER,     'B',      &before_context,   "before-context=number", "set number of prior context lines" },
   { OP_BINFILES,   N_BINARY_FILES, NULL,        "binary-files=word", "set treatment of binary files" },
-  { OP_NUMBER,     N_BUFSIZE,&bufthird,         "buffer-size=number", "set processing buffer starting size" },
-  { OP_NUMBER,     N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number",  "set processing buffer maximum size" },
+  { OP_SIZE,       N_BUFSIZE,&bufthird,         "buffer-size=number", "set processing buffer starting size" },
+  { OP_SIZE,       N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number",  "set processing buffer maximum size" },
   { OP_OP_STRING,  N_COLOUR, &colour_option,    "color=option",  "matched text color option" },
   { OP_OP_STRING,  N_COLOUR, &colour_option,    "colour=option", "matched text colour option" },
   { OP_NUMBER,     'C',      &both_context,     "context=number", "set number of context lines, before & after" },
@@ -482,6 +486,7 @@
   { OP_NODATA,    'w',      NULL,              "word-regex(p)", "force patterns to match only as words"  },
   { OP_NODATA,    'x',      NULL,              "line-regex(p)", "force patterns to match only whole lines" },
   { OP_NODATA,   N_ALLABSK, NULL,              "allow-lookaround-bsk", "allow \\K in lookarounds" },
+  { OP_NODATA,    'Z',      NULL,              "null",          "output 0 byte after file names"  },
   { OP_NODATA,    0,        NULL,               NULL,            NULL }
 };
 
@@ -675,6 +680,9 @@
 add_pattern(char *s, PCRE2_SIZE patlen, patstr *after)
 {
 patstr *p = (patstr *)malloc(sizeof(patstr));
+
+/* LCOV_EXCL_START - These won't be hit in normal testing. */
+
 if (p == NULL)
   {
   fprintf(stderr, "pcre2grep: malloc failed\n");
@@ -687,6 +695,9 @@
   free(p);
   return NULL;
   }
+
+/* LCOV_EXCL_STOP */
+
 p->next = NULL;
 p->string = s;
 p->length = patlen;
@@ -1368,11 +1379,16 @@
 {
 omstr *om = (omstr *)malloc(sizeof(omstr));
 
+/* LCOV_EXCL_START - These lines won't be hit in normal testing. */
+
 if (om == NULL)
   {
   fprintf(stderr, "pcre2grep: malloc failed\n");
   pcre2grep_exit(2);
   }
+
+/* LCOV_EXCL_STOP */
+
 om->next = NULL;
 om->groupnum = n;
 
@@ -1408,10 +1424,10 @@
 */
 
 static PCRE2_SIZE
-read_one_line(char *buffer, int length, FILE *f)
+read_one_line(char *buffer, PCRE2_SIZE length, FILE *f)
 {
 int c;
-int yield = 0;
+PCRE2_SIZE yield = 0;
 while ((c = fgetc(f)) != EOF)
   {
   buffer[yield++] = c;
@@ -1772,7 +1788,7 @@
     {
     char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
     if (ellength == 0 && pp == main_buffer + bufsize) break;
-    if (printname != NULL) fprintf(stdout, "%s-", printname);
+    if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen);
     if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
     FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
     lastmatchrestart = pp;
@@ -1798,9 +1814,10 @@
 *   Apply patterns to subject till one matches   *
 *************************************************/
 
-/* This function is called to run through all patterns, looking for a match. It
-is used multiple times for the same subject when colouring is enabled, in order
-to find all possible matches.
+/* This function is called to run through all the patterns, looking for a
+match. When all possible matches are required, for example, for colouring, it
+checks all patterns for matching, and returns the earliest match. Otherwise, it
+returns the first pattern that has matched.
 
 Arguments:
   matchptr     the start of the subject
@@ -1809,17 +1826,18 @@
   startoffset  where to start matching
   mrc          address of where to put the result of pcre2_match()
 
-Returns:      TRUE if there was a match
-              FALSE if there was no match
-              invert if there was a non-fatal error
+Returns:       TRUE if there was a match, match_data and offsets are set
+               FALSE if there was no match (but no errors)
+               invert if there was a non-fatal error
 */
 
 static BOOL
 match_patterns(char *matchptr, PCRE2_SIZE length, unsigned int options,
   PCRE2_SIZE startoffset, int *mrc)
 {
-int i;
 PCRE2_SIZE slen = length;
+int first = -1;
+int firstrc = 0;
 patstr *p = patterns;
 const char *msg = "this text:\n\n";
 
@@ -1829,27 +1847,53 @@
   msg = "text that starts:\n\n";
   }
 
-for (i = 1; p != NULL; p = p->next, i++)
+for (int i = 1; p != NULL; p = p->next, i++)
   {
-  *mrc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length,
+  int rc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length,
     startoffset, options, match_data, match_context);
-  if (*mrc >= 0) return TRUE;
-  if (*mrc == PCRE2_ERROR_NOMATCH) continue;
-  fprintf(stderr, "pcre2grep: pcre2_match() gave error %d while matching ", *mrc);
+  if (rc == PCRE2_ERROR_NOMATCH) continue;
+
+  /* Handle a successful match. When all_matches is false, we are done.
+  Otherwise we must save the earliest match. */
+
+  if (rc >= 0)
+    {
+    if (!all_matches)
+      {
+      *mrc = rc;
+      return TRUE;
+      }
+
+    if (first < 0 || offsets[0] < offsets_pair[first][0] ||
+         (offsets[0] == offsets_pair[first][0] &&
+          offsets[1] > offsets_pair[first][1]))
+      {
+      first = match_data_toggle;
+      firstrc = rc;
+      match_data_toggle ^= 1;
+      match_data = match_data_pair[match_data_toggle];
+      offsets = offsets_pair[match_data_toggle];
+      }
+    continue;
+    }
+
+  /* Deal with PCRE2 error. */
+
+  fprintf(stderr, "pcre2grep: pcre2_match() gave error %d while matching ", rc);
   if (patterns->next != NULL) fprintf(stderr, "pattern number %d to ", i);
   fprintf(stderr, "%s", msg);
   FWRITE_IGNORE(matchptr, 1, slen, stderr);   /* In case binary zero included */
   fprintf(stderr, "\n\n");
-  if (*mrc <= PCRE2_ERROR_UTF8_ERR1 &&
-      *mrc >= PCRE2_ERROR_UTF8_ERR21)
+  if (rc <= PCRE2_ERROR_UTF8_ERR1 &&
+      rc >= PCRE2_ERROR_UTF8_ERR21)
     {
     unsigned char mbuffer[256];
     PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
-    (void)pcre2_get_error_message(*mrc, mbuffer, sizeof(mbuffer));
+    (void)pcre2_get_error_message(rc, mbuffer, sizeof(mbuffer));
     fprintf(stderr, "%s at offset %" SIZ_FORM "\n\n", mbuffer, startchar);
     }
-  if (*mrc == PCRE2_ERROR_MATCHLIMIT || *mrc == PCRE2_ERROR_DEPTHLIMIT ||
-      *mrc == PCRE2_ERROR_HEAPLIMIT || *mrc == PCRE2_ERROR_JIT_STACKLIMIT)
+  if (rc == PCRE2_ERROR_MATCHLIMIT || rc == PCRE2_ERROR_DEPTHLIMIT ||
+      rc == PCRE2_ERROR_HEAPLIMIT || rc == PCRE2_ERROR_JIT_STACKLIMIT)
     resource_error = TRUE;
   if (error_count++ > 20)
     {
@@ -1859,7 +1903,18 @@
   return invert;    /* No more matching; don't show the line again */
   }
 
-return FALSE;  /* No match, no errors */
+/* We get here when all patterns have been tried. If all_matches is false,
+this means that none of them matched. If all_matches is true, matched_first
+will be non-NULL if there was at least one match, and it will point to the
+appropriate match_data block. */
+
+if (!all_matches || first < 0) return FALSE;
+
+match_data_toggle = first;
+match_data = match_data_pair[first];
+offsets = offsets_pair[first];
+*mrc = firstrc;
+return TRUE;
 }
 
 
@@ -2160,18 +2215,19 @@
         }
       continue;
 
+      /* LCOV_EXCL_START */
       default:  /* Should not occur */
       break;
+      /* LCOV_EXCL_STOP */
       }
     }
 
   else value = *string;  /* Not a $ escape */
 
-  if (utf && value <= 127) fprintf(stdout, "%c", *string); else
+  if (!utf || value <= 127) fprintf(stdout, "%c", value); else
     {
-    int i;
     int n = ord2utf8(value);
-    for (i = 0; i < n; i++) fputc(utf8_buffer[i], stdout);
+    for (int i = 0; i < n; i++) fputc(utf8_buffer[i], stdout);
     }
 
   printed = TRUE;
@@ -2303,9 +2359,11 @@
         else if (utf && value > 127) argslen += ord2utf8(value) - 1;
       break;
 
+      /* LCOV_EXCL_START */
       default:         /* Should not occur */
       case DDE_ERROR:
       return 0;
+      /* LCOV_EXCL_STOP */
       }
 
     length -= (string - begin);
@@ -2324,8 +2382,10 @@
 argsvector = (char**)malloc(argsvectorlen * sizeof(char*));
 if (argsvector == NULL)
   {
+  /* LCOV_EXCL_START */
   free(args);
   return 0;
+  /* LCOV_EXCL_STOP */
   }
 
 /* Now reprocess the string and set up the arguments. */
@@ -2381,11 +2441,13 @@
         }
       break;
 
+      /* LCOV_EXCL_START */
       default:         /* Even though this should not occur, the string having */
       case DDE_ERROR:  /* been checked above, we need to include the free() */
       free(args);      /* calls so that source checkers do not complain. */
       free(argsvector);
       return 0;
+      /* LCOV_EXCL_STOP */
       }
 
     length -= (string - begin);
@@ -2437,7 +2499,11 @@
   exit(1);
   }
 else if (pid > 0)
+  {
+  (void)fflush(stdout);
   (void)waitpid(pid, &result, 0);
+  (void)fflush(stdout);
+  }
 #endif  /* End Windows/VMS/other handling */
 
 free(args);
@@ -2457,8 +2523,8 @@
 *     Read a portion of the file into buffer     *
 *************************************************/
 
-static int
-fill_buffer(void *handle, int frtype, char *buffer, int length,
+static PCRE2_SIZE
+fill_buffer(void *handle, int frtype, char *buffer, PCRE2_SIZE length,
   BOOL input_line_buffered)
 {
 (void)frtype;  /* Avoid warning when not used */
@@ -2471,7 +2537,7 @@
 
 #ifdef SUPPORT_LIBBZ2
 if (frtype == FR_LIBBZ2)
-  return BZ2_bzread((BZFILE *)handle, buffer, length);
+  return (PCRE2_SIZE)BZ2_bzread((BZFILE *)handle, buffer, length);
 else
 #endif
 
@@ -2537,10 +2603,8 @@
 if (frtype != FR_LIBZ && frtype != FR_LIBBZ2)
   {
   in = (FILE *)handle;
-  if (feof(in))
-    return 1;
-  if (is_file_tty(in))
-    input_line_buffered = TRUE;
+  if (feof(in)) return 1;
+  if (is_file_tty(in)) input_line_buffered = TRUE;
   else
     {
     if (count_limit >= 0  && filename == stdin_name)
@@ -2553,7 +2617,7 @@
   input_line_buffered);
 
 #ifdef SUPPORT_LIBBZ2
-if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2;   /* Gotcha: bufflength is PCRE2_SIZE */
+if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 3;   /* Gotcha: bufflength is PCRE2_SIZE */
 #endif
 
 endptr = main_buffer + bufflength;
@@ -2620,21 +2684,24 @@
     if (bufthird < max_bufthird)
       {
       char *new_buffer;
-      int new_bufthird = 2*bufthird;
+      PCRE2_SIZE new_bufthird = 2*bufthird;
 
       if (new_bufthird > max_bufthird) new_bufthird = max_bufthird;
       new_buffer = (char *)malloc(3*new_bufthird);
 
       if (new_buffer == NULL)
         {
+        /* LCOV_EXCL_START */
         fprintf(stderr,
           "pcre2grep: line %lu%s%s is too long for the internal buffer\n"
-          "pcre2grep: not enough memory to increase the buffer size to %d\n",
+          "pcre2grep: not enough memory to increase the buffer size to %"
+            SIZ_FORM "\n",
           linenumber,
           (filename == NULL)? "" : " of file ",
           (filename == NULL)? "" : filename,
           new_bufthird);
         return 2;
+        /* LCOV_EXCL_STOP */
         }
 
       /* Copy the data and adjust pointers to the new buffer location. */
@@ -2659,7 +2726,7 @@
       {
       fprintf(stderr,
         "pcre2grep: line %lu%s%s is too long for the internal buffer\n"
-        "pcre2grep: the maximum buffer size is %d\n"
+        "pcre2grep: the maximum buffer size is %" SIZ_FORM "\n"
         "pcre2grep: use the --max-buffer-size option to change it\n",
         linenumber,
         (filename == NULL)? "" : " of file ",
@@ -2724,7 +2791,9 @@
 
     else if (filenames == FN_MATCH_ONLY)
       {
-      fprintf(stdout, "%s" STDOUT_NL, printname);
+      fprintf(stdout, "%s", printname);
+      if (printname_nl == NULL) fprintf(stdout, "%c", 0);
+        else fprintf(stdout, "%s", printname_nl);
       return 0;
       }
 
@@ -2743,7 +2812,8 @@
         {
         PCRE2_SIZE oldstartoffset;
 
-        if (printname != NULL) fprintf(stdout, "%s:", printname);
+        if (printname != NULL) fprintf(stdout, "%s%c", printname,
+          printname_colon);
         if (number) fprintf(stdout, "%lu:", linenumber);
 
         /* Handle --line-offsets */
@@ -2763,10 +2833,9 @@
 
         else if (output_text != NULL)
           {
-          if (display_output_text((PCRE2_SPTR)output_text, FALSE,
-              (PCRE2_SPTR)ptr, offsets, mrc) || printname != NULL ||
-              number)
-            fprintf(stdout, STDOUT_NL);
+          (void)display_output_text((PCRE2_SPTR)output_text, FALSE,
+              (PCRE2_SPTR)ptr, offsets, mrc);
+          fprintf(stdout, STDOUT_NL);
           }
 
         /* Handle --only-matching, which may occur many times */
@@ -2791,7 +2860,6 @@
                 }
               }
             }
-
           if (printed || printname != NULL || number)
             fprintf(stdout, STDOUT_NL);
           }
@@ -2865,7 +2933,8 @@
         while (lastmatchrestart < p)
           {
           char *pp = lastmatchrestart;
-          if (printname != NULL) fprintf(stdout, "%s-", printname);
+          if (printname != NULL) fprintf(stdout, "%s%c", printname,
+            printname_hyphen);
           if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
           pp = end_of_line(pp, endptr, &ellength);
           FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
@@ -2906,7 +2975,8 @@
           {
           int ellength;
           char *pp = p;
-          if (printname != NULL) fprintf(stdout, "%s-", printname);
+          if (printname != NULL) fprintf(stdout, "%s%c", printname,
+            printname_hyphen);
           if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
           pp = end_of_line(pp, endptr, &ellength);
           FWRITE_IGNORE(p, 1, pp - p, stdout);
@@ -2920,7 +2990,8 @@
       if (after_context > 0 || before_context > 0)
         endhyphenpending = TRUE;
 
-      if (printname != NULL) fprintf(stdout, "%s:", printname);
+      if (printname != NULL) fprintf(stdout, "%s%c", printname,
+        printname_colon);
       if (number) fprintf(stdout, "%lu:", linenumber);
 
       /* In multiline mode, or if colouring, we have to split the line(s) up
@@ -3076,7 +3147,7 @@
 
   if (input_line_buffered && bufflength < (PCRE2_SIZE)bufsize)
     {
-    int add = read_one_line(ptr, bufsize - (int)(ptr - main_buffer), in);
+    PCRE2_SIZE add = read_one_line(ptr, bufsize - (ptr - main_buffer), in);
     bufflength += add;
     endptr += add;
     }
@@ -3125,7 +3196,9 @@
 
 if (filenames == FN_NOMATCH_ONLY)
   {
-  fprintf(stdout, "%s" STDOUT_NL, printname);
+  fprintf(stdout, "%s", printname);
+  if (printname_nl == NULL) fprintf(stdout, "%c", 0);
+    else fprintf(stdout, "%s", printname_nl);
   return 0;
   }
 
@@ -3136,7 +3209,7 @@
   if (count > 0 || !omit_zero_count)
     {
     if (printname != NULL && filenames != FN_NONE)
-      fprintf(stdout, "%s:", printname);
+      fprintf(stdout, "%s%c", printname, printname_colon);
     fprintf(stdout, "%lu" STDOUT_NL, count);
     counts_printed++;
     }
@@ -3268,10 +3341,12 @@
 
     if (dir == NULL)
       {
+      /* LCOV_EXCL_START - this is a "never" event */
       if (!silent)
         fprintf(stderr, "pcre2grep: Failed to open directory %s: %s\n", pathname,
           strerror(errno));
       return 2;
+      /* LCOV_EXCL_STOP */
       }
 
     while ((nextfile = readdirectory(dir)) != NULL)
@@ -3280,9 +3355,11 @@
       int fnlength = strlen(pathname) + strlen(nextfile) + 2;
       if (fnlength > FNBUFSIZ)
         {
+        /* LCOV_EXCL_START - this is a "never" event */
         fprintf(stderr, "pcre2grep: recursive filename is too long\n");
         rc = 2;
         break;
+        /* LCOV_EXCL_STOP */
         }
       sprintf(childpath, "%s%c%s", pathname, FILESEP, nextfile);
 
@@ -3299,7 +3376,9 @@
       BOOL isSame;
       size_t rlen;
       if (realpath(childpath, resolvedpath) == NULL)
+        /* LCOV_EXCL_START - this is a "never" event */
         continue;     /* This path is invalid - we can skip processing this */
+        /* LCOV_EXCL_STOP */
       isSame = strcmp(pathname, resolvedpath) == 0;
       if (isSame) continue;    /* We have a recursion */
       rlen = strlen(resolvedpath);
@@ -3388,10 +3467,12 @@
   ingz = gzopen(pathname, "rb");
   if (ingz == NULL)
     {
+    /* LCOV_EXCL_START */
     if (!silent)
       fprintf(stderr, "pcre2grep: Failed to open %s: %s\n", pathname,
         strerror(errno));
     return 2;
+    /* LCOV_EXCL_STOP */
     }
   handle = (void *)ingz;
   frtype = FR_LIBZ;
@@ -3462,10 +3543,12 @@
       BZ2_bzclose(inbz2);
       goto PLAIN_FILE;
       }
+    /* LCOV_EXCL_START */
     else if (!silent)
       fprintf(stderr, "pcre2grep: Failed to read %s using bzlib: %s\n",
         pathname, err);
     rc = 2;    /* The normal "something went wrong" code */
+    /* LCOV_EXCL_STOP */
     }
   BZ2_bzclose(inbz2);
   }
@@ -3487,6 +3570,8 @@
 *          Handle a no-data option               *
 *************************************************/
 
+/* This is called when a known option has been identified. */
+
 static int
 handle_option(int letter, int options)
 {
@@ -3522,8 +3607,6 @@
   case 'u': options |= PCRE2_UTF; utf = TRUE; break;
   case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
   case 'v': invert = TRUE; break;
-  case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
-  case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
 
   case 'V':
     {
@@ -3532,11 +3615,17 @@
     fprintf(stdout, "pcre2grep version %s" STDOUT_NL, buffer);
     }
   pcre2grep_exit(0);
-  break;
+  break;  /* LCOV_EXCL_LINE - statement kept to avoid compiler warning */
 
+  case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
+  case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
+  case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break;
+
+  /* LCOV_EXCL_START - this is a "never event" */
   default:
   fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
   pcre2grep_exit(usage(2));
+  /* LCOV_EXCL_STOP */
   }
 
 return options;
@@ -3714,8 +3803,10 @@
   *patlastptr = add_pattern(buffer, patlen, *patlastptr);
   if (*patlastptr == NULL)
     {
+    /* LCOV_EXCL_START - won't happen in testing */
     if (f != stdin) fclose(f);
     return FALSE;
+    /* LCOV_EXCL_STOP */
     }
   if (*patptr == NULL) *patptr = *patlastptr;
 
@@ -3868,9 +3959,11 @@
                      fulllen - baselen - 2, opbra + 1),
              ret < 0 || ret > (int)sizeof(buff2)))
           {
+          /* LCOV_EXCL_START - this is a "never" event */
           fprintf(stderr, "pcre2grep: Buffer overflow when parsing %s option\n",
             op->long_name);
           pcre2grep_exit(2);
+          /* LCOV_EXCL_STOP */
           }
 
         if (strncmp(arg, buff1, arglen) == 0 ||
@@ -4024,8 +4117,10 @@
     fn = (fnstr *)malloc(sizeof(fnstr));
     if (fn == NULL)
       {
+      /* LCOV_EXCL_START */
       fprintf(stderr, "pcre2grep: malloc failed\n");
       goto EXIT2;
+      /* LCOV_EXCL_STOP */
       }
     fn->next = NULL;
     fn->name = option_data;
@@ -4093,7 +4188,6 @@
   pcre2grep_exit(usage(2));
   }
 
-
 /* Check that there is a big enough ovector for all -o settings. */
 
 for (om = only_matching; om != NULL; om = om->next)
@@ -4113,13 +4207,18 @@
     !syntax_check_output_text((PCRE2_SPTR)output_text, FALSE))
   goto EXIT2;
 
-/* Set up default compile and match contexts and a match data block. */
+/* Set up default compile and match contexts and match data blocks. */
 
 offset_size = capture_max + 1;
 compile_context = pcre2_compile_context_create(NULL);
 match_context = pcre2_match_context_create(NULL);
-match_data = pcre2_match_data_create(offset_size, NULL);
-offsets = pcre2_get_ovector_pointer(match_data);
+match_data_pair[0] = pcre2_match_data_create(offset_size, NULL);
+match_data_pair[1] = pcre2_match_data_create(offset_size, NULL);
+offsets_pair[0] = pcre2_get_ovector_pointer(match_data_pair[0]);
+offsets_pair[1] = pcre2_get_ovector_pointer(match_data_pair[1]);
+match_data = match_data_pair[0];
+offsets = offsets_pair[0];
+match_data_toggle = 0;
 
 /* If string (script) callouts are supported, set up the callout processing
 function. */
@@ -4198,6 +4297,11 @@
     }
   }
 
+/* When colouring or otherwise identifying matching substrings, we need to find
+all possible matches when there are multiple patterns. */
+
+all_matches = do_colour || only_matching_count != 0;
+
 /* Sort out a newline setting. */
 
 if (newline_arg != NULL)
@@ -4253,8 +4357,6 @@
 
 (void)pcre2_set_compile_extra_options(compile_context, extra_options);
 
-/* Check the values for Jeffrey Friedl's debugging options. */
-
 /* If use_jit is set, check whether JIT is available. If not, do not try
 to use JIT. */
 
@@ -4278,8 +4380,10 @@
 
 if (main_buffer == NULL)
   {
+  /* LCOV_EXCL_START */
   fprintf(stderr, "pcre2grep: malloc failed\n");
   goto EXIT2;
+  /* LCOV_EXCL_STOP */
   }
 
 /* If no patterns were provided by -e, and there are no files provided by -f,
@@ -4421,14 +4525,6 @@
     else if (frc == 0 && rc == 1) rc = 0;
   }
 
-#ifdef SUPPORT_PCRE2GREP_CALLOUT
-/* If separating builtin echo callouts by implicit newline, add one more for
-the final item. */
-
-if (om_separator != NULL && strcmp(om_separator, STDOUT_NL) == 0)
-  fprintf(stdout, STDOUT_NL);
-#endif
-
 /* Show the total number of matches if requested, but not if only one file's
 count was printed. */
 
@@ -4450,7 +4546,8 @@
 
 pcre2_compile_context_free(compile_context);
 pcre2_match_context_free(match_context);
-pcre2_match_data_free(match_data);
+pcre2_match_data_free(match_data_pair[0]);
+pcre2_match_data_free(match_data_pair[1]);
 
 free_pattern_chain(patterns);
 free_pattern_chain(include_patterns);
diff --git a/src/pcre2posix.c b/src/pcre2posix.c
index 647edcc..a15174d 100644
--- a/src/pcre2posix.c
+++ b/src/pcre2posix.c
@@ -7,7 +7,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2021 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -92,20 +92,6 @@
 #include "pcre2.h"
 #include "pcre2posix.h"
 
-/* When compiling with the MSVC compiler, it is sometimes necessary to include
-a "calling convention" before exported function names. (This is secondhand
-information; I know nothing about MSVC myself). For example, something like
-
-  void __cdecl function(....)
-
-might be needed. In order to make this easy, all the exported functions have
-PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not
-set, we ensure here that it has no effect. */
-
-#ifndef PCRE2_CALL_CONVENTION
-#define PCRE2_CALL_CONVENTION
-#endif
-
 /* Table to translate PCRE2 compile time error codes into POSIX error codes.
 Only a few PCRE2 errors with a value greater than 23 turn into special POSIX
 codes: most go to REG_BADPAT. The second table lists, in pairs, those that
@@ -342,8 +328,10 @@
 
 if (preg->re_match_data == NULL)
   {
+  /* LCOV_EXCL_START */
   pcre2_code_free(preg->re_pcre2_code);
   return REG_ESPACE;
+  /* LCOV_EXCL_STOP */
   }
 
 return 0;
@@ -423,17 +411,24 @@
 if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21)
   return REG_INVARG;
 
+/* Most of these are events that won't occur during testing, so exclude them
+from coverage. */
+
 switch(rc)
   {
-  default: return REG_ASSERT;
+  case PCRE2_ERROR_HEAPLIMIT: return REG_ESPACE;
+  case PCRE2_ERROR_NOMATCH: return REG_NOMATCH;
+
+  /* LCOV_EXCL_START */
   case PCRE2_ERROR_BADMODE: return REG_INVARG;
   case PCRE2_ERROR_BADMAGIC: return REG_INVARG;
   case PCRE2_ERROR_BADOPTION: return REG_INVARG;
   case PCRE2_ERROR_BADUTFOFFSET: return REG_INVARG;
   case PCRE2_ERROR_MATCHLIMIT: return REG_ESPACE;
-  case PCRE2_ERROR_NOMATCH: return REG_NOMATCH;
   case PCRE2_ERROR_NOMEMORY: return REG_ESPACE;
   case PCRE2_ERROR_NULL: return REG_INVARG;
+  default: return REG_ASSERT;
+  /* LCOV_EXCL_STOP */
   }
 }
 
diff --git a/src/pcre2posix.h b/src/pcre2posix.h
index 3a663b9..6e3b2cf 100644
--- a/src/pcre2posix.h
+++ b/src/pcre2posix.h
@@ -9,7 +9,7 @@
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -116,6 +116,20 @@
   regoff_t rm_eo;
 } regmatch_t;
 
+/* When compiling with the MSVC compiler, it is sometimes necessary to include
+a "calling convention" before exported function names. (This is secondhand
+information; I know nothing about MSVC myself). For example, something like
+
+  void __cdecl function(....)
+
+might be needed. In order to make this easy, all the exported functions have
+PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not
+set, we ensure here that it has no effect. */
+
+#ifndef PCRE2_CALL_CONVENTION
+#define PCRE2_CALL_CONVENTION
+#endif
+
 /* When an application links to a PCRE2 DLL in Windows, the symbols that are
 imported have to be identified as such. When building PCRE2, the appropriate
 export settings are needed, and are set in pcre2posix.c before including this
@@ -144,11 +158,11 @@
 the PCRE2 library and not by accident from elsewhere (regex_t differs in size
 elsewhere). */
 
-PCRE2POSIX_EXP_DECL int pcre2_regcomp(regex_t *, const char *, int);
-PCRE2POSIX_EXP_DECL int pcre2_regexec(const regex_t *, const char *, size_t,
+PCRE2POSIX_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_regcomp(regex_t *, const char *, int);
+PCRE2POSIX_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_regexec(const regex_t *, const char *, size_t,
                      regmatch_t *, int);
-PCRE2POSIX_EXP_DECL size_t pcre2_regerror(int, const regex_t *, char *, size_t);
-PCRE2POSIX_EXP_DECL void pcre2_regfree(regex_t *);
+PCRE2POSIX_EXP_DECL size_t PCRE2_CALL_CONVENTION pcre2_regerror(int, const regex_t *, char *, size_t);
+PCRE2POSIX_EXP_DECL void PCRE2_CALL_CONVENTION pcre2_regfree(regex_t *);
 
 #define regcomp  pcre2_regcomp
 #define regexec  pcre2_regexec
diff --git a/src/pcre2posix_test.c b/src/pcre2posix_test.c
new file mode 100644
index 0000000..e85e72b
--- /dev/null
+++ b/src/pcre2posix_test.c
@@ -0,0 +1,209 @@
+/*************************************************
+*        PCRE2 POSIX interface test program      *
+*************************************************/
+
+/*
+Written by Philip Hazel, December 2022
+Copyright (c) 2022
+File last edited: December 2022
+
+This program tests the POSIX wrapper to the PCRE2 regular expression library.
+The main PCRE2 test program is pcre2test, which also tests these function
+calls. This little program is needed to test the case where the client includes
+pcre2posix.h but not pcre2.h, mainly to make sure that it builds successfully.
+However, the code is written as a flexible test program to which extra tests
+can be added.
+
+Compile with -lpcre2-posix -lpcre2-8
+
+If run with no options, there is no output on success, and the return code is
+zero. If any test fails there is output to stderr, and the return code is 1.
+
+For testing purposes, the "-v" option causes verification output to be written
+to stdout. */
+
+#include <stdio.h>
+#include <string.h>
+#include <pcre2posix.h>
+
+#define CAPCOUNT 5               /* Number of captures supported */
+#define PRINTF if (v) printf     /* Shorthand for testing output */
+
+/* This vector contains compiler flags for each pattern that is tested. */
+
+static int cflags[] = {
+  0,           /* Test 0 */
+  REG_ICASE,   /* Test 1 */
+  0,           /* Test 2 */
+  REG_NEWLINE, /* Test 3 */
+  0            /* Test 4 */
+};
+
+/* This vector contains match flags for each pattern that is tested. */
+
+static int mflags[] = {
+  0,           /* Test 0 */
+  0,           /* Test 1 */
+  0,           /* Test 2 */
+  REG_NOTBOL,  /* Test 3 */
+  0            /* Test 4 */
+};
+
+/* Automate the number of patterns */
+
+#define count (int)(sizeof(cflags)/sizeof(int))
+
+/* The data for each pattern consists of a pattern string, followed by any
+number of subject strings, terminated by NULL. Some tests share data, but use
+different flags. */
+
+static const char *data0_1[] = { "posix", "lower posix", "upper POSIX", NULL };
+static const char *data2_3[] = { "^(cat|dog)", "catastrophic\ncataclysm",
+  "dogfight", "no animals", NULL };
+static const char *data4[] = { "*badpattern", NULL };
+
+/* Index the data strings */
+
+static char **data[] = {
+  (char **)(&data0_1),
+  (char **)(&data0_1),
+  (char **)(&data2_3),
+  (char **)(&data2_3),
+  (char **)(&data4)
+};
+
+/* The expected results for each pattern consist of a compiler return code,
+optionally followed, for each subject string, by a match return code and, for a
+successful match, up to CAPCOUNT pairs of returned match data. */
+
+static int results0[] = {
+  0,             /* Compiler rc */
+  0, 6, 11,      /* 1st match */
+  REG_NOMATCH    /* 2nd match */
+};
+
+static int results1[] = {
+  0,             /* Compiler rc */
+  0, 6, 11,      /* 1st match */
+  0, 6, 11       /* 2nd match */
+};
+
+static int results2[] = {
+  0,             /* Compiler rc */
+  0, 0, 3, 0, 3, /* 1st match */
+  0, 0, 3, 0, 3, /* 2nd match */
+  REG_NOMATCH    /* 3rd match */
+};
+
+static int results3[] = {
+  0,                 /* Compiler rc */
+  0, 13, 16, 13, 16, /* 1st match */
+  REG_NOMATCH,       /* 2nd match */
+  REG_NOMATCH        /* 3rd match */
+};
+
+static int results4[] = {
+  REG_BADRPT         /* Compiler rc */
+};
+
+/* Index the result vectors */
+
+static int *results[] = {
+  (int *)(&results0),
+  (int *)(&results1),
+  (int *)(&results2),
+  (int *)(&results3),
+  (int *)(&results4)
+};
+
+/* And here is the program */
+
+int main(int argc, char **argv)
+{
+regex_t re;
+regmatch_t match[CAPCOUNT];
+int v = argc > 1 && strcmp(argv[1], "-v") == 0;
+
+PRINTF("Test of pcre2posix.h without pcre2.h\n");
+
+for (int i = 0; i < count; i++)
+  {
+  char *pattern = data[i][0];
+  char **subjects = data[i] + 1;
+  int *rd = results[i];
+  int rc = regcomp(&re, pattern, cflags[i]);
+
+  PRINTF("Pattern: %s flags=0x%02x\n", pattern, cflags[i]);
+
+  if (rc != *rd)
+    {
+    fprintf(stderr, "Unexpected compile error %d (expected %d)\n", rc, *rd);
+    fprintf(stderr, "Pattern is: %s\n", pattern);
+    return 1;
+    }
+
+  if (rc != 0)
+    {
+    if (v)
+      {
+      char buffer[256];
+      (void)regerror(rc, &re, buffer, sizeof(buffer));
+      PRINTF("Compile error %d: %s (expected)\n", rc, buffer);
+      }
+    continue;
+    }
+
+  for (; *subjects != NULL; subjects++)
+    {
+    rc = regexec(&re, *subjects, CAPCOUNT, match, mflags[i]);
+
+    PRINTF("Subject: %s\n", *subjects);
+    PRINTF("Return:  %d", rc);
+
+    if (rc != *(++rd))
+      {
+      PRINTF("\n");
+      fprintf(stderr, "Unexpected match error %d (expected %d)\n", rc, *rd);
+      fprintf(stderr, "Pattern is: %s\n", pattern);
+      fprintf(stderr, "Subject is: %s\n", *subjects);
+      return 1;
+      }
+
+    if (rc == 0)
+      {
+      for (int j = 0; j < CAPCOUNT; j++)
+        {
+        regmatch_t *m = match + j;
+        if (m->rm_so < 0) continue;
+        if (m->rm_so != *(++rd) || m->rm_eo != *(++rd))
+          {
+          PRINTF("\n");
+          fprintf(stderr, "Mismatched results for successful match\n");
+          fprintf(stderr, "Pattern is: %s\n", pattern);
+          fprintf(stderr, "Subject is: %s\n", *subjects);
+          fprintf(stderr, "Result %d: expected %d %d received %d %d\n",
+            j, rd[-1], rd[0], m->rm_so, m->rm_eo);
+          return 1;
+          }
+        PRINTF(" (%d %d %d)", j, m->rm_so, m->rm_eo);
+        }
+      }
+
+    else if (v)
+      {
+      char buffer[256];
+      (void)regerror(rc, &re, buffer, sizeof(buffer));
+      PRINTF(": %s (expected)", buffer);
+      }
+
+    PRINTF("\n");
+    }
+
+  regfree(&re);
+  }
+
+PRINTF("End of test\n");
+return 0;
+}
+
+/* End of pcre2posix_test.c */
diff --git a/src/pcre2test.c b/src/pcre2test.c
index ea52a20..4fa5884 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -479,7 +479,7 @@
 #define CTL_DFA                          0x00000200u
 #define CTL_EXPAND                       0x00000400u
 #define CTL_FINDLIMITS                   0x00000800u
-#define CTL_FRAMESIZE                    0x00001000u
+#define CTL_FINDLIMITS_NOHEAP            0x00001000u
 #define CTL_FULLBINCODE                  0x00002000u
 #define CTL_GETALL                       0x00004000u
 #define CTL_GLOBAL                       0x00008000u
@@ -522,6 +522,7 @@
 #define CTL2_ALLVECTOR                   0x00000800u
 #define CTL2_NULL_SUBJECT                0x00001000u
 #define CTL2_NULL_REPLACEMENT            0x00002000u
+#define CTL2_FRAMESIZE                   0x00004000u
 
 #define CTL2_NL_SET                      0x40000000u  /* Informational */
 #define CTL2_BSR_SET                     0x80000000u  /* Informational */
@@ -673,8 +674,9 @@
   { "extended_more",               MOD_PATP, MOD_OPT, PCRE2_EXTENDED_MORE,        PO(options) },
   { "extra_alt_bsux",              MOD_CTC,  MOD_OPT, PCRE2_EXTRA_ALT_BSUX,       CO(extra_options) },
   { "find_limits",                 MOD_DAT,  MOD_CTL, CTL_FINDLIMITS,             DO(control) },
+  { "find_limits_noheap",          MOD_DAT,  MOD_CTL, CTL_FINDLIMITS_NOHEAP,      DO(control) },
   { "firstline",                   MOD_PAT,  MOD_OPT, PCRE2_FIRSTLINE,            PO(options) },
-  { "framesize",                   MOD_PAT,  MOD_CTL, CTL_FRAMESIZE,              PO(control) },
+  { "framesize",                   MOD_PAT,  MOD_CTL, CTL2_FRAMESIZE,             PO(control2) },
   { "fullbincode",                 MOD_PAT,  MOD_CTL, CTL_FULLBINCODE,            PO(control) },
   { "get",                         MOD_DAT,  MOD_NN,  DO(get_numbers),            DO(get_names) },
   { "getall",                      MOD_DAT,  MOD_CTL, CTL_GETALL,                 DO(control) },
@@ -781,10 +783,11 @@
 
 #define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
   CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \
-  CTL_JITVERIFY|CTL_MEMORY|CTL_FRAMESIZE|CTL_PUSH|CTL_PUSHCOPY| \
+  CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_PUSHCOPY| \
   CTL_PUSHTABLESCOPY|CTL_USE_LENGTH)
 
-#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (CTL2_BSR_SET|CTL2_NL_SET)
+#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (CTL2_BSR_SET|CTL2_FRAMESIZE| \
+  CTL2_NL_SET)
 
 /* Controls that apply only at compile time with 'push'. */
 
@@ -813,8 +816,9 @@
 first control word. */
 
 static uint32_t exclusive_dat_controls[] = {
-  CTL_ALLUSEDTEXT | CTL_STARTCHAR,
-  CTL_FINDLIMITS  | CTL_NULLCONTEXT };
+  CTL_ALLUSEDTEXT        | CTL_STARTCHAR,
+  CTL_FINDLIMITS         | CTL_NULLCONTEXT,
+  CTL_FINDLIMITS_NOHEAP  | CTL_NULLCONTEXT };
 
 /* Table of single-character abbreviated modifiers. The index field is
 initialized to -1, but the first time the modifier is encountered, it is filled
@@ -927,7 +931,6 @@
 static BOOL restrict_for_perl_test = FALSE;
 static BOOL show_memory = FALSE;
 
-static int code_unit_size;                    /* Bytes */
 static int jitrc;                             /* Return from JIT compile */
 static int test_mode = DEFAULT_TEST_MODE;
 static int timeit = 0;
@@ -937,6 +940,7 @@
 clock_t total_jit_compile_time = 0;
 clock_t total_match_time = 0;
 
+static uint32_t code_unit_size;               /* Bytes */
 static uint32_t dfa_matched;
 static uint32_t forbid_utf = 0;
 static uint32_t maxlookbehind;
@@ -1227,10 +1231,15 @@
   else \
     pcre2_jit_stack_free_32((pcre2_jit_stack_32 *)a);
 
-#define PCRE2_MAKETABLES(a) \
-  if (test_mode == PCRE8_MODE) a = pcre2_maketables_8(NULL); \
-  else if (test_mode == PCRE16_MODE) a = pcre2_maketables_16(NULL); \
-  else a = pcre2_maketables_32(NULL)
+#define PCRE2_MAKETABLES(a,c) \
+  if (test_mode == PCRE8_MODE) a = pcre2_maketables_8(G(c,8)); \
+  else if (test_mode == PCRE16_MODE) a = pcre2_maketables_16(G(c,16)); \
+  else a = pcre2_maketables_32(G(c,32))
+
+#define PCRE2_MAKETABLES_FREE(c,a) \
+  if (test_mode == PCRE8_MODE) pcre2_maketables_free_8(G(c,8),a); \
+  else if (test_mode == PCRE16_MODE) pcre2_maketables_free_16(G(c,16),a); \
+  else pcre2_maketables_free_32(G(c,32),a)
 
 #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
   if (test_mode == PCRE8_MODE) \
@@ -1242,19 +1251,19 @@
 
 #define PCRE2_MATCH_DATA_CREATE(a,b,c) \
   if (test_mode == PCRE8_MODE) \
-    G(a,8) = pcre2_match_data_create_8(b,c); \
+    G(a,8) = pcre2_match_data_create_8(b,G(c,8)); \
   else if (test_mode == PCRE16_MODE) \
-    G(a,16) = pcre2_match_data_create_16(b,c); \
+    G(a,16) = pcre2_match_data_create_16(b,G(c,16)); \
   else \
-    G(a,32) = pcre2_match_data_create_32(b,c)
+    G(a,32) = pcre2_match_data_create_32(b,G(c,32))
 
 #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \
   if (test_mode == PCRE8_MODE) \
-    G(a,8) = pcre2_match_data_create_from_pattern_8(G(b,8),c); \
+    G(a,8) = pcre2_match_data_create_from_pattern_8(G(b,8),G(c,8)); \
   else if (test_mode == PCRE16_MODE) \
-    G(a,16) = pcre2_match_data_create_from_pattern_16(G(b,16),c); \
+    G(a,16) = pcre2_match_data_create_from_pattern_16(G(b,16),G(c,16)); \
   else \
-    G(a,32) = pcre2_match_data_create_from_pattern_32(G(b,32),c)
+    G(a,32) = pcre2_match_data_create_from_pattern_32(G(b,32),G(c,32))
 
 #define PCRE2_MATCH_DATA_FREE(a) \
   if (test_mode == PCRE8_MODE) \
@@ -1746,11 +1755,17 @@
   else \
     G(pcre2_jit_stack_free_,BITTWO)((G(pcre2_jit_stack_,BITTWO) *)a);
 
-#define PCRE2_MAKETABLES(a) \
+#define PCRE2_MAKETABLES(a,c) \
   if (test_mode == G(G(PCRE,BITONE),_MODE)) \
-    a = G(pcre2_maketables_,BITONE)(NULL); \
+    a = G(pcre2_maketables_,BITONE)(G(c,BITONE)); \
   else \
-    a = G(pcre2_maketables_,BITTWO)(NULL)
+    a = G(pcre2_maketables_,BITTWO)(G(c,BITTWO))
+
+#define PCRE2_MAKETABLES_FREE(c,a) \
+  if (test_mode == G(G(PCRE,BITONE),_MODE)) \
+    G(pcre2_maketables_free_,BITONE)(G(c,BITONE),a); \
+  else \
+    G(pcre2_maketables_free_,BITTWO)(G(c,BITTWO),a)
 
 #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
   if (test_mode == G(G(PCRE,BITONE),_MODE)) \
@@ -1762,15 +1777,15 @@
 
 #define PCRE2_MATCH_DATA_CREATE(a,b,c) \
   if (test_mode == G(G(PCRE,BITONE),_MODE)) \
-    G(a,BITONE) = G(pcre2_match_data_create_,BITONE)(b,c); \
+    G(a,BITONE) = G(pcre2_match_data_create_,BITONE)(b,G(c,BITONE)); \
   else \
-    G(a,BITTWO) = G(pcre2_match_data_create_,BITTWO)(b,c)
+    G(a,BITTWO) = G(pcre2_match_data_create_,BITTWO)(b,G(c,BITTWO))
 
 #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \
   if (test_mode == G(G(PCRE,BITONE),_MODE)) \
-    G(a,BITONE) = G(pcre2_match_data_create_from_pattern_,BITONE)(G(b,BITONE),c); \
+    G(a,BITONE) = G(pcre2_match_data_create_from_pattern_,BITONE)(G(b,BITONE),G(c,BITONE)); \
   else \
-    G(a,BITTWO) = G(pcre2_match_data_create_from_pattern_,BITTWO)(G(b,BITTWO),c)
+    G(a,BITTWO) = G(pcre2_match_data_create_from_pattern_,BITTWO)(G(b,BITTWO),G(c,BITTWO))
 
 #define PCRE2_MATCH_DATA_FREE(a) \
   if (test_mode == G(G(PCRE,BITONE),_MODE)) \
@@ -2067,12 +2082,13 @@
 #define PCRE2_JIT_STACK_ASSIGN(a,b,c) \
   pcre2_jit_stack_assign_8(G(a,8),(pcre2_jit_callback_8)b,c);
 #define PCRE2_JIT_STACK_FREE(a) pcre2_jit_stack_free_8((pcre2_jit_stack_8 *)a);
-#define PCRE2_MAKETABLES(a) a = pcre2_maketables_8(NULL)
+#define PCRE2_MAKETABLES(a,c) a = pcre2_maketables_8(G(c,8))
+#define PCRE2_MAKETABLES_FREE(c,a) pcre2_maketables_free_8(G(c,8),a)
 #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
   a = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h)
-#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,8) = pcre2_match_data_create_8(b,c)
+#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,8) = pcre2_match_data_create_8(b,G(c,8))
 #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \
-  G(a,8) = pcre2_match_data_create_from_pattern_8(G(b,8),c)
+  G(a,8) = pcre2_match_data_create_from_pattern_8(G(b,8),G(c,8))
 #define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_8(G(a,8))
 #define PCRE2_PATTERN_CONVERT(a,b,c,d,e,f,g) a = pcre2_pattern_convert_8(G(b,8),c,d,(PCRE2_UCHAR8 **)e,f,G(g,8))
 #define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_8(G(b,8),c,d)
@@ -2174,12 +2190,13 @@
 #define PCRE2_JIT_STACK_ASSIGN(a,b,c) \
   pcre2_jit_stack_assign_16(G(a,16),(pcre2_jit_callback_16)b,c);
 #define PCRE2_JIT_STACK_FREE(a) pcre2_jit_stack_free_16((pcre2_jit_stack_16 *)a);
-#define PCRE2_MAKETABLES(a) a = pcre2_maketables_16(NULL)
+#define PCRE2_MAKETABLES(a,c) a = pcre2_maketables_16(G(c,16))
+#define PCRE2_MAKETABLES_FREE(c,a) pcre2_maketables_free_16(G(c,16),a)
 #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
   a = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h)
-#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,16) = pcre2_match_data_create_16(b,c)
+#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,16) = pcre2_match_data_create_16(b,G(c,16))
 #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \
-  G(a,16) = pcre2_match_data_create_from_pattern_16(G(b,16),c)
+  G(a,16) = pcre2_match_data_create_from_pattern_16(G(b,16),G(c,16))
 #define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_16(G(a,16))
 #define PCRE2_PATTERN_CONVERT(a,b,c,d,e,f,g) a = pcre2_pattern_convert_16(G(b,16),c,d,(PCRE2_UCHAR16 **)e,f,G(g,16))
 #define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_16(G(b,16),c,d)
@@ -2281,12 +2298,13 @@
 #define PCRE2_JIT_STACK_ASSIGN(a,b,c) \
   pcre2_jit_stack_assign_32(G(a,32),(pcre2_jit_callback_32)b,c);
 #define PCRE2_JIT_STACK_FREE(a) pcre2_jit_stack_free_32((pcre2_jit_stack_32 *)a);
-#define PCRE2_MAKETABLES(a) a = pcre2_maketables_32(NULL)
+#define PCRE2_MAKETABLES(a,c) a = pcre2_maketables_32(G(c,32))
+#define PCRE2_MAKETABLES_FREE(c,a) pcre2_maketables_free_32(G(c,32),a)
 #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
   a = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h)
-#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,32) = pcre2_match_data_create_32(b,c)
+#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,32) = pcre2_match_data_create_32(b,G(c,32))
 #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \
-  G(a,32) = pcre2_match_data_create_from_pattern_32(G(b,32),c)
+  G(a,32) = pcre2_match_data_create_from_pattern_32(G(b,32),G(c,32))
 #define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_32(G(a,32))
 #define PCRE2_PATTERN_CONVERT(a,b,c,d,e,f,g) a = pcre2_pattern_convert_32(G(b,32),c,d,(PCRE2_UCHAR32 **)e,f,G(g,32))
 #define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_32(G(b,32),c,d)
@@ -2780,7 +2798,7 @@
 static void my_free(void *block, void *data)
 {
 (void)data;
-if (show_memory)
+if (show_memory && block != NULL)
   {
   uint32_t i, j;
   BOOL found = FALSE;
@@ -4112,7 +4130,7 @@
 static void
 show_controls(uint32_t controls, uint32_t controls2, const char *before)
 {
-fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
   before,
   ((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
   ((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
@@ -4130,7 +4148,8 @@
   ((controls & CTL_DFA) != 0)? " dfa" : "",
   ((controls & CTL_EXPAND) != 0)? " expand" : "",
   ((controls & CTL_FINDLIMITS) != 0)? " find_limits" : "",
-  ((controls & CTL_FRAMESIZE) != 0)? " framesize" : "",
+  ((controls & CTL_FINDLIMITS_NOHEAP) != 0)? " find_limits_noheap" : "",
+  ((controls2 & CTL2_FRAMESIZE) != 0)? " framesize" : "",
   ((controls & CTL_FULLBINCODE) != 0)? " fullbincode" : "",
   ((controls & CTL_GETALL) != 0)? " getall" : "",
   ((controls & CTL_GLOBAL) != 0)? " global" : "",
@@ -4307,12 +4326,18 @@
 (void)pattern_info(PCRE2_INFO_SIZE, &size, FALSE);
 (void)pattern_info(PCRE2_INFO_NAMECOUNT, &name_count, FALSE);
 (void)pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size, FALSE);
-fprintf(outfile, "Memory allocation (code space): %d\n",
-  (int)(size - name_count*name_entry_size*code_unit_size - cblock_size));
+
+/* The uint32_t variables are cast before multiplying to stop code analyzers
+grumbling about potential overflow. */
+
+fprintf(outfile, "Memory allocation (code space): %" SIZ_FORM "\n", size -
+  (size_t)name_count * (size_t)name_entry_size * (size_t)code_unit_size -
+  cblock_size);
+
 if (pat_patctl.jit != 0)
   {
   (void)pattern_info(PCRE2_INFO_JITSIZE, &size, FALSE);
-  fprintf(outfile, "Memory allocation (JIT code): %d\n", (int)size);
+  fprintf(outfile, "Memory allocation (JIT code): %" SIZ_FORM "\n", size);
   }
 }
 
@@ -4327,7 +4352,7 @@
 {
 size_t frame_size;
 (void)pattern_info(PCRE2_INFO_FRAMESIZE, &frame_size, FALSE);
-fprintf(outfile, "Frame size for pcre2_match(): %d\n", (int)frame_size);
+fprintf(outfile, "Frame size for pcre2_match(): %" SIZ_FORM "\n", frame_size);
 }
 
 
@@ -4749,19 +4774,19 @@
 
   if (pat_patctl.jit != 0 && (pat_patctl.control & CTL_JITVERIFY) != 0)
     {
+#ifdef SUPPORT_JIT
     if (FLD(compiled_code, executable_jit) != NULL)
       fprintf(outfile, "JIT compilation was successful\n");
     else
       {
-#ifdef SUPPORT_JIT
       fprintf(outfile, "JIT compilation was not successful");
       if (jitrc != 0 && !print_error_message(jitrc, " (", ")"))
         return PR_ABEND;
       fprintf(outfile, "\n");
+      }
 #else
       fprintf(outfile, "JIT support is not available in this version of PCRE2\n");
 #endif
-      }
     }
   }
 
@@ -4980,7 +5005,7 @@
     PCRE2_JIT_COMPILE(jitrc, compiled_code, pat_patctl.jit);
     }
   if ((pat_patctl.control & CTL_MEMORY) != 0) show_memory_info();
-  if ((pat_patctl.control & CTL_FRAMESIZE) != 0) show_framesize();
+  if ((pat_patctl.control2 & CTL2_FRAMESIZE) != 0) show_framesize();
   if ((pat_patctl.control & CTL_ANYINFO) != 0)
     {
     rc = show_pattern_info();
@@ -5426,8 +5451,11 @@
   if (strcmp((const char *)pat_patctl.locale, (const char *)locale_name) != 0)
     {
     strcpy((char *)locale_name, (char *)pat_patctl.locale);
-    if (locale_tables != NULL) free((void *)locale_tables);
-    PCRE2_MAKETABLES(locale_tables);
+    if (locale_tables != NULL)
+      {
+      PCRE2_MAKETABLES_FREE(general_context, (void *)locale_tables);
+      }
+    PCRE2_MAKETABLES(locale_tables, general_context);
     }
   use_tables = locale_tables;
   }
@@ -5942,7 +5970,7 @@
 /* Output code size and other information if requested. */
 
 if ((pat_patctl.control & CTL_MEMORY) != 0) show_memory_info();
-if ((pat_patctl.control & CTL_FRAMESIZE) != 0) show_framesize();
+if ((pat_patctl.control2 & CTL2_FRAMESIZE) != 0) show_framesize();
 if ((pat_patctl.control & CTL_ANYINFO) != 0)
   {
   int rc = show_pattern_info();
@@ -6021,10 +6049,46 @@
   {
   uint32_t stack_start = 0;
 
+  /* If we are checking the heap limit, free any frames vector that is cached
+  in the match_data so we always start without one. */
+
   if (errnumber == PCRE2_ERROR_HEAPLIMIT)
     {
     PCRE2_SET_HEAP_LIMIT(dat_context, mid);
+
+#ifdef SUPPORT_PCRE2_8
+    if (code_unit_size == 1)
+      {
+      match_data8->memctl.free(match_data8->heapframes,
+        match_data8->memctl.memory_data);
+      match_data8->heapframes = NULL;
+      match_data8->heapframes_size = 0;
+      }
+#endif
+
+#ifdef SUPPORT_PCRE2_16
+    if (code_unit_size == 2)
+      {
+      match_data16->memctl.free(match_data16->heapframes,
+        match_data16->memctl.memory_data);
+      match_data16->heapframes = NULL;
+      match_data16->heapframes_size = 0;
+      }
+#endif
+
+#ifdef SUPPORT_PCRE2_32
+    if (code_unit_size == 4)
+      {
+      match_data32->memctl.free(match_data32->heapframes,
+        match_data32->memctl.memory_data);
+      match_data32->heapframes = NULL;
+      match_data32->heapframes_size = 0;
+      }
+#endif
     }
+
+  /* No need to mess with the frames vector for match or depth limits. */
+
   else if (errnumber == PCRE2_ERROR_MATCHLIMIT)
     {
     PCRE2_SET_MATCH_LIMIT(dat_context, mid);
@@ -6034,6 +6098,8 @@
     PCRE2_SET_DEPTH_LIMIT(dat_context, mid);
     }
 
+  /* Do the appropriate match */
+
   if ((dat_datctl.control & CTL_DFA) != 0)
     {
     stack_start = DFA_START_RWS_SIZE/1024;
@@ -6052,7 +6118,6 @@
 
   else
     {
-    stack_start = START_FRAMES_SIZE/1024;
     PCRE2_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset,
       dat_datctl.options, match_data, PTR(dat_context));
     }
@@ -6757,8 +6822,6 @@
     {
     long li;
     char *endptr;
-    size_t qoffset = CAST8VAR(q) - dbuffer;
-    size_t rep_offset = start_rep - dbuffer;
 
     if (*p++ != '{')
       {
@@ -6781,9 +6844,9 @@
       }
 
     i = (int32_t)li;
-    if (i-- == 0)
+    if (i-- <= 0)
       {
-      fprintf(outfile, "** Zero repeat not allowed\n");
+      fprintf(outfile, "** Zero or negative repeat not allowed\n");
       return PR_OK;
       }
 
@@ -6792,6 +6855,8 @@
 
     if (needlen >= dbuffer_size)
       {
+      size_t qoffset = CAST8VAR(q) - dbuffer;
+      size_t rep_offset = start_rep - dbuffer;
       while (needlen >= dbuffer_size) dbuffer_size *= 2;
       dbuffer = (uint8_t *)realloc(dbuffer, dbuffer_size);
       if (dbuffer == NULL)
@@ -7270,7 +7335,8 @@
 if (dat_datctl.oveccount == 0)
   {
   PCRE2_MATCH_DATA_FREE(match_data);
-  PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(match_data, compiled_code, NULL);
+  PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(match_data, compiled_code,
+    general_context);
   PCRE2_GET_OVECTOR_COUNT(max_oveccount, match_data);
   }
 else if (dat_datctl.oveccount <= max_oveccount)
@@ -7281,7 +7347,7 @@
   {
   max_oveccount = dat_datctl.oveccount;
   PCRE2_MATCH_DATA_FREE(match_data);
-  PCRE2_MATCH_DATA_CREATE(match_data, max_oveccount, NULL);
+  PCRE2_MATCH_DATA_CREATE(match_data, max_oveccount, general_context);
   }
 
 if (CASTVAR(void *, match_data) == NULL)
@@ -7578,12 +7644,13 @@
   limits are not relevant for JIT. The return from check_match_limit() is the
   return from the final call to pcre2_match() or pcre2_dfa_match(). */
 
-  if ((dat_datctl.control & CTL_FINDLIMITS) != 0)
+  if ((dat_datctl.control & (CTL_FINDLIMITS|CTL_FINDLIMITS_NOHEAP)) != 0)
     {
     capcount = 0;  /* This stops compiler warnings */
 
-    if (FLD(compiled_code, executable_jit) == NULL ||
-          (dat_datctl.options & PCRE2_NO_JIT) != 0)
+    if ((dat_datctl.control & CTL_FINDLIMITS_NOHEAP) == 0 &&
+        (FLD(compiled_code, executable_jit) == NULL ||
+          (dat_datctl.options & PCRE2_NO_JIT) != 0))
       {
       (void)check_match_limit(pp, arg_ulen, PCRE2_ERROR_HEAPLIMIT, "heap");
       }
@@ -8935,9 +9002,13 @@
     if (rlim.rlim_cur > rlim.rlim_max)
       {
       fprintf(stderr,
-        "pcre2test: requested stack size %luMiB is greater than hard limit "
-          "%luMiB\n", (unsigned long int)stack_size,
-          (unsigned long int)(rlim.rlim_max));
+        "pcre2test: requested stack size %luMiB is greater than hard limit ",
+          (unsigned long int)stack_size);
+      if (rlim.rlim_max % (1024*1024) == 0) fprintf(stderr, "%luMiB\n",
+        (unsigned long int)(rlim.rlim_max/(1024 * 1024)));
+      else if (rlim.rlim_max % 1024 == 0) fprintf(stderr, "%luKiB\n",
+        (unsigned long int)(rlim.rlim_max/1024));
+      else fprintf(stderr, "%lu bytes\n", (unsigned long int)(rlim.rlim_max));
       exit(1);
       }
     rc = setrlimit(RLIMIT_STACK, &rlim);
@@ -9170,7 +9241,6 @@
   (void)G(pcre2_set_compile_extra_options_,BITS)(G(pat_context,BITS), 0); \
   (void)G(pcre2_set_max_pattern_length_,BITS)(G(pat_context,BITS), 0); \
   (void)G(pcre2_set_offset_limit_,BITS)(G(dat_context,BITS), 0); \
-  (void)G(pcre2_set_recursion_memory_management_,BITS)(G(dat_context,BITS), my_malloc, my_free, NULL); \
   (void)G(pcre2_get_match_data_size_,BITS)(G(match_data,BITS))
 
 
@@ -9393,8 +9463,8 @@
 free(dbuffer);
 free(pbuffer8);
 free(dfa_workspace);
-free((void *)locale_tables);
 free(tables3);
+PCRE2_MAKETABLES_FREE(general_context, (void *)locale_tables);
 PCRE2_MATCH_DATA_FREE(match_data);
 SUB1(pcre2_code_free, compiled_code);
 
diff --git a/src/sljit/sljitConfig.h b/src/sljit/sljitConfig.h
index 1c821d2..5fba7aa 100644
--- a/src/sljit/sljitConfig.h
+++ b/src/sljit/sljitConfig.h
@@ -53,7 +53,8 @@
 /* #define SLJIT_CONFIG_PPC_64 1 */
 /* #define SLJIT_CONFIG_MIPS_32 1 */
 /* #define SLJIT_CONFIG_MIPS_64 1 */
-/* #define SLJIT_CONFIG_SPARC_32 1 */
+/* #define SLJIT_CONFIG_RISCV_32 1 */
+/* #define SLJIT_CONFIG_RISCV_64 1 */
 /* #define SLJIT_CONFIG_S390X 1 */
 
 /* #define SLJIT_CONFIG_AUTO 1 */
@@ -127,17 +128,6 @@
 
 #endif /* !SLJIT_EXECUTABLE_ALLOCATOR */
 
-/* Force cdecl calling convention even if a better calling
-   convention (e.g. fastcall) is supported by the C compiler.
-   If this option is disabled (this is the default), functions
-   called from JIT should be defined with SLJIT_FUNC attribute.
-   Standard C functions can still be called by using the
-   SLJIT_CALL_CDECL jump type. */
-#ifndef SLJIT_USE_CDECL_CALLING_CONVENTION
-/* Disabled by default */
-#define SLJIT_USE_CDECL_CALLING_CONVENTION 0
-#endif
-
 /* Return with error when an invalid argument is passed. */
 #ifndef SLJIT_ARGUMENT_CHECKS
 /* Disabled by default */
diff --git a/src/sljit/sljitConfigInternal.h b/src/sljit/sljitConfigInternal.h
index 55e4e39..cd3ce69 100644
--- a/src/sljit/sljitConfigInternal.h
+++ b/src/sljit/sljitConfigInternal.h
@@ -59,7 +59,8 @@
      SLJIT_64BIT_ARCHITECTURE : 64 bit architecture
      SLJIT_LITTLE_ENDIAN : little endian architecture
      SLJIT_BIG_ENDIAN : big endian architecture
-     SLJIT_UNALIGNED : allows unaligned memory accesses for non-fpu operations (only!)
+     SLJIT_UNALIGNED : unaligned memory accesses for non-fpu operations are supported
+     SLJIT_FPU_UNALIGNED : unaligned memory accesses for fpu operations are supported
      SLJIT_INDIRECT_CALL : see SLJIT_FUNC_ADDR() for more information
 
    Constants:
@@ -98,7 +99,8 @@
 	+ (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
 	+ (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) \
 	+ (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) \
-	+ (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32) \
+	+ (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) \
+	+ (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) \
 	+ (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
 	+ (defined SLJIT_CONFIG_AUTO && SLJIT_CONFIG_AUTO) \
 	+ (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED) >= 2
@@ -115,7 +117,8 @@
 	&& !(defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
 	&& !(defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) \
 	&& !(defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) \
-	&& !(defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32) \
+	&& !(defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) \
+	&& !(defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) \
 	&& !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
 	&& !(defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED) \
 	&& !(defined SLJIT_CONFIG_AUTO && SLJIT_CONFIG_AUTO)
@@ -156,8 +159,10 @@
 #define SLJIT_CONFIG_MIPS_32 1
 #elif defined(__mips64)
 #define SLJIT_CONFIG_MIPS_64 1
-#elif (defined(__sparc__) || defined(__sparc)) && !defined(_LP64)
-#define SLJIT_CONFIG_SPARC_32 1
+#elif defined (__riscv_xlen) && (__riscv_xlen == 32)
+#define SLJIT_CONFIG_RISCV_32 1
+#elif defined (__riscv_xlen) && (__riscv_xlen == 64)
+#define SLJIT_CONFIG_RISCV_64 1
 #elif defined(__s390x__)
 #define SLJIT_CONFIG_S390X 1
 #else
@@ -205,8 +210,8 @@
 #define SLJIT_CONFIG_PPC 1
 #elif (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) || (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 #define SLJIT_CONFIG_MIPS 1
-#elif (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32) || (defined SLJIT_CONFIG_SPARC_64 && SLJIT_CONFIG_SPARC_64)
-#define SLJIT_CONFIG_SPARC 1
+#elif (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32) || (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+#define SLJIT_CONFIG_RISCV 1
 #endif
 
 /***********************************************************/
@@ -330,8 +335,14 @@
  * older versions are known to abort in some targets
  * https://github.com/PhilipHazel/pcre2/issues/92
  *
- * beware APPLE is known to have removed the code in iOS so
- * it will need to be excempted or result in broken builds
+ * beware some vendors (ex: Microsoft, Apple) are known to have
+ * removed the code to support this builtin even if the call for
+ * __has_builtin reports it is available.
+ *
+ * make sure linking doesn't fail because __clear_cache() is
+ * missing before changing it or add an exception so that the
+ * system provided method that should be defined below is used
+ * instead.
  */
 #if (!defined SLJIT_CACHE_FLUSH && defined __has_builtin)
 #if __has_builtin(__builtin___clear_cache) && !defined(__clang__)
@@ -339,9 +350,9 @@
 /*
  * https://gcc.gnu.org/bugzilla//show_bug.cgi?id=91248
  * https://gcc.gnu.org/bugzilla//show_bug.cgi?id=93811
- * gcc's clear_cache builtin for power and sparc are broken
+ * gcc's clear_cache builtin for power is broken
  */
-#if !defined(SLJIT_CONFIG_PPC) && !defined(SLJIT_CONFIG_SPARC_32)
+#if !defined(SLJIT_CONFIG_PPC)
 #define SLJIT_CACHE_FLUSH(from, to) \
 	__builtin___clear_cache((char*)(from), (char*)(to))
 #endif
@@ -373,12 +384,10 @@
 	ppc_cache_flush((from), (to))
 #define SLJIT_CACHE_FLUSH_OWN_IMPL 1
 
-#elif (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
+#elif defined(_WIN32)
 
-/* The __clear_cache() implementation of GCC is a dummy function on Sparc. */
 #define SLJIT_CACHE_FLUSH(from, to) \
-	sparc_cache_flush((from), (to))
-#define SLJIT_CACHE_FLUSH_OWN_IMPL 1
+	FlushInstructionCache(GetCurrentProcess(), (void*)(from), (char*)(to) - (char*)(from))
 
 #elif (defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) || defined(__clang__)
 
@@ -392,11 +401,6 @@
 #define SLJIT_CACHE_FLUSH(from, to) \
 	cacheflush((long)(from), (long)(to), 0)
 
-#elif defined _WIN32
-
-#define SLJIT_CACHE_FLUSH(from, to) \
-	FlushInstructionCache(GetCurrentProcess(), (void*)(from), (char*)(to) - (char*)(from))
-
 #else
 
 /* Call __ARM_NR_cacheflush on ARM-Linux or the corresponding MIPS syscall. */
@@ -435,6 +439,7 @@
 	&& !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
 	&& !(defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
 	&& !(defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) \
+	&& !(defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64) \
 	&& !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 #define SLJIT_32BIT_ARCHITECTURE 1
 #define SLJIT_WORD_SHIFT 2
@@ -495,8 +500,7 @@
 #if !defined(SLJIT_BIG_ENDIAN) && !defined(SLJIT_LITTLE_ENDIAN)
 
 /* These macros are mostly useful for the applications. */
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) \
-	|| (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+#if (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
 
 #ifdef __LITTLE_ENDIAN__
 #define SLJIT_LITTLE_ENDIAN 1
@@ -504,8 +508,7 @@
 #define SLJIT_BIG_ENDIAN 1
 #endif
 
-#elif (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) \
-	|| (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+#elif (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
 
 #ifdef __MIPSEL__
 #define SLJIT_LITTLE_ENDIAN 1
@@ -532,8 +535,7 @@
 
 #endif /* !SLJIT_MIPS_REV */
 
-#elif (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32) \
-	|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
+#elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
 #define SLJIT_BIG_ENDIAN 1
 
@@ -554,19 +556,30 @@
 
 #ifndef SLJIT_UNALIGNED
 
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) \
-	|| (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) \
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
 	|| (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) \
 	|| (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2) \
 	|| (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
-	|| (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) \
-	|| (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) \
+	|| (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) \
+	|| (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \
 	|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 #define SLJIT_UNALIGNED 1
 #endif
 
 #endif /* !SLJIT_UNALIGNED */
 
+#ifndef SLJIT_FPU_UNALIGNED
+
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
+	|| (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+	|| (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) \
+	|| (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \
+	|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
+#define SLJIT_FPU_UNALIGNED 1
+#endif
+
+#endif /* !SLJIT_FPU_UNALIGNED */
+
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 /* Auto detect SSE2 support using CPUID.
    On 64 bit x86 cpus, sse2 must be present. */
@@ -578,38 +591,7 @@
 /*****************************************************************************************/
 
 #ifndef SLJIT_FUNC
-
-#if (defined SLJIT_USE_CDECL_CALLING_CONVENTION && SLJIT_USE_CDECL_CALLING_CONVENTION) \
-	|| !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-
 #define SLJIT_FUNC
-
-#elif defined(__GNUC__) && !defined(__APPLE__)
-
-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-#define SLJIT_FUNC __attribute__ ((fastcall))
-#define SLJIT_X86_32_FASTCALL 1
-#else
-#define SLJIT_FUNC
-#endif /* gcc >= 3.4 */
-
-#elif defined(_MSC_VER)
-
-#define SLJIT_FUNC __fastcall
-#define SLJIT_X86_32_FASTCALL 1
-
-#elif defined(__BORLANDC__)
-
-#define SLJIT_FUNC __msfastcall
-#define SLJIT_X86_32_FASTCALL 1
-
-#else /* Unknown compiler. */
-
-/* The cdecl calling convention is usually the x86 default. */
-#define SLJIT_FUNC
-
-#endif /* SLJIT_USE_CDECL_CALLING_CONVENTION */
-
 #endif /* !SLJIT_FUNC */
 
 #ifndef SLJIT_INDIRECT_CALL
@@ -621,14 +603,10 @@
 #endif
 #endif /* SLJIT_INDIRECT_CALL */
 
-/* The offset which needs to be substracted from the return address to
+/* The offset which needs to be subtracted from the return address to
 determine the next executed instruction after return. */
 #ifndef SLJIT_RETURN_ADDRESS_OFFSET
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-#define SLJIT_RETURN_ADDRESS_OFFSET 8
-#else
 #define SLJIT_RETURN_ADDRESS_OFFSET 0
-#endif
 #endif /* SLJIT_RETURN_ADDRESS_OFFSET */
 
 /***************************************************/
@@ -666,10 +644,10 @@
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 
 #define SLJIT_NUMBER_OF_REGISTERS 12
-#define SLJIT_NUMBER_OF_SAVED_REGISTERS 9
+#define SLJIT_NUMBER_OF_SAVED_REGISTERS 7
 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 7
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0
-#define SLJIT_LOCALS_OFFSET_BASE (compiler->locals_offset)
+#define SLJIT_LOCALS_OFFSET_BASE (8 * SSIZE_OF(sw))
 #define SLJIT_PREF_SHIFT_REG SLJIT_R2
 
 #elif (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -683,7 +661,7 @@
 #else /* _WIN64 */
 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 8
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 10
-#define SLJIT_LOCALS_OFFSET_BASE (4 * (sljit_s32)sizeof(sljit_sw))
+#define SLJIT_LOCALS_OFFSET_BASE (4 * SSIZE_OF(sw))
 #endif /* !_WIN64 */
 #define SLJIT_PREF_SHIFT_REG SLJIT_R3
 
@@ -740,17 +718,13 @@
 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
 #endif
 
-#elif (defined SLJIT_CONFIG_SPARC && SLJIT_CONFIG_SPARC)
+#elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
 
-#define SLJIT_NUMBER_OF_REGISTERS 18
-#define SLJIT_NUMBER_OF_SAVED_REGISTERS 14
-#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 14
-#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-/* saved registers (16), return struct pointer (1), space for 6 argument words (1),
-   4th double arg (2), double alignment (1). */
-#define SLJIT_LOCALS_OFFSET_BASE ((16 + 1 + 6 + 2 + 1) * (sljit_s32)sizeof(sljit_sw))
-#endif
+#define SLJIT_NUMBER_OF_REGISTERS 23
+#define SLJIT_NUMBER_OF_SAVED_REGISTERS 12
+#define SLJIT_LOCALS_OFFSET_BASE 0
+#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
+#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12
 
 #elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
@@ -806,7 +780,7 @@
 #if (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \
 	|| (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) \
 	|| (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
-	|| (defined SLJIT_CONFIG_SPARC && SLJIT_CONFIG_SPARC) \
+	|| (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \
 	|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 #define SLJIT_HAS_STATUS_FLAGS_STATE 1
 #endif
diff --git a/src/sljit/sljitLir.c b/src/sljit/sljitLir.c
index 313a061..abafe1a 100644
--- a/src/sljit/sljitLir.c
+++ b/src/sljit/sljitLir.c
@@ -133,6 +133,14 @@
 #define SLJIT_ARG_MASK		0x7
 #define SLJIT_ARG_FULL_MASK	(SLJIT_ARG_MASK | SLJIT_ARG_TYPE_SCRATCH_REG)
 
+/* Mask for sljit_emit_mem. */
+#define REG_PAIR_MASK		0xff00
+#define REG_PAIR_FIRST(reg)	((reg) & 0xff)
+#define REG_PAIR_SECOND(reg)	((reg) >> 8)
+
+/* Mask for sljit_emit_enter. */
+#define SLJIT_KEPT_SAVEDS_COUNT(options) ((options) & 0x3)
+
 /* Jump flags. */
 #define JUMP_LABEL	0x1
 #define JUMP_ADDR	0x2
@@ -145,16 +153,16 @@
 #	define PATCH_MD		0x10
 #endif
 #	define TYPE_SHIFT	13
-#endif
+#endif /* SLJIT_CONFIG_X86 */
 
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5) || (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
 #	define IS_BL		0x4
 #	define PATCH_B		0x8
-#endif
+#endif /* SLJIT_CONFIG_ARM_V5 || SLJIT_CONFIG_ARM_V7 */
 
 #if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
 #	define CPOOL_SIZE	512
-#endif
+#endif /* SLJIT_CONFIG_ARM_V5 */
 
 #if (defined SLJIT_CONFIG_ARM_THUMB2 && SLJIT_CONFIG_ARM_THUMB2)
 #	define IS_COND		0x04
@@ -172,7 +180,7 @@
 	/* BL + imm24 */
 #	define PATCH_BL		0x60
 	/* 0xf00 cc code for branches */
-#endif
+#endif /* SLJIT_CONFIG_ARM_THUMB2 */
 
 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
 #	define IS_COND		0x004
@@ -182,7 +190,7 @@
 #	define PATCH_COND	0x040
 #	define PATCH_ABS48	0x080
 #	define PATCH_ABS64	0x100
-#endif
+#endif /* SLJIT_CONFIG_ARM_64 */
 
 #if (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
 #	define IS_COND		0x004
@@ -192,9 +200,9 @@
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 #	define PATCH_ABS32	0x040
 #	define PATCH_ABS48	0x080
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 #	define REMOVE_COND	0x100
-#endif
+#endif /* SLJIT_CONFIG_PPC */
 
 #if (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
 #	define IS_MOVABLE	0x004
@@ -212,7 +220,7 @@
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 #	define PATCH_ABS32	0x400
 #	define PATCH_ABS48	0x800
-#endif
+#endif /* SLJIT_CONFIG_MIPS_64 */
 
 	/* instruction types */
 #	define MOVABLE_INS	0
@@ -221,28 +229,24 @@
 #	define UNMOVABLE_INS	32
 	/* FPU status register */
 #	define FCSR_FCC		33
-#endif
+#endif /* SLJIT_CONFIG_MIPS */
 
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-#	define IS_MOVABLE	0x04
-#	define IS_COND		0x08
-#	define IS_CALL		0x10
+#if (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
+#	define IS_COND		0x004
+#	define IS_CALL		0x008
 
-#	define PATCH_B		0x20
-#	define PATCH_CALL	0x40
+#	define PATCH_B		0x010
+#	define PATCH_J		0x020
 
-	/* instruction types */
-#	define MOVABLE_INS	0
-	/* 1 - 31 last destination register */
-	/* no destination (i.e: store) */
-#	define UNMOVABLE_INS	32
-
-#	define DST_INS_MASK	0xff
-
-	/* ICC_SET is the same as SET_FLAGS. */
-#	define ICC_IS_SET	(1 << 23)
-#	define FCC_IS_SET	(1 << 24)
-#endif
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+#	define PATCH_REL32	0x040
+#	define PATCH_ABS32	0x080
+#	define PATCH_ABS44	0x100
+#	define PATCH_ABS52	0x200
+#else /* !SLJIT_CONFIG_RISCV_64 */
+#	define PATCH_REL32	0x0
+#endif /* SLJIT_CONFIG_RISCV_64 */
+#endif /* SLJIT_CONFIG_RISCV */
 
 /* Stack management. */
 
@@ -385,7 +389,7 @@
 		invalid_integer_types);
 	SLJIT_COMPILE_ASSERT(SLJIT_REWRITABLE_JUMP != SLJIT_32,
 		rewritable_jump_and_single_op_must_not_be_the_same);
-	SLJIT_COMPILE_ASSERT(!(SLJIT_EQUAL & 0x1) && !(SLJIT_LESS & 0x1) && !(SLJIT_EQUAL_F64 & 0x1) && !(SLJIT_JUMP & 0x1),
+	SLJIT_COMPILE_ASSERT(!(SLJIT_EQUAL & 0x1) && !(SLJIT_LESS & 0x1) && !(SLJIT_F_EQUAL & 0x1) && !(SLJIT_JUMP & 0x1),
 		conditional_flags_must_be_even_numbers);
 
 	/* Only the non-zero members must be set. */
@@ -437,10 +441,6 @@
 	compiler->delay_slot = UNMOVABLE_INS;
 #endif
 
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-	compiler->delay_slot = UNMOVABLE_INS;
-#endif
-
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) \
 		|| (defined SLJIT_DEBUG && SLJIT_DEBUG)
 	compiler->last_flags = 0;
@@ -822,6 +822,9 @@
 	if (!(p & SLJIT_MEM))
 		return 0;
 
+	if (p == SLJIT_MEM1(SLJIT_SP))
+		return (i >= 0 && i < compiler->logical_local_size);
+
 	if (!(!(p & REG_MASK) || FUNCTION_CHECK_IS_REG(p & REG_MASK)))
 		return 0;
 
@@ -859,9 +862,6 @@
 	if (p == SLJIT_IMM)
 		return 1;
 
-	if (p == SLJIT_MEM1(SLJIT_SP))
-		return (i >= 0 && i < compiler->logical_local_size);
-
 	return function_check_src_mem(compiler, p, i);
 }
 
@@ -876,9 +876,6 @@
 	if (FUNCTION_CHECK_IS_REG(p))
 		return (i == 0);
 
-	if (p == SLJIT_MEM1(SLJIT_SP))
-		return (i >= 0 && i < compiler->logical_local_size);
-
 	return function_check_src_mem(compiler, p, i);
 }
 
@@ -893,9 +890,6 @@
 	if (FUNCTION_CHECK_IS_FREG(p))
 		return (i == 0);
 
-	if (p == SLJIT_MEM1(SLJIT_SP))
-		return (i >= 0 && i < compiler->logical_local_size);
-
 	return function_check_src_mem(compiler, p, i);
 }
 
@@ -913,7 +907,11 @@
 
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 #ifdef _WIN64
+#ifdef __GNUC__
+#	define SLJIT_PRINT_D	"ll"
+#else
 #	define SLJIT_PRINT_D	"I64"
+#endif
 #else
 #	define SLJIT_PRINT_D	"l"
 #endif
@@ -995,13 +993,14 @@
 static const char* op1_names[] = {
 	"", ".u8", ".s8", ".u16",
 	".s16", ".u32", ".s32", "32",
-	".p", "not", "clz",
+	".p", "not", "clz", "ctz"
 };
 
 static const char* op2_names[] = {
 	"add", "addc", "sub", "subc",
 	"mul", "and", "or", "xor",
-	"shl", "lshr", "ashr",
+	"shl", "mshl", "lshr", "mlshr",
+	"ashr", "mashr", "rotl", "rotr"
 };
 
 static const char* op_src_names[] = {
@@ -1020,10 +1019,6 @@
 	"add", "sub", "mul", "div"
 };
 
-#define JUMP_POSTFIX(type) \
-	((type & 0xff) <= SLJIT_NOT_OVERFLOW ? ((type & SLJIT_32) ? "32" : "") \
-	: ((type & 0xff) <= SLJIT_ORDERED_F64 ? ((type & SLJIT_32) ? ".f32" : ".f64") : ""))
-
 static const char* jump_names[] = {
 	"equal", "not_equal",
 	"less", "greater_equal",
@@ -1032,12 +1027,18 @@
 	"sig_greater", "sig_less_equal",
 	"overflow", "not_overflow",
 	"carry", "",
-	"equal", "not_equal",
-	"less", "greater_equal",
-	"greater", "less_equal",
+	"f_equal", "f_not_equal",
+	"f_less", "f_greater_equal",
+	"f_greater", "f_less_equal",
 	"unordered", "ordered",
+	"ordered_equal", "unordered_or_not_equal",
+	"ordered_less", "unordered_or_greater_equal",
+	"ordered_greater", "unordered_or_less_equal",
+	"unordered_or_equal", "ordered_not_equal",
+	"unordered_or_less", "ordered_greater_equal",
+	"unordered_or_greater", "ordered_less_equal",
 	"jump", "fast_call",
-	"call", "call.cdecl"
+	"call", "call_reg_arg"
 };
 
 static const char* call_arg_names[] = {
@@ -1053,6 +1054,8 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) \
 	|| (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 
+#define SLJIT_SKIP_CHECKS(compiler) (compiler)->skip_checks = 1
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_generate_code(struct sljit_compiler *compiler)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
@@ -1080,7 +1083,12 @@
 	SLJIT_UNUSED_ARG(compiler);
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(!(options & ~SLJIT_ENTER_CDECL));
+	if (options & SLJIT_ENTER_REG_ARG) {
+		CHECK_ARGUMENT(!(options & ~(0x3 | SLJIT_ENTER_REG_ARG)));
+	} else {
+		CHECK_ARGUMENT(options == 0);
+	}
+	CHECK_ARGUMENT(SLJIT_KEPT_SAVEDS_COUNT(options) <= 3 && SLJIT_KEPT_SAVEDS_COUNT(options) <= saveds);
 	CHECK_ARGUMENT(scratches >= 0 && scratches <= SLJIT_NUMBER_OF_REGISTERS);
 	CHECK_ARGUMENT(saveds >= 0 && saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS);
 	CHECK_ARGUMENT(scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS);
@@ -1088,8 +1096,8 @@
 	CHECK_ARGUMENT(fsaveds >= 0 && fsaveds <= SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS);
 	CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
 	CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE);
-	CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) < SLJIT_ARG_TYPE_F64);
-	CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, saveds, fscratches));
+	CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) <= SLJIT_ARG_TYPE_F32);
+	CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, (options & SLJIT_ENTER_REG_ARG) ? 0 : saveds, fscratches));
 
 	compiler->last_flags = 0;
 #endif
@@ -1109,8 +1117,16 @@
 			} while (arg_types);
 		}
 
-		fprintf(compiler->verbose, "],%s scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n",
-			(options & SLJIT_ENTER_CDECL) ? " enter:cdecl," : "",
+		fprintf(compiler->verbose, "],");
+
+		if (options & SLJIT_ENTER_REG_ARG) {
+			fprintf(compiler->verbose, " enter:reg_arg,");
+
+			if (SLJIT_KEPT_SAVEDS_COUNT(options) > 0)
+				fprintf(compiler->verbose, " keep:%d,", SLJIT_KEPT_SAVEDS_COUNT(options));
+		}
+
+		fprintf(compiler->verbose, "scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n",
 			scratches, saveds, fscratches, fsaveds, local_size);
 	}
 #endif
@@ -1124,7 +1140,12 @@
 	SLJIT_UNUSED_ARG(compiler);
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(!(options & ~SLJIT_ENTER_CDECL));
+	if (options & SLJIT_ENTER_REG_ARG) {
+		CHECK_ARGUMENT(!(options & ~(0x3 | SLJIT_ENTER_REG_ARG)));
+	} else {
+		CHECK_ARGUMENT(options == 0);
+	}
+	CHECK_ARGUMENT(SLJIT_KEPT_SAVEDS_COUNT(options) <= 3 && SLJIT_KEPT_SAVEDS_COUNT(options) <= saveds);
 	CHECK_ARGUMENT(scratches >= 0 && scratches <= SLJIT_NUMBER_OF_REGISTERS);
 	CHECK_ARGUMENT(saveds >= 0 && saveds <= SLJIT_NUMBER_OF_SAVED_REGISTERS);
 	CHECK_ARGUMENT(scratches + saveds <= SLJIT_NUMBER_OF_REGISTERS);
@@ -1133,7 +1154,7 @@
 	CHECK_ARGUMENT(fscratches + fsaveds <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
 	CHECK_ARGUMENT(local_size >= 0 && local_size <= SLJIT_MAX_LOCAL_SIZE);
 	CHECK_ARGUMENT((arg_types & SLJIT_ARG_FULL_MASK) < SLJIT_ARG_TYPE_F64);
-	CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, saveds, fscratches));
+	CHECK_ARGUMENT(function_check_arguments(arg_types, scratches, (options & SLJIT_ENTER_REG_ARG) ? 0 : saveds, fscratches));
 
 	compiler->last_flags = 0;
 #endif
@@ -1153,8 +1174,16 @@
 			} while (arg_types);
 		}
 
-		fprintf(compiler->verbose, "],%s scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n",
-			(options & SLJIT_ENTER_CDECL) ? " enter:cdecl," : "",
+		fprintf(compiler->verbose, "],");
+
+		if (options & SLJIT_ENTER_REG_ARG) {
+			fprintf(compiler->verbose, " enter:reg_arg,");
+
+			if (SLJIT_KEPT_SAVEDS_COUNT(options) > 0)
+				fprintf(compiler->verbose, " keep:%d,", SLJIT_KEPT_SAVEDS_COUNT(options));
+		}
+
+		fprintf(compiler->verbose, " scratches:%d, saveds:%d, fscratches:%d, fsaveds:%d, local_size:%d\n",
 			scratches, saveds, fscratches, fsaveds, local_size);
 	}
 #endif
@@ -1195,18 +1224,52 @@
 	case SLJIT_ARG_TYPE_P:
 		CHECK_ARGUMENT(op == SLJIT_MOV_P);
 		break;
+	case SLJIT_ARG_TYPE_F64:
+		CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
+		CHECK_ARGUMENT(op == SLJIT_MOV_F64);
+		break;
+	case SLJIT_ARG_TYPE_F32:
+		CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
+		CHECK_ARGUMENT(op == SLJIT_MOV_F32);
+		break;
 	default:
 		/* Context not initialized, void, etc. */
 		CHECK_ARGUMENT(0);
 		break;
 	}
-	FUNCTION_CHECK_SRC(src, srcw);
+
+	if (GET_OPCODE(op) < SLJIT_MOV_F64) {
+		FUNCTION_CHECK_SRC(src, srcw);
+	} else {
+		FUNCTION_FCHECK(src, srcw);
+	}
 	compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  return%s%s ", !(op & SLJIT_32) ? "" : "32",
-			op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE]);
+		if (GET_OPCODE(op) < SLJIT_MOV_F64) {
+			fprintf(compiler->verbose, "  return%s%s ", !(op & SLJIT_32) ? "" : "32",
+				op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE]);
+			sljit_verbose_param(compiler, src, srcw);
+		} else {
+			fprintf(compiler->verbose, "  return%s ", !(op & SLJIT_32) ? ".f64" : ".f32");
+			sljit_verbose_fparam(compiler, src, srcw);
+		}
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	FUNCTION_CHECK_SRC(src, srcw);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		fprintf(compiler->verbose, "  return_to ");
 		sljit_verbose_param(compiler, src, srcw);
 		fprintf(compiler->verbose, "\n");
 	}
@@ -1263,7 +1326,7 @@
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_CLZ);
+	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_CTZ);
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_NOT:
@@ -1324,15 +1387,18 @@
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD && GET_OPCODE(op) <= SLJIT_ASHR);
+	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD && GET_OPCODE(op) <= SLJIT_ROTR);
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_AND:
 	case SLJIT_OR:
 	case SLJIT_XOR:
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK));
 		break;
 	case SLJIT_MUL:
@@ -1357,6 +1423,10 @@
 		CHECK_ARGUMENT((compiler->last_flags & 0xff) == GET_FLAG_TYPE(SLJIT_SET_CARRY));
 		CHECK_ARGUMENT((op & SLJIT_32) == (compiler->last_flags & SLJIT_32));
 		break;
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
+		break;
 	default:
 		SLJIT_UNREACHABLE();
 		break;
@@ -1390,6 +1460,35 @@
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_LSHR
+		|| GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR);
+	CHECK_ARGUMENT((op & ~(0xff | SLJIT_32 | SLJIT_SHIFT_INTO_NON_ZERO)) == 0);
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src_dst));
+	FUNCTION_CHECK_SRC(src1, src1w);
+	FUNCTION_CHECK_SRC(src2, src2w);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		fprintf(compiler->verbose, "  %s%s.into%s ", op2_names[GET_OPCODE(op) - SLJIT_OP2_BASE], !(op & SLJIT_32) ? "" : "32",
+			(op & SLJIT_SHIFT_INTO_NON_ZERO) ? ".nz" : "");
+
+		sljit_verbose_reg(compiler, src_dst);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, src1, src1w);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, src2, src2w);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1510,7 +1609,7 @@
 	sljit_s32 src2, sljit_sw src2w)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->last_flags = GET_FLAG_TYPE(op) | (op & (SLJIT_32 | SLJIT_SET_Z));
+	compiler->last_flags = GET_FLAG_TYPE(op) | (op & SLJIT_32);
 #endif
 
 	if (SLJIT_UNLIKELY(compiler->skip_checks)) {
@@ -1523,7 +1622,7 @@
 	CHECK_ARGUMENT(GET_OPCODE(op) == SLJIT_CMP_F64);
 	CHECK_ARGUMENT(!(op & SLJIT_SET_Z));
 	CHECK_ARGUMENT((op & VARIABLE_FLAG_MASK)
-		|| (GET_FLAG_TYPE(op) >= SLJIT_EQUAL_F64 && GET_FLAG_TYPE(op) <= SLJIT_ORDERED_F64));
+		|| (GET_FLAG_TYPE(op) >= SLJIT_F_EQUAL && GET_FLAG_TYPE(op) <= SLJIT_ORDERED_LESS_EQUAL));
 	FUNCTION_FCHECK(src1, src1w);
 	FUNCTION_FCHECK(src2, src2w);
 #endif
@@ -1531,7 +1630,7 @@
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
 		fprintf(compiler->verbose, "  %s%s", fop1_names[SLJIT_CMP_F64 - SLJIT_FOP1_BASE], (op & SLJIT_32) ? ".f32" : ".f64");
 		if (op & VARIABLE_FLAG_MASK) {
-			fprintf(compiler->verbose, ".%s_f", jump_names[GET_FLAG_TYPE(op)]);
+			fprintf(compiler->verbose, ".%s", jump_names[GET_FLAG_TYPE(op)]);
 		}
 		fprintf(compiler->verbose, " ");
 		sljit_verbose_fparam(compiler, src1, src1w);
@@ -1650,6 +1749,17 @@
 	CHECK_RETURN_OK;
 }
 
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
+	|| (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
+#define CHECK_UNORDERED(type, last_flags) \
+	((((type) & 0xff) == SLJIT_UNORDERED || ((type) & 0xff) == SLJIT_ORDERED) && \
+		((last_flags) & 0xff) >= SLJIT_UNORDERED && ((last_flags) & 0xff) <= SLJIT_ORDERED_LESS_EQUAL)
+#else
+#define CHECK_UNORDERED(type, last_flags) 0
+#endif
+#endif /* SLJIT_ARGUMENT_CHECKS */
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
 {
 	if (SLJIT_UNLIKELY(compiler->skip_checks)) {
@@ -1658,9 +1768,8 @@
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP | SLJIT_32)));
+	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP)));
 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_FAST_CALL);
-	CHECK_ARGUMENT((type & 0xff) < SLJIT_JUMP || !(type & SLJIT_32));
 
 	if ((type & 0xff) < SLJIT_JUMP) {
 		if ((type & 0xff) <= SLJIT_NOT_ZERO)
@@ -1670,13 +1779,14 @@
 			compiler->last_flags = 0;
 		} else
 			CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-				|| ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW));
+				|| ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+				|| CHECK_UNORDERED(type, compiler->last_flags));
 	}
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose))
-		fprintf(compiler->verbose, "  jump%s %s%s\n", !(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r",
-			jump_names[type & 0xff], JUMP_POSTFIX(type));
+		fprintf(compiler->verbose, "  jump%s %s\n", !(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r",
+			jump_names[type & 0xff]);
 #endif
 	CHECK_RETURN_OK;
 }
@@ -1686,11 +1796,17 @@
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP | SLJIT_CALL_RETURN)));
-	CHECK_ARGUMENT((type & 0xff) == SLJIT_CALL || (type & 0xff) == SLJIT_CALL_CDECL);
+	CHECK_ARGUMENT((type & 0xff) >= SLJIT_CALL && (type & 0xff) <= SLJIT_CALL_REG_ARG);
 	CHECK_ARGUMENT(function_check_arguments(arg_types, compiler->scratches, -1, compiler->fscratches));
 
 	if (type & SLJIT_CALL_RETURN) {
 		CHECK_ARGUMENT((arg_types & SLJIT_ARG_MASK) == compiler->last_return);
+
+		if (compiler->options & SLJIT_ENTER_REG_ARG) {
+			CHECK_ARGUMENT((type & 0xff) == SLJIT_CALL_REG_ARG);
+		} else {
+			CHECK_ARGUMENT((type & 0xff) != SLJIT_CALL_REG_ARG);
+		}
 	}
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1729,8 +1845,8 @@
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  cmp%s %s%s, ", !(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r",
-			jump_names[type & 0xff], (type & SLJIT_32) ? "32" : "");
+		fprintf(compiler->verbose, "  cmp%s%s %s, ", (type & SLJIT_32) ? "32" : "",
+			!(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r", jump_names[type & 0xff]);
 		sljit_verbose_param(compiler, src1, src1w);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_param(compiler, src2, src2w);
@@ -1747,15 +1863,16 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP | SLJIT_32)));
-	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL_F64 && (type & 0xff) <= SLJIT_ORDERED_F64);
+	CHECK_ARGUMENT((type & 0xff) >= SLJIT_F_EQUAL && (type & 0xff) <= SLJIT_ORDERED_LESS_EQUAL
+				&& ((type & 0xff) <= SLJIT_ORDERED || sljit_cmp_info(type & 0xff)));
 	FUNCTION_FCHECK(src1, src1w);
 	FUNCTION_FCHECK(src2, src2w);
 	compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  fcmp%s %s%s, ", !(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r",
-			jump_names[type & 0xff], (type & SLJIT_32) ? ".f32" : ".f64");
+		fprintf(compiler->verbose, "  fcmp%s%s %s, ", (type & SLJIT_32) ? ".f32" : ".f64",
+			!(type & SLJIT_REWRITABLE_JUMP) ? "" : ".r", jump_names[type & 0xff]);
 		sljit_verbose_fparam(compiler, src1, src1w);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_fparam(compiler, src2, src2w);
@@ -1793,12 +1910,18 @@
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_CALL_RETURN)));
-	CHECK_ARGUMENT((type & 0xff) == SLJIT_CALL || (type & 0xff) == SLJIT_CALL_CDECL);
+	CHECK_ARGUMENT((type & 0xff) >= SLJIT_CALL && (type & 0xff) <= SLJIT_CALL_REG_ARG);
 	CHECK_ARGUMENT(function_check_arguments(arg_types, compiler->scratches, -1, compiler->fscratches));
 	FUNCTION_CHECK_SRC(src, srcw);
 
 	if (type & SLJIT_CALL_RETURN) {
 		CHECK_ARGUMENT((arg_types & SLJIT_ARG_MASK) == compiler->last_return);
+
+		if (compiler->options & SLJIT_ENTER_REG_ARG) {
+			CHECK_ARGUMENT((type & 0xff) == SLJIT_CALL_REG_ARG);
+		} else {
+			CHECK_ARGUMENT((type & 0xff) != SLJIT_CALL_REG_ARG);
+		}
 	}
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1830,18 +1953,18 @@
 	sljit_s32 type)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_32)));
-	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
+	CHECK_ARGUMENT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
 	CHECK_ARGUMENT(op == SLJIT_MOV || op == SLJIT_MOV32
 		|| (GET_OPCODE(op) >= SLJIT_AND && GET_OPCODE(op) <= SLJIT_XOR));
 	CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK));
 
-	if ((type & 0xff) <= SLJIT_NOT_ZERO)
+	if (type <= SLJIT_NOT_ZERO)
 		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
 	else
-		CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-			|| ((type & 0xff) == SLJIT_NOT_CARRY && (compiler->last_flags & 0xff) == SLJIT_CARRY)
-			|| ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW));
+		CHECK_ARGUMENT(type == (compiler->last_flags & 0xff)
+			|| (type == SLJIT_NOT_CARRY && (compiler->last_flags & 0xff) == SLJIT_CARRY)
+			|| (type == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+			|| CHECK_UNORDERED(type, compiler->last_flags));
 
 	FUNCTION_CHECK_DST(dst, dstw);
 
@@ -1850,12 +1973,12 @@
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  flags%s %s%s, ",
-			!(op & SLJIT_SET_Z) ? "" : ".z",
+		fprintf(compiler->verbose, "  flags.%s%s%s ",
 			GET_OPCODE(op) < SLJIT_OP2_BASE ? "mov" : op2_names[GET_OPCODE(op) - SLJIT_OP2_BASE],
-			GET_OPCODE(op) < SLJIT_OP2_BASE ? op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE] : ((op & SLJIT_32) ? "32" : ""));
+			GET_OPCODE(op) < SLJIT_OP2_BASE ? op1_names[GET_OPCODE(op) - SLJIT_OP1_BASE] : ((op & SLJIT_32) ? "32" : ""),
+			!(op & SLJIT_SET_Z) ? "" : ".z");
 		sljit_verbose_param(compiler, dst, dstw);
-		fprintf(compiler->verbose, ", %s%s\n", jump_names[type & 0xff], JUMP_POSTFIX(type));
+		fprintf(compiler->verbose, ", %s\n", jump_names[type]);
 	}
 #endif
 	CHECK_RETURN_OK;
@@ -1866,28 +1989,31 @@
 	sljit_s32 src, sljit_sw srcw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_32)));
-	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
+	sljit_s32 cond = type & ~SLJIT_32;
+
+	CHECK_ARGUMENT(cond >= SLJIT_EQUAL && cond <= SLJIT_ORDERED_LESS_EQUAL);
 
 	CHECK_ARGUMENT(compiler->scratches != -1 && compiler->saveds != -1);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_32));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg));
 	if (src != SLJIT_IMM) {
 		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(src));
 		CHECK_ARGUMENT(srcw == 0);
 	}
 
-	if ((type & 0xff) <= SLJIT_NOT_ZERO)
+	if (cond <= SLJIT_NOT_ZERO)
 		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
 	else
-		CHECK_ARGUMENT((type & 0xff) == (compiler->last_flags & 0xff)
-			|| ((type & 0xff) == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW));
+		CHECK_ARGUMENT(cond == (compiler->last_flags & 0xff)
+			|| (cond == SLJIT_NOT_CARRY && (compiler->last_flags & 0xff) == SLJIT_CARRY)
+			|| (cond == SLJIT_NOT_OVERFLOW && (compiler->last_flags & 0xff) == SLJIT_OVERFLOW)
+			|| CHECK_UNORDERED(cond, compiler->last_flags));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
-		fprintf(compiler->verbose, "  cmov%s %s%s, ",
-			!(dst_reg & SLJIT_32) ? "" : "32",
-			jump_names[type & 0xff], JUMP_POSTFIX(type));
-		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_32);
+		fprintf(compiler->verbose, "  cmov%s %s, ",
+			!(type & SLJIT_32) ? "" : "32",
+			jump_names[type & ~SLJIT_32]);
+		sljit_verbose_reg(compiler, dst_reg);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_param(compiler, src, srcw);
 		fprintf(compiler->verbose, "\n");
@@ -1900,28 +2026,123 @@
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
+	if (SLJIT_UNLIKELY(compiler->skip_checks)) {
+		compiler->skip_checks = 0;
+		CHECK_RETURN_OK;
+	}
+
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT((type & 0xff) >= SLJIT_MOV && (type & 0xff) <= SLJIT_MOV_P);
-	CHECK_ARGUMENT(!(type & SLJIT_32) || ((type & 0xff) != SLJIT_MOV && (type & 0xff) != SLJIT_MOV_U32 && (type & 0xff) != SLJIT_MOV_P));
-	CHECK_ARGUMENT((type & SLJIT_MEM_PRE) || (type & SLJIT_MEM_POST));
-	CHECK_ARGUMENT((type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) != (SLJIT_MEM_PRE | SLJIT_MEM_POST));
-	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_PRE | SLJIT_MEM_POST)) == 0);
+	sljit_s32 allowed_flags;
+
+	if (type & SLJIT_MEM_UNALIGNED) {
+		CHECK_ARGUMENT(!(type & (SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)));
+	} else if (type & SLJIT_MEM_UNALIGNED_16) {
+		CHECK_ARGUMENT(!(type & SLJIT_MEM_UNALIGNED_32));
+	} else {
+		CHECK_ARGUMENT((reg & REG_PAIR_MASK) || (type & SLJIT_MEM_UNALIGNED_32));
+	}
+
+	allowed_flags = SLJIT_MEM_UNALIGNED;
+
+	switch (type & 0xff) {
+	case SLJIT_MOV_U32:
+	case SLJIT_MOV_S32:
+	case SLJIT_MOV32:
+		allowed_flags = SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16;
+		break;
+	case SLJIT_MOV:
+	case SLJIT_MOV_P:
+		allowed_flags = SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32;
+		break;
+	}
+
+	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | allowed_flags)) == 0);
+
+	if (reg & REG_PAIR_MASK) {
+		CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV);
+		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(REG_PAIR_FIRST(reg)));
+		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(REG_PAIR_SECOND(reg)));
+		CHECK_ARGUMENT(REG_PAIR_FIRST(reg) != REG_PAIR_SECOND(reg));
+	} else {
+		CHECK_ARGUMENT((type & 0xff) >= SLJIT_MOV && (type & 0xff) <= SLJIT_MOV_P);
+		CHECK_ARGUMENT(!(type & SLJIT_32) || ((type & 0xff) >= SLJIT_MOV_U8 && (type & 0xff) <= SLJIT_MOV_S16));
+		CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg));
+	}
 
 	FUNCTION_CHECK_SRC_MEM(mem, memw);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg));
-
-	CHECK_ARGUMENT((mem & REG_MASK) != 0 && (mem & REG_MASK) != reg);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-	if (!(type & SLJIT_MEM_SUPP) && SLJIT_UNLIKELY(!!compiler->verbose)) {
-		if (sljit_emit_mem(compiler, type | SLJIT_MEM_SUPP, reg, mem, memw) == SLJIT_ERR_UNSUPPORTED)
-			fprintf(compiler->verbose, "  //");
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		if ((type & 0xff) == SLJIT_MOV32)
+			fprintf(compiler->verbose, "  %s32",
+				(type & SLJIT_MEM_STORE) ? "store" : "load");
+		else
+			fprintf(compiler->verbose, "  %s%s%s",
+				(type & SLJIT_MEM_STORE) ? "store" : "load",
+				!(type & SLJIT_32) ? "" : "32",
+				op1_names[(type & 0xff) - SLJIT_OP1_BASE]);
 
-		fprintf(compiler->verbose, "  mem%s.%s%s%s ",
-			!(type & SLJIT_32) ? "" : "32",
-			(type & SLJIT_MEM_STORE) ? "st" : "ld",
-			op1_names[(type & 0xff) - SLJIT_OP1_BASE],
-			(type & SLJIT_MEM_PRE) ? ".pre" : ".post");
+		if (type & SLJIT_MEM_UNALIGNED)
+			printf(".un");
+		else if (type & SLJIT_MEM_UNALIGNED_16)
+			printf(".un16");
+		else if (type & SLJIT_MEM_UNALIGNED_32)
+			printf(".un32");
+
+		if (reg & REG_PAIR_MASK) {
+			fprintf(compiler->verbose, " {");
+			sljit_verbose_reg(compiler, REG_PAIR_FIRST(reg));
+			fprintf(compiler->verbose, ", ");
+			sljit_verbose_reg(compiler, REG_PAIR_SECOND(reg));
+			fprintf(compiler->verbose, "}, ");
+		} else {
+			fprintf(compiler->verbose, " ");
+			sljit_verbose_reg(compiler, reg);
+			fprintf(compiler->verbose, ", ");
+		}
+		sljit_verbose_param(compiler, mem, memw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	if (SLJIT_UNLIKELY(compiler->skip_checks)) {
+		compiler->skip_checks = 0;
+		CHECK_RETURN_OK;
+	}
+
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT((type & 0xff) >= SLJIT_MOV && (type & 0xff) <= SLJIT_MOV_P);
+	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_POST)) == 0);
+	CHECK_ARGUMENT((mem & REG_MASK) != 0 && (mem & REG_MASK) != reg);
+
+	FUNCTION_CHECK_SRC_MEM(mem, memw);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		if (type & SLJIT_MEM_SUPP)
+			CHECK_RETURN_OK;
+		if (sljit_emit_mem_update(compiler, type | SLJIT_MEM_SUPP, reg, mem, memw) == SLJIT_ERR_UNSUPPORTED) {
+			fprintf(compiler->verbose, "    # mem: unsupported form, no instructions are emitted\n");
+			CHECK_RETURN_OK;
+		}
+
+		if ((type & 0xff) == SLJIT_MOV32)
+			fprintf(compiler->verbose, "  %s32.%s ",
+				(type & SLJIT_MEM_STORE) ? "store" : "load",
+				(type & SLJIT_MEM_POST) ? "post" : "pre");
+		else
+			fprintf(compiler->verbose, "  %s%s%s.%s ",
+				(type & SLJIT_MEM_STORE) ? "store" : "load",
+				!(type & SLJIT_32) ? "" : "32",
+				op1_names[(type & 0xff) - SLJIT_OP1_BASE],
+				(type & SLJIT_MEM_POST) ? "post" : "pre");
+
 		sljit_verbose_reg(compiler, reg);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_param(compiler, mem, memw);
@@ -1937,22 +2158,34 @@
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
-	CHECK_ARGUMENT((type & SLJIT_MEM_PRE) || (type & SLJIT_MEM_POST));
-	CHECK_ARGUMENT((type & (SLJIT_MEM_PRE | SLJIT_MEM_POST)) != (SLJIT_MEM_PRE | SLJIT_MEM_POST));
-	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_PRE | SLJIT_MEM_POST)) == 0);
 
-	FUNCTION_CHECK_SRC_MEM(mem, memw);
+	if (type & SLJIT_MEM_UNALIGNED) {
+		CHECK_ARGUMENT(!(type & (SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)));
+	} else if (type & SLJIT_MEM_UNALIGNED_16) {
+		CHECK_ARGUMENT(!(type & SLJIT_MEM_UNALIGNED_32));
+	} else {
+		CHECK_ARGUMENT(type & SLJIT_MEM_UNALIGNED_32);
+		CHECK_ARGUMENT(!(type & SLJIT_32));
+	}
+
+	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)));
 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	FUNCTION_CHECK_SRC_MEM(mem, memw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-	if (!(type & SLJIT_MEM_SUPP) && SLJIT_UNLIKELY(!!compiler->verbose)) {
-		if (sljit_emit_fmem(compiler, type | SLJIT_MEM_SUPP, freg, mem, memw) == SLJIT_ERR_UNSUPPORTED)
-			fprintf(compiler->verbose, "  //");
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		fprintf(compiler->verbose, "  %s.%s",
+			(type & SLJIT_MEM_STORE) ? "store" : "load",
+			!(type & SLJIT_32) ? "f64" : "f32");
 
-		fprintf(compiler->verbose, "  fmem.%s%s%s ",
-			(type & SLJIT_MEM_STORE) ? "st" : "ld",
-			!(type & SLJIT_32) ? ".f64" : ".f32",
-			(type & SLJIT_MEM_PRE) ? ".pre" : ".post");
+		if (type & SLJIT_MEM_UNALIGNED)
+			printf(".un");
+		else if (type & SLJIT_MEM_UNALIGNED_16)
+			printf(".un16");
+		else if (type & SLJIT_MEM_UNALIGNED_32)
+			printf(".un32");
+
+		fprintf(compiler->verbose, " ");
 		sljit_verbose_freg(compiler, freg);
 		fprintf(compiler->verbose, ", ");
 		sljit_verbose_param(compiler, mem, memw);
@@ -1962,6 +2195,40 @@
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
+	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_POST)) == 0);
+	FUNCTION_CHECK_SRC_MEM(mem, memw);
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		if (type & SLJIT_MEM_SUPP)
+			CHECK_RETURN_OK;
+		if (sljit_emit_fmem_update(compiler, type | SLJIT_MEM_SUPP, freg, mem, memw) == SLJIT_ERR_UNSUPPORTED) {
+			fprintf(compiler->verbose, "    # fmem: unsupported form, no instructions are emitted\n");
+			CHECK_RETURN_OK;
+		}
+
+		fprintf(compiler->verbose, "  %s.%s.%s ",
+			(type & SLJIT_MEM_STORE) ? "store" : "load",
+			!(type & SLJIT_32) ? "f64" : "f32",
+			(type & SLJIT_MEM_POST) ? "post" : "pre");
+
+		sljit_verbose_freg(compiler, freg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, mem, memw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
 {
 	/* Any offset is allowed. */
@@ -2012,6 +2279,10 @@
 	CHECK_RETURN_OK;
 }
 
+#else /* !SLJIT_ARGUMENT_CHECKS && !SLJIT_VERBOSE */
+
+#define SLJIT_SKIP_CHECKS(compiler)
+
 #endif /* SLJIT_ARGUMENT_CHECKS || SLJIT_VERBOSE */
 
 #define SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw) \
@@ -2039,46 +2310,11 @@
 	ADJUST_LOCAL_OFFSET(dst, dstw); \
 	ADJUST_LOCAL_OFFSET(src, srcw);
 
-static SLJIT_INLINE sljit_s32 emit_mov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
-{
-#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
-	/* At the moment the pointer size is always equal to sljit_sw. May be changed in the future. */
-	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_P))
-		return SLJIT_SUCCESS;
-#else
-	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P))
-		return SLJIT_SUCCESS;
-#endif
-
-#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) \
-		|| (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-	compiler->skip_checks = 1;
-#endif
-	return sljit_emit_op1(compiler, op, SLJIT_RETURN_REG, 0, src, srcw);
-}
-
-#if !(defined SLJIT_CONFIG_SPARC && SLJIT_CONFIG_SPARC)
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
-
-	FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
-
-#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) \
-		|| (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-	compiler->skip_checks = 1;
-#endif
-	return sljit_emit_return_void(compiler);
-}
-
-#endif
-
 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
 		|| (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) \
-		|| (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32) \
-		|| ((defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) && !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1 && SLJIT_MIPS_REV < 6))
+		|| ((defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) && !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1 && SLJIT_MIPS_REV < 6)) \
+		|| (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \
+		|| (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 
 static SLJIT_INLINE sljit_s32 sljit_emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 dst_reg,
@@ -2086,33 +2322,57 @@
 {
 	struct sljit_label *label;
 	struct sljit_jump *jump;
-	sljit_s32 op = (dst_reg & SLJIT_32) ? SLJIT_MOV32 : SLJIT_MOV;
+	sljit_s32 op = (type & SLJIT_32) ? SLJIT_MOV32 : SLJIT_MOV;
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-	jump = sljit_emit_jump(compiler, type ^ 0x1);
+	SLJIT_SKIP_CHECKS(compiler);
+	jump = sljit_emit_jump(compiler, (type & ~SLJIT_32) ^ 0x1);
 	FAIL_IF(!jump);
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-	FAIL_IF(sljit_emit_op1(compiler, op, dst_reg & ~SLJIT_32, 0, src, srcw));
+	SLJIT_SKIP_CHECKS(compiler);
+	FAIL_IF(sljit_emit_op1(compiler, op, dst_reg, 0, src, srcw));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	label = sljit_emit_label(compiler);
 	FAIL_IF(!label);
+
 	sljit_set_label(jump, label);
 	return SLJIT_SUCCESS;
 }
 
 #endif
 
+#if (!(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) || (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)) \
+	&& !(defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+
+static sljit_s32 sljit_emit_mem_unaligned(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	SLJIT_SKIP_CHECKS(compiler);
+
+	if (type & SLJIT_MEM_STORE)
+		return sljit_emit_op1(compiler, type & (0xff | SLJIT_32), mem, memw, reg, 0);
+	return sljit_emit_op1(compiler, type & (0xff | SLJIT_32), reg, 0, mem, memw);
+}
+
+#endif /* (!SLJIT_CONFIG_MIPS || SLJIT_MIPS_REV >= 6) && !SLJIT_CONFIG_ARM_V5 */
+
+#if (!(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) || (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)) \
+	&& !(defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32)
+
+static sljit_s32 sljit_emit_fmem_unaligned(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	SLJIT_SKIP_CHECKS(compiler);
+
+	if (type & SLJIT_MEM_STORE)
+		return sljit_emit_fop1(compiler, type & (0xff | SLJIT_32), mem, memw, freg, 0);
+	return sljit_emit_fop1(compiler, type & (0xff | SLJIT_32), freg, 0, mem, memw);
+}
+
+#endif /* (!SLJIT_CONFIG_MIPS || SLJIT_MIPS_REV >= 6) && !SLJIT_CONFIG_ARM */
+
 /* CPU description section */
 
 #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE)
@@ -2153,13 +2413,58 @@
 #	include "sljitNativePPC_common.c"
 #elif (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
 #	include "sljitNativeMIPS_common.c"
-#elif (defined SLJIT_CONFIG_SPARC && SLJIT_CONFIG_SPARC)
-#	include "sljitNativeSPARC_common.c"
+#elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
+#	include "sljitNativeRISCV_common.c"
 #elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
 #	include "sljitNativeS390X.c"
 #endif
 
-#if !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
+static SLJIT_INLINE sljit_s32 emit_mov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+#if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
+	/* At the moment the pointer size is always equal to sljit_sw. May be changed in the future. */
+	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_P))
+		return SLJIT_SUCCESS;
+#else
+	if (src == SLJIT_RETURN_REG && (op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P))
+		return SLJIT_SUCCESS;
+#endif
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_op1(compiler, op, SLJIT_RETURN_REG, 0, src, srcw);
+}
+
+#if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) \
+	&& !((defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) && defined __SOFTFP__)
+
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	if (src == SLJIT_FR0)
+		return SLJIT_SUCCESS;
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+}
+
+#endif /* !SLJIT_CONFIG_X86_32 && !(SLJIT_CONFIG_ARM_32 && __SOFTFP__) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
+
+	if (GET_OPCODE(op) < SLJIT_MOV_F64) {
+		FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
+	} else {
+		FAIL_IF(emit_fmov_before_return(compiler, op, src, srcw));
+	}
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_return_void(compiler);
+}
+
+#if !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
+	&& !(defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 src1, sljit_sw src1w,
@@ -2229,20 +2534,33 @@
 	else
 		flags = condition << VARIABLE_FLAG_SHIFT;
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	PTR_FAIL_IF(sljit_emit_op2u(compiler,
 		SLJIT_SUB | flags | (type & SLJIT_32), src1, src1w, src2, src2w));
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, condition | (type & (SLJIT_REWRITABLE_JUMP | SLJIT_32)));
 }
 
-#endif
+#endif /* !SLJIT_CONFIG_MIPS */
+
+#if (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	if (type < SLJIT_UNORDERED || type > SLJIT_ORDERED_LESS_EQUAL)
+		return 0;
+
+	switch (type) {
+	case SLJIT_UNORDERED_OR_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+		return 0;
+	}
+
+	return 1;
+}
+
+#endif /* SLJIT_CONFIG_ARM */
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_fcmp(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 src1, sljit_sw src1w,
@@ -2251,61 +2569,65 @@
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_fcmp(compiler, type, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	sljit_emit_fop1(compiler, SLJIT_CMP_F64 | ((type & 0xff) << VARIABLE_FLAG_SHIFT) | (type & SLJIT_32), src1, src1w, src2, src2w);
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
 }
 
-#if !(defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
-	&& !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+#if !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \
 	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
-	SLJIT_UNUSED_ARG(compiler);
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
 	SLJIT_UNUSED_ARG(type);
 	SLJIT_UNUSED_ARG(reg);
 	SLJIT_UNUSED_ARG(mem);
 	SLJIT_UNUSED_ARG(memw);
 
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
-
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
-#endif
+#endif /* !SLJIT_CONFIG_ARM && !SLJIT_CONFIG_PPC */
 
-#if !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
-	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+#if !(defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+	&& !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
 {
-	SLJIT_UNUSED_ARG(compiler);
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+	return sljit_emit_fmem_unaligned(compiler, type, freg, mem, memw);
+}
+
+#endif /* !SLJIT_CONFIG_ARM_32 && !SLJIT_CONFIG_MIPS */
+
+#if !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) \
+	&& !(defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fmem_update(compiler, type, freg, mem, memw));
 	SLJIT_UNUSED_ARG(type);
 	SLJIT_UNUSED_ARG(freg);
 	SLJIT_UNUSED_ARG(mem);
 	SLJIT_UNUSED_ARG(memw);
 
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
-
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
-#endif
+#endif /* !SLJIT_CONFIG_ARM_64 && !SLJIT_CONFIG_PPC */
 
 #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
 	&& !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
@@ -2316,10 +2638,9 @@
 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
 
 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+
+	SLJIT_SKIP_CHECKS(compiler);
+
 	if (offset != 0)
 		return sljit_emit_op2(compiler, SLJIT_ADD, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
 	return sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_SP, 0);
@@ -2387,6 +2708,13 @@
 	return 0;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNREACHABLE();
+	return 0;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code, void *exec_allocator_data)
 {
 	SLJIT_UNUSED_ARG(code);
@@ -2426,6 +2754,13 @@
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
 {
 	SLJIT_UNUSED_ARG(compiler);
@@ -2436,9 +2771,11 @@
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler, sljit_s32 src, sljit_sw srcw)
 {
 	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(src);
+	SLJIT_UNUSED_ARG(srcw);
 	SLJIT_UNREACHABLE();
 	return SLJIT_ERR_UNSUPPORTED;
 }
@@ -2505,6 +2842,22 @@
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(op);
+	SLJIT_UNUSED_ARG(src_dst);
+	SLJIT_UNUSED_ARG(src1);
+	SLJIT_UNUSED_ARG(src1w);
+	SLJIT_UNUSED_ARG(src2);
+	SLJIT_UNUSED_ARG(src2w);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2703,6 +3056,17 @@
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 reg, sljit_s32 mem, sljit_sw memw)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(reg);
+	SLJIT_UNUSED_ARG(mem);
+	SLJIT_UNUSED_ARG(memw);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 mem, sljit_sw memw)
 {
 	SLJIT_UNUSED_ARG(compiler);
@@ -2714,6 +3078,17 @@
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 mem, sljit_sw memw)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(mem);
+	SLJIT_UNUSED_ARG(memw);
+	SLJIT_UNREACHABLE();
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
 {
 	SLJIT_UNUSED_ARG(compiler);
diff --git a/src/sljit/sljitLir.h b/src/sljit/sljitLir.h
index 1162658..c6a0832 100644
--- a/src/sljit/sljitLir.h
+++ b/src/sljit/sljitLir.h
@@ -36,26 +36,24 @@
     Advantages:
       - The execution can be continued from any LIR instruction. In other
         words, it is possible to jump to any label from anywhere, even from
-        a code fragment, which is compiled later, if both compiled code
-        shares the same context. See sljit_emit_enter for more details
-      - Supports self modifying code: target of (conditional) jump and call
+        a code fragment, which is compiled later, as long as the compiling
+        context is the same. See sljit_emit_enter for more details.
+      - Supports self modifying code: target of any jump and call
         instructions and some constant values can be dynamically modified
-        during runtime
+        during runtime. See SLJIT_REWRITABLE_JUMP.
         - although it is not suggested to do it frequently
         - can be used for inline caching: save an important value once
           in the instruction stream
-        - since this feature limits the optimization possibilities, a
-          special flag must be passed at compile time when these
-          instructions are emitted
       - A fixed stack space can be allocated for local variables
       - The compiler is thread-safe
       - The compiler is highly configurable through preprocessor macros.
         You can disable unneeded features (multithreading in single
         threaded applications), and you can use your own system functions
-        (including memory allocators). See sljitConfig.h
+        (including memory allocators). See sljitConfig.h.
     Disadvantages:
-      - No automatic register allocation, and temporary results are
-        not stored on the stack. (hence the name comes)
+      - The compiler is more like a platform independent assembler, so
+        there is no built-in variable management. Registers and stack must
+        be managed manually (the name of the compiler refers to this).
     In practice:
       - This approach is very effective for interpreters
         - One of the saved registers typically points to a stack interface
@@ -77,7 +75,7 @@
 #include "sljitConfig.h"
 
 /* The following header file defines useful macros for fine tuning
-sljit based code generators. They are listed in the beginning
+SLJIT based code generators. They are listed in the beginning
 of sljitConfigInternal.h */
 
 #include "sljitConfigInternal.h"
@@ -90,6 +88,10 @@
 extern "C" {
 #endif
 
+/* Version numbers. */
+#define SLJIT_MAJOR_VERSION	0
+#define SLJIT_MINOR_VERSION	95
+
 /* --------------------------------------------------------------------- */
 /*  Error codes                                                          */
 /* --------------------------------------------------------------------- */
@@ -97,33 +99,31 @@
 /* Indicates no error. */
 #define SLJIT_SUCCESS			0
 /* After the call of sljit_generate_code(), the error code of the compiler
-   is set to this value to avoid future sljit calls (in debug mode at least).
+   is set to this value to avoid further code generation.
    The complier should be freed after sljit_generate_code(). */
 #define SLJIT_ERR_COMPILED		1
-/* Cannot allocate non executable memory. */
+/* Cannot allocate non-executable memory. */
 #define SLJIT_ERR_ALLOC_FAILED		2
 /* Cannot allocate executable memory.
-   Only for sljit_generate_code() */
+   Only sljit_generate_code() returns with this error code. */
 #define SLJIT_ERR_EX_ALLOC_FAILED	3
 /* Return value for SLJIT_CONFIG_UNSUPPORTED placeholder architecture. */
 #define SLJIT_ERR_UNSUPPORTED		4
 /* An ivalid argument is passed to any SLJIT function. */
 #define SLJIT_ERR_BAD_ARGUMENT		5
-/* Dynamic code modification is not enabled. */
-#define SLJIT_ERR_DYN_CODE_MOD		6
 
 /* --------------------------------------------------------------------- */
 /*  Registers                                                            */
 /* --------------------------------------------------------------------- */
 
 /*
-  Scratch (R) registers: registers whose may not preserve their values
+  Scratch (R) registers: registers which may not preserve their values
   across function calls.
 
-  Saved (S) registers: registers whose preserve their values across
+  Saved (S) registers: registers which preserve their values across
   function calls.
 
-  The scratch and saved register sets are overlap. The last scratch register
+  The scratch and saved register sets overlap. The last scratch register
   is the first saved register, the one before the last is the second saved
   register, and so on.
 
@@ -209,7 +209,7 @@
 /* The SLJIT_SP provides direct access to the linear stack space allocated by
    sljit_emit_enter. It can only be used in the following form: SLJIT_MEM1(SLJIT_SP).
    The immediate offset is extended by the relative stack offset automatically.
-   The sljit_get_local_base can be used to obtain the absolute offset. */
+   The sljit_get_local_base can be used to obtain the real address of a value. */
 #define SLJIT_SP	(SLJIT_NUMBER_OF_REGISTERS + 1)
 
 /* Return with machine word. */
@@ -249,6 +249,10 @@
 /* Float registers >= SLJIT_FIRST_SAVED_FLOAT_REG are saved registers. */
 #define SLJIT_FIRST_SAVED_FLOAT_REG (SLJIT_FS0 - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS + 1)
 
+/* Return with floating point arg. */
+
+#define SLJIT_RETURN_FREG	SLJIT_FR0
+
 /* --------------------------------------------------------------------- */
 /*  Argument type definitions                                            */
 /* --------------------------------------------------------------------- */
@@ -386,6 +390,7 @@
 struct sljit_jump {
 	struct sljit_jump *next;
 	sljit_uw addr;
+	/* Architecture dependent flags. */
 	sljit_uw flags;
 	union {
 		sljit_uw target;
@@ -423,17 +428,17 @@
 	struct sljit_memory_fragment *buf;
 	struct sljit_memory_fragment *abuf;
 
-	/* Used scratch registers. */
+	/* Available scratch registers. */
 	sljit_s32 scratches;
-	/* Used saved registers. */
+	/* Available saved registers. */
 	sljit_s32 saveds;
-	/* Used float scratch registers. */
+	/* Available float scratch registers. */
 	sljit_s32 fscratches;
-	/* Used float saved registers. */
+	/* Available float saved registers. */
 	sljit_s32 fsaveds;
 	/* Local stack size. */
 	sljit_s32 local_size;
-	/* Code size. */
+	/* Maximum code size. */
 	sljit_uw size;
 	/* Relative offset of the executable mapping from the writable mapping. */
 	sljit_sw executable_offset;
@@ -446,8 +451,6 @@
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	sljit_s32 args_size;
-	sljit_s32 locals_offset;
-	sljit_s32 scratches_offset;
 #endif
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -488,8 +491,7 @@
 	sljit_uw args_size;
 #endif
 
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-	sljit_s32 delay_slot;
+#if (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
 	sljit_s32 cache_arg;
 	sljit_sw cache_argw;
 #endif
@@ -517,7 +519,8 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) \
 		|| (defined SLJIT_DEBUG && SLJIT_DEBUG) \
 		|| (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
-	/* Trust arguments when the API function is called. */
+	/* Trust arguments when an API function is called.
+	   Used internally for calling API functions. */
 	sljit_s32 skip_checks;
 #endif
 };
@@ -526,7 +529,7 @@
 /*  Main functions                                                       */
 /* --------------------------------------------------------------------- */
 
-/* Creates an sljit compiler. The allocator_data is required by some
+/* Creates an SLJIT compiler. The allocator_data is required by some
    custom memory managers. This pointer is passed to SLJIT_MALLOC
    and SLJIT_FREE macros. Most allocators (including the default
    one) ignores this value, and it is recommended to pass NULL
@@ -540,19 +543,19 @@
 /* Frees everything except the compiled machine code. */
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_compiler(struct sljit_compiler *compiler);
 
-/* Returns the current error code. If an error is occurred, future sljit
-   calls which uses the same compiler argument returns early with the same
+/* Returns the current error code. If an error occurres, future calls
+   which uses the same compiler argument returns early with the same
    error code. Thus there is no need for checking the error after every
-   call, it is enough to do it before the code is compiled. Removing
+   call, it is enough to do it after the code is compiled. Removing
    these checks increases the performance of the compiling process. */
 static SLJIT_INLINE sljit_s32 sljit_get_compiler_error(struct sljit_compiler *compiler) { return compiler->error; }
 
 /* Sets the compiler error code to SLJIT_ERR_ALLOC_FAILED except
    if an error was detected before. After the error code is set
    the compiler behaves as if the allocation failure happened
-   during an sljit function call. This can greatly simplify error
-   checking, since only the compiler status needs to be checked
-   after the compilation. */
+   during an SLJIT function call. This can greatly simplify error
+   checking, since it is enough to check the compiler status
+   after the code is compiled. */
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_compiler_memory_error(struct sljit_compiler *compiler);
 
 /*
@@ -560,8 +563,8 @@
    and <= 128 bytes on 64 bit architectures. The memory area is owned by the
    compiler, and freed by sljit_free_compiler. The returned pointer is
    sizeof(sljit_sw) aligned. Excellent for allocating small blocks during
-   the compiling, and no need to worry about freeing them. The size is
-   enough to contain at most 16 pointers. If the size is outside of the range,
+   compiling, and no need to worry about freeing them. The size is enough
+   to contain at most 16 pointers. If the size is outside of the range,
    the function will return with NULL. However, this return value does not
    indicate that there is no more memory (does not set the current error code
    of the compiler to out-of-memory status).
@@ -574,8 +577,8 @@
 #endif
 
 /*
-   Create executable code from the sljit instruction stream. This is the final step
-   of the code generation so no more instructions can be added after this call.
+   Create executable code from the instruction stream. This is the final step
+   of the code generation so no more instructions can be emitted after this call.
 */
 
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler);
@@ -606,13 +609,14 @@
 static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; }
 
 /* Returns with non-zero if the feature or limitation type passed as its
-   argument is present on the current CPU.
+   argument is present on the current CPU. The return value is one, if a
+   feature is fully supported, and it is two, if partially supported.
 
    Some features (e.g. floating point operations) require hardware (CPU)
    support while others (e.g. move with update) are emulated if not available.
-   However even if a feature is emulated, specialized code paths can be faster
-   than the emulation. Some limitations are emulated as well so their general
-   case is supported but it has extra performance costs. */
+   However, even when a feature is emulated, specialized code paths may be
+   faster than the emulation. Some limitations are emulated as well so their
+   general case is supported but it has extra performance costs. */
 
 /* [Not emulated] Floating-point support is available. */
 #define SLJIT_HAS_FPU			0
@@ -622,10 +626,14 @@
 #define SLJIT_HAS_ZERO_REGISTER		2
 /* [Emulated] Count leading zero is supported. */
 #define SLJIT_HAS_CLZ			3
+/* [Emulated] Count trailing zero is supported. */
+#define SLJIT_HAS_CTZ			4
+/* [Emulated] Rotate left/right is supported. */
+#define SLJIT_HAS_ROT			5
 /* [Emulated] Conditional move is supported. */
-#define SLJIT_HAS_CMOV			4
-/* [Emulated] Conditional move is supported. */
-#define SLJIT_HAS_PREFETCH		5
+#define SLJIT_HAS_CMOV			6
+/* [Emulated] Prefetch instruction is available (emulated as a nop). */
+#define SLJIT_HAS_PREFETCH		7
 
 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
 /* [Not emulated] SSE2 support is available on x86. */
@@ -634,8 +642,23 @@
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type);
 
-/* Instruction generation. Returns with any error code. If there is no
-   error, they return with SLJIT_SUCCESS. */
+/* If type is between SLJIT_ORDERED_EQUAL and SLJIT_ORDERED_LESS_EQUAL,
+   sljit_cmp_info returns one, if the cpu supports the passed floating
+   point comparison type.
+
+   If type is SLJIT_UNORDERED or SLJIT_ORDERED, sljit_cmp_info returns
+   one, if the cpu supports checking the unordered comparison result
+   regardless of the comparison type passed to the comparison instruction.
+   The returned value is always one, if there is at least one type between
+   SLJIT_ORDERED_EQUAL and SLJIT_ORDERED_LESS_EQUAL where sljit_cmp_info
+   returns with a zero value.
+
+   Otherwise it returns zero. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type);
+
+/* The following functions generate machine code. If there is no
+   error, they return with SLJIT_SUCCESS, otherwise they return
+   with an error code. */
 
 /*
    The executable code is a function from the viewpoint of the C
@@ -643,30 +666,29 @@
    Binary Interface) of the platform, which specify the purpose of
    machine registers and stack handling among other things. The
    sljit_emit_enter function emits the necessary instructions for
-   setting up a new context for the executable code and moves function
-   arguments to the saved registers. Furthermore the options argument
+   setting up a new context for the executable code. This is often
+   called as function prologue. Furthermore the options argument
    can be used to pass configuration options to the compiler. The
    available options are listed before sljit_emit_enter.
 
-   The function argument list is the combination of SLJIT_ARGx
-   (SLJIT_DEF_ARG1) macros. Currently maximum 4 arguments are
-   supported. The first integer argument is loaded into SLJIT_S0,
-   the second one is loaded into SLJIT_S1, and so on. Similarly,
-   the first floating point argument is loaded into SLJIT_FR0,
-   the second one is loaded into SLJIT_FR1, and so on. Furthermore
-   the register set used by the function must be declared as well.
-   The number of scratch and saved registers used by the function
-   must be passed to sljit_emit_enter. Only R registers between R0
+   The function argument list is specified by the SLJIT_ARGSx
+   (SLJIT_ARGS0 .. SLJIT_ARGS4) macros. Currently maximum four
+   arguments are supported. See the description of SLJIT_ARGSx
+   macros about argument passing. Furthermore the register set
+   used by the function must be declared as well. The number of
+   scratch and saved registers available to the function must
+   be passed to sljit_emit_enter. Only R registers between R0
    and "scratches" argument can be used later. E.g. if "scratches"
-   is set to 2, the scratch register set will be limited to SLJIT_R0
-    and SLJIT_R1. The S registers and the floating point registers
-   ("fscratches" and "fsaveds") are specified in a similar manner.
-   The sljit_emit_enter is also capable of allocating a stack space
-   for local variables. The "local_size" argument contains the size
-   in bytes of this local area and its staring address is stored
-   in SLJIT_SP. The memory area between SLJIT_SP (inclusive) and
-   SLJIT_SP + local_size (exclusive) can be modified freely until
-   the function returns. The stack space is not initialized.
+   is set to two, the scratch register set will be limited to
+   SLJIT_R0 and SLJIT_R1. The S registers and the floating point
+   registers ("fscratches" and "fsaveds") are specified in a
+   similar manner. The sljit_emit_enter is also capable of
+   allocating a stack space for local data. The "local_size"
+   argument contains the size in bytes of this local area, and
+   it can be accessed using SLJIT_MEM1(SLJIT_SP). The memory
+   area between SLJIT_SP (inclusive) and SLJIT_SP + local_size
+   (exclusive) can be modified freely until the function returns.
+   The stack space is not initialized to zero.
 
    Note: the following conditions must met:
          0 <= scratches <= SLJIT_NUMBER_OF_REGISTERS
@@ -683,9 +705,20 @@
          overwrites the previous context.
 */
 
-/* The compiled function uses cdecl calling
- * convention instead of SLJIT_FUNC. */
-#define SLJIT_ENTER_CDECL 0x00000001
+/* Saved registers between SLJIT_S0 and SLJIT_S(n - 1) (inclusive)
+   are not saved / restored on function enter / return. Instead,
+   these registers can be used to pass / return data (such as
+   global / local context pointers) across function calls. The
+   value of n must be between 1 and 3. This option is only
+   supported by SLJIT_ENTER_REG_ARG calling convention. */
+#define SLJIT_ENTER_KEEP(n)	(n)
+
+/* The compiled function uses an SLJIT specific register argument
+   calling convention. This is a lightweight function call type where
+   both the caller and the called functions must be compiled by
+   SLJIT. The type argument of the call must be SLJIT_CALL_REG_ARG
+   and all arguments must be stored in scratch registers. */
+#define SLJIT_ENTER_REG_ARG	0x00000004
 
 /* The local_size must be >= 0 and <= SLJIT_MAX_LOCAL_SIZE. */
 #define SLJIT_MAX_LOCAL_SIZE	65536
@@ -694,12 +727,15 @@
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size);
 
-/* The machine code has a context (which contains the local stack space size,
-   number of used registers, etc.) which initialized by sljit_emit_enter. Several
-   functions (such as sljit_emit_return) requres this context to be able to generate
-   the appropriate code. However, some code fragments (like inline cache) may have
-   no normal entry point so their context is unknown for the compiler. Their context
-   can be provided to the compiler by the sljit_set_context function.
+/* The SLJIT compiler has a current context (which contains the local
+   stack space size, number of used registers, etc.) which is initialized
+   by sljit_emit_enter. Several functions (such as sljit_emit_return)
+   requires this context to be able to generate the appropriate code.
+   However, some code fragments (compiled separately) may have no
+   normal entry point so their context is unknown for the compiler.
+
+   The sljit_set_context and sljit_emit_enter have the same arguments,
+   but sljit_set_context does not generate any machine code.
 
    Note: every call of sljit_emit_enter and sljit_set_context overwrites
          the previous context. */
@@ -708,16 +744,29 @@
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size);
 
-/* Return from machine code. The sljit_emit_return_void function does not return with
-   any value. The sljit_emit_return function returns with a single value which stores
-   the result of a data move instruction. The instruction is specified by the op
-   argument, and must be between SLJIT_MOV and SLJIT_MOV_P (see sljit_emit_op1). */
+/* Return to the caller function. The sljit_emit_return_void function
+   does not return with any value. The sljit_emit_return function returns
+   with a single value loaded from its source operand. The load operation
+   can be between SLJIT_MOV and SLJIT_MOV_P (see sljit_emit_op1) and
+   SLJIT_MOV_F32/SLJIT_MOV_F64 (see sljit_emit_fop1) depending on the
+   return value specified by sljit_emit_enter/sljit_set_context. */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler);
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw);
 
+/* Restores the saved registers and free the stack area, then the execution
+   continues from the address specified by the source operand. This
+   operation is similar to sljit_emit_return, but it ignores the return
+   address. The code where the exection continues should use the same context
+   as the caller function (see sljit_set_context). A word (pointer) value
+   can be passed in the SLJIT_RETURN_REG register. This function can be used
+   to jump to exception handlers. */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw);
+
 /* Generating entry and exit points for fast call functions (see SLJIT_FAST_CALL).
    Both sljit_emit_fast_enter and SLJIT_FAST_RETURN operations preserve the
    values of all registers and stack frame. The return address is stored in the
@@ -726,9 +775,9 @@
 
    Fast calls are cheap operations (usually only a single call instruction is
    emitted) but they do not preserve any registers. However the callee function
-   can freely use / update any registers and stack values which can be
+   can freely use / update any registers and the local area which can be
    efficiently exploited by various optimizations. Registers can be saved
-   manually by the callee function if needed.
+   and restored manually if needed.
 
    Although returning to different address by SLJIT_FAST_RETURN is possible,
    this address usually cannot be predicted by the return address predictor of
@@ -743,16 +792,16 @@
 /*
    Source and destination operands for arithmetical instructions
     imm              - a simple immediate value (cannot be used as a destination)
-    reg              - any of the registers (immediate argument must be 0)
-    [imm]            - absolute immediate memory address
+    reg              - any of the available registers (immediate argument must be 0)
+    [imm]            - absolute memory address
     [reg+imm]        - indirect memory address
     [reg+(reg<<imm)] - indirect indexed memory address (shift must be between 0 and 3)
-                       useful for (byte, half, int, sljit_sw) array access
-                       (fully supported by both x86 and ARM architectures, and cheap operation on others)
+                       useful for accessing arrays (fully supported by both x86 and
+                       ARM architectures, and cheap operation on others)
 */
 
 /*
-   IMPORTANT NOTE: memory access MUST be naturally aligned unless
+   IMPORTANT NOTE: memory accesses MUST be naturally aligned unless
                    SLJIT_UNALIGNED macro is defined and its value is 1.
 
      length | alignment
@@ -792,8 +841,9 @@
            Write-back is supported except for one instruction: 32 bit signed
                 load with [reg+imm] addressing mode on 64 bit.
    mips:   [reg+imm], -65536 <= imm <= 65535
-   sparc:  [reg+imm], -4096 <= imm <= 4095
-           [reg+reg] is supported
+           Write-back is not supported
+   riscv:  [reg+imm], -2048 <= imm <= 2047
+           Write-back is not supported
    s390x:  [reg+imm], -2^19 <= imm < 2^19
            [reg+reg] is supported
            Write-back is not supported
@@ -805,20 +855,22 @@
 #define SLJIT_MEM1(r1)		(SLJIT_MEM | (r1))
 #define SLJIT_MEM2(r1, r2)	(SLJIT_MEM | (r1) | ((r2) << 8))
 #define SLJIT_IMM		0x40
+#define SLJIT_REG_PAIR(r1, r2)	((r1) | ((r2) << 8))
 
 /* Sets 32 bit operation mode on 64 bit CPUs. This option is ignored on
    32 bit CPUs. When this option is set for an arithmetic operation, only
-   the lower 32 bit of the input registers are used, and the CPU status
+   the lower 32 bits of the input registers are used, and the CPU status
    flags are set according to the 32 bit result. Although the higher 32 bit
    of the input and the result registers are not defined by SLJIT, it might
    be defined by the CPU architecture (e.g. MIPS). To satisfy these CPU
    requirements all source registers must be the result of those operations
    where this option was also set. Memory loads read 32 bit values rather
    than 64 bit ones. In other words 32 bit and 64 bit operations cannot be
-   mixed. The only exception is SLJIT_MOV32 whose source register can hold
+   mixed. The only exception is SLJIT_MOV32 which source register can hold
    any 32 or 64 bit value, and it is converted to a 32 bit compatible format
-   first. This conversion is free (no instructions are emitted) on most CPUs.
-   A 32 bit value can also be converted to a 64 bit value by SLJIT_MOV_S32
+   first. When the source and destination registers are the same, this
+   conversion is free (no instructions are emitted) on most CPUs. A 32 bit
+   value can also be converted to a 64 bit value by SLJIT_MOV_S32
    (sign extension) or SLJIT_MOV_U32 (zero extension).
 
    As for floating-point operations, this option sets 32 bit single
@@ -835,18 +887,20 @@
      SLJIT_ADD32 == (SLJIT_ADD | SLJIT_32) */
 #define SLJIT_32		0x100
 
-/* Many CPUs (x86, ARM, PPC) have status flags which can be set according
+/* Many CPUs (x86, ARM, PPC) have status flag bits which can be set according
    to the result of an operation. Other CPUs (MIPS) do not have status
-   flags, and results must be stored in registers. To cover both architecture
-   types efficiently only two flags are defined by SLJIT:
+   flag bits, and results must be stored in registers. To cover both
+   architecture types efficiently only two flags are defined by SLJIT:
 
     * Zero (equal) flag: it is set if the result is zero
-    * Variable flag: its value is defined by the last arithmetic operation
+    * Variable flag: its value is defined by the arithmetic operation
 
    SLJIT instructions can set any or both of these flags. The value of
-   these flags is undefined if the instruction does not specify their value.
-   The description of each instruction contains the list of allowed flag
-   types.
+   these flags is undefined if the instruction does not specify their
+   value. The description of each instruction contains the list of
+   allowed flag types.
+
+   Note: the logical or operation can be used to set flags.
 
    Example: SLJIT_ADD can set the Z, OVERFLOW, CARRY flags hence
 
@@ -867,32 +921,40 @@
        Sets the variable flag if unsigned overflow (carry) occurs,
        clears it otherwise.
 
-   If an instruction (e.g. SLJIT_MOV) does not modify flags the flags are
-   unchanged.
+   Certain instructions (e.g. SLJIT_MOV) does not modify flags, so
+   status flags are unchanged.
 
-   Using these flags can reduce the number of emitted instructions. E.g. a
-   fast loop can be implemented by decreasing a counter register and set the
-   zero flag to jump back if the counter register has not reached zero.
+   Example:
 
-   Motivation: although CPUs can set a large number of flags, usually their
-   values are ignored or only one of them is used. Emulating a large number
-   of flags on systems without flag register is complicated so SLJIT
-   instructions must specify the flag they want to use and only that flag
-   will be emulated. The last arithmetic instruction can be repeated if
+     sljit_op2(..., SLJIT_ADD | SLJIT_SET_Z, ...)
+     sljit_op1(..., SLJIT_MOV, ...)
+       Zero flag is set according to the result of SLJIT_ADD.
+
+     sljit_op2(..., SLJIT_ADD | SLJIT_SET_Z, ...)
+     sljit_op2(..., SLJIT_ADD, ...)
+       Zero flag has unknown value.
+
+   These flags can be used for code optimization. E.g. a fast loop can be
+   implemented by decreasing a counter register and set the zero flag
+   using a single instruction. The zero register can be used by a
+   conditional jump to restart the loop. A single comparison can set a
+   zero and less flags to check if a value is less, equal, or greater
+   than another value.
+
+   Motivation: although some CPUs can set a large number of flag bits,
+   usually their values are ignored or only a few of them are used. Emulating
+   a large number of flags on systems without a flag register is complicated
+   so SLJIT instructions must specify the flag they want to use and only
+   that flag is computed. The last arithmetic instruction can be repeated if
    multiple flags need to be checked.
 */
 
 /* Set Zero status flag. */
 #define SLJIT_SET_Z			0x0200
 /* Set the variable status flag if condition is true.
-   See comparison types. */
+   See comparison types (e.g. SLJIT_SET_LESS, SLJIT_SET_F_EQUAL). */
 #define SLJIT_SET(condition)			((condition) << 10)
 
-/* Notes:
-     - you cannot postpone conditional jump instructions except if noted that
-       the instruction does not set flags (See: SLJIT_KEEP_FLAGS).
-     - flag combinations: '|' means 'logical or'. */
-
 /* Starting index of opcodes for sljit_emit_op0. */
 #define SLJIT_OP0_BASE			0
 
@@ -943,10 +1005,12 @@
 /* Flags: - (does not modify flags)
    ENDBR32 instruction for x86-32 and ENDBR64 instruction for x86-64
    when Intel Control-flow Enforcement Technology (CET) is enabled.
-   No instruction for other architectures.  */
+   No instructions are emitted for other architectures. */
 #define SLJIT_ENDBR			(SLJIT_OP0_BASE + 8)
 /* Flags: - (may destroy flags)
-   Skip stack frames before return.  */
+   Skip stack frames before return when Intel Control-flow
+   Enforcement Technology (CET) is enabled. No instructions
+   are emitted for other architectures. */
 #define SLJIT_SKIP_FRAMES_BEFORE_RETURN	(SLJIT_OP0_BASE + 9)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op);
@@ -990,9 +1054,8 @@
 /* Flags: - (does not modify flags) */
 #define SLJIT_MOV32			(SLJIT_OP1_BASE + 7)
 /* Flags: - (does not modify flags)
-   Note: load a pointer sized data, useful on x32 (a 32 bit mode on x86-64
-         where all x64 features are available, e.g. 16 register) or similar
-         compiling modes */
+   Note: loads a pointer sized data, useful on x32 mode (a 64 bit mode
+         on x86-64 which uses 32 bit pointers) or similar compiling modes */
 #define SLJIT_MOV_P			(SLJIT_OP1_BASE + 8)
 /* Flags: Z
    Note: immediate source argument is not supported */
@@ -1003,6 +1066,11 @@
    Note: immediate source argument is not supported */
 #define SLJIT_CLZ			(SLJIT_OP1_BASE + 10)
 #define SLJIT_CLZ32			(SLJIT_CLZ | SLJIT_32)
+/* Count trailing zeroes
+   Flags: - (may destroy flags)
+   Note: immediate source argument is not supported */
+#define SLJIT_CTZ			(SLJIT_OP1_BASE + 11)
+#define SLJIT_CTZ32			(SLJIT_CTZ | SLJIT_32)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
@@ -1019,7 +1087,7 @@
 #define SLJIT_ADDC32			(SLJIT_ADDC | SLJIT_32)
 /* Flags: Z | LESS | GREATER_EQUAL | GREATER | LESS_EQUAL
           SIG_LESS | SIG_GREATER_EQUAL | SIG_GREATER
-          SIG_LESS_EQUAL | CARRY */
+          SIG_LESS_EQUAL | OVERFLOW | CARRY */
 #define SLJIT_SUB			(SLJIT_OP2_BASE + 2)
 #define SLJIT_SUB32			(SLJIT_SUB | SLJIT_32)
 /* Flags: CARRY */
@@ -1046,31 +1114,100 @@
 #define SLJIT_SHL			(SLJIT_OP2_BASE + 8)
 #define SLJIT_SHL32			(SLJIT_SHL | SLJIT_32)
 /* Flags: Z
-   Let bit_length be the length of the shift operation: 32 or 64.
-   If src2 is immediate, src2w is masked by (bit_length - 1).
-   Otherwise, if the content of src2 is outside the range from 0
-   to bit_length - 1, the result is undefined. */
-#define SLJIT_LSHR			(SLJIT_OP2_BASE + 9)
-#define SLJIT_LSHR32			(SLJIT_LSHR | SLJIT_32)
+   Same as SLJIT_SHL, except the the second operand is
+   always masked by the length of the shift operation. */
+#define SLJIT_MSHL			(SLJIT_OP2_BASE + 9)
+#define SLJIT_MSHL32			(SLJIT_MSHL | SLJIT_32)
 /* Flags: Z
    Let bit_length be the length of the shift operation: 32 or 64.
    If src2 is immediate, src2w is masked by (bit_length - 1).
    Otherwise, if the content of src2 is outside the range from 0
    to bit_length - 1, the result is undefined. */
-#define SLJIT_ASHR			(SLJIT_OP2_BASE + 10)
+#define SLJIT_LSHR			(SLJIT_OP2_BASE + 10)
+#define SLJIT_LSHR32			(SLJIT_LSHR | SLJIT_32)
+/* Flags: Z
+   Same as SLJIT_LSHR, except the the second operand is
+   always masked by the length of the shift operation. */
+#define SLJIT_MLSHR			(SLJIT_OP2_BASE + 11)
+#define SLJIT_MLSHR32			(SLJIT_MLSHR | SLJIT_32)
+/* Flags: Z
+   Let bit_length be the length of the shift operation: 32 or 64.
+   If src2 is immediate, src2w is masked by (bit_length - 1).
+   Otherwise, if the content of src2 is outside the range from 0
+   to bit_length - 1, the result is undefined. */
+#define SLJIT_ASHR			(SLJIT_OP2_BASE + 12)
 #define SLJIT_ASHR32			(SLJIT_ASHR | SLJIT_32)
+/* Flags: Z
+   Same as SLJIT_ASHR, except the the second operand is
+   always masked by the length of the shift operation. */
+#define SLJIT_MASHR			(SLJIT_OP2_BASE + 13)
+#define SLJIT_MASHR32			(SLJIT_MASHR | SLJIT_32)
+/* Flags: - (may destroy flags)
+   Let bit_length be the length of the rotate operation: 32 or 64.
+   The second operand is always masked by (bit_length - 1). */
+#define SLJIT_ROTL			(SLJIT_OP2_BASE + 14)
+#define SLJIT_ROTL32			(SLJIT_ROTL | SLJIT_32)
+/* Flags: - (may destroy flags)
+   Let bit_length be the length of the rotate operation: 32 or 64.
+   The second operand is always masked by (bit_length - 1). */
+#define SLJIT_ROTR			(SLJIT_OP2_BASE + 15)
+#define SLJIT_ROTR32			(SLJIT_ROTR | SLJIT_32)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
 
-/* The sljit_emit_op2u function is the same as sljit_emit_op2 except the result is discarded. */
+/* The sljit_emit_op2u function is the same as sljit_emit_op2
+   except the result is discarded. */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
 
+/* Emit a left or right shift operation, where the bits shifted
+   in comes from a separate source operand. All operands are
+   interpreted as unsigned integers.
+
+   In the followings the value_mask variable is 31 for 32 bit
+     operations and word_size - 1 otherwise.
+
+   op must be one of the following operations:
+     SLJIT_SHL or SLJIT_SHL32:
+       src_dst <<= src2
+       src_dst |= ((src1 >> 1) >> (src2 ^ value_mask))
+     SLJIT_MSHL or SLJIT_MSHL32:
+       src2 &= value_mask
+       perform the SLJIT_SHL or SLJIT_SHL32 operation
+     SLJIT_LSHR or SLJIT_LSHR32:
+       src_dst >>= src2
+       src_dst |= ((src1 << 1) << (src2 ^ value_mask))
+     SLJIT_MLSHR or SLJIT_MLSHR32:
+       src2 &= value_mask
+       perform the SLJIT_LSHR or SLJIT_LSHR32 operation
+
+   op can be combined (or'ed) with SLJIT_SHIFT_INTO_NON_ZERO
+
+   src_dst must be a register which content is updated after
+     the operation is completed
+   src1 / src1w contains the bits which shifted into src_dst
+   src2 / src2w contains the shift amount
+
+   Note: a rotate operation can be performed if src_dst and
+         src1 are set to the same register
+
+   Flags: - (may destroy flags) */
+
+/* The src2 contains a non-zero value. Improves the generated
+   code on certain architectures, which provides a small
+   performance improvement. */
+#define SLJIT_SHIFT_INTO_NON_ZERO	0x200
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w);
+
 /* Starting index of opcodes for sljit_emit_op2. */
 #define SLJIT_OP_SRC_BASE		128
 
@@ -1116,8 +1253,8 @@
 #define SLJIT_MOV_F64			(SLJIT_FOP1_BASE + 0)
 #define SLJIT_MOV_F32			(SLJIT_MOV_F64 | SLJIT_32)
 /* Convert opcodes: CONV[DST_TYPE].FROM[SRC_TYPE]
-   SRC/DST TYPE can be: D - double, S - single, W - signed word, I - signed int
-   Rounding mode when the destination is W or I: round towards zero. */
+   SRC/DST TYPE can be: F64, F32, S32, SW
+   Rounding mode when the destination is SW or S32: round towards zero. */
 /* Flags: - (may destroy flags) */
 #define SLJIT_CONV_F64_FROM_F32		(SLJIT_FOP1_BASE + 1)
 #define SLJIT_CONV_F32_FROM_F64		(SLJIT_CONV_F64_FROM_F32 | SLJIT_32)
@@ -1133,7 +1270,7 @@
 /* Flags: - (may destroy flags) */
 #define SLJIT_CONV_F64_FROM_S32		(SLJIT_FOP1_BASE + 5)
 #define SLJIT_CONV_F32_FROM_S32		(SLJIT_CONV_F64_FROM_S32 | SLJIT_32)
-/* Note: dst is the left and src is the right operand for SLJIT_CMPD.
+/* Note: dst is the left and src is the right operand for SLJIT_CMP_F32/64.
    Flags: EQUAL_F | LESS_F | GREATER_EQUAL_F | GREATER_F | LESS_EQUAL_F */
 #define SLJIT_CMP_F64			(SLJIT_FOP1_BASE + 6)
 #define SLJIT_CMP_F32			(SLJIT_CMP_F64 | SLJIT_32)
@@ -1202,46 +1339,75 @@
 #define SLJIT_SET_OVERFLOW		SLJIT_SET(SLJIT_OVERFLOW)
 #define SLJIT_NOT_OVERFLOW		11
 
-/* Unlike other flags, sljit_emit_jump may destroy this flag. */
+/* Unlike other flags, sljit_emit_jump may destroy the carry flag. */
 #define SLJIT_CARRY			12
 #define SLJIT_SET_CARRY			SLJIT_SET(SLJIT_CARRY)
 #define SLJIT_NOT_CARRY			13
 
-/* Floating point comparison types. */
-#define SLJIT_EQUAL_F64			14
-#define SLJIT_EQUAL_F32			(SLJIT_EQUAL_F64 | SLJIT_32)
-#define SLJIT_SET_EQUAL_F		SLJIT_SET(SLJIT_EQUAL_F64)
-#define SLJIT_NOT_EQUAL_F64		15
-#define SLJIT_NOT_EQUAL_F32		(SLJIT_NOT_EQUAL_F64 | SLJIT_32)
-#define SLJIT_SET_NOT_EQUAL_F		SLJIT_SET(SLJIT_NOT_EQUAL_F64)
-#define SLJIT_LESS_F64			16
-#define SLJIT_LESS_F32			(SLJIT_LESS_F64 | SLJIT_32)
-#define SLJIT_SET_LESS_F		SLJIT_SET(SLJIT_LESS_F64)
-#define SLJIT_GREATER_EQUAL_F64		17
-#define SLJIT_GREATER_EQUAL_F32		(SLJIT_GREATER_EQUAL_F64 | SLJIT_32)
-#define SLJIT_SET_GREATER_EQUAL_F	SLJIT_SET(SLJIT_GREATER_EQUAL_F64)
-#define SLJIT_GREATER_F64		18
-#define SLJIT_GREATER_F32		(SLJIT_GREATER_F64 | SLJIT_32)
-#define SLJIT_SET_GREATER_F		SLJIT_SET(SLJIT_GREATER_F64)
-#define SLJIT_LESS_EQUAL_F64		19
-#define SLJIT_LESS_EQUAL_F32		(SLJIT_LESS_EQUAL_F64 | SLJIT_32)
-#define SLJIT_SET_LESS_EQUAL_F		SLJIT_SET(SLJIT_LESS_EQUAL_F64)
-#define SLJIT_UNORDERED_F64		20
-#define SLJIT_UNORDERED_F32		(SLJIT_UNORDERED_F64 | SLJIT_32)
-#define SLJIT_SET_UNORDERED_F		SLJIT_SET(SLJIT_UNORDERED_F64)
-#define SLJIT_ORDERED_F64		21
-#define SLJIT_ORDERED_F32		(SLJIT_ORDERED_F64 | SLJIT_32)
-#define SLJIT_SET_ORDERED_F		SLJIT_SET(SLJIT_ORDERED_F64)
+/* Basic floating point comparison types.
+
+   Note: when the comparison result is unordered, their behaviour is unspecified. */
+
+#define SLJIT_F_EQUAL				14
+#define SLJIT_SET_F_EQUAL			SLJIT_SET(SLJIT_F_EQUAL)
+#define SLJIT_F_NOT_EQUAL			15
+#define SLJIT_SET_F_NOT_EQUAL			SLJIT_SET(SLJIT_F_NOT_EQUAL)
+#define SLJIT_F_LESS				16
+#define SLJIT_SET_F_LESS			SLJIT_SET(SLJIT_F_LESS)
+#define SLJIT_F_GREATER_EQUAL			17
+#define SLJIT_SET_F_GREATER_EQUAL		SLJIT_SET(SLJIT_F_GREATER_EQUAL)
+#define SLJIT_F_GREATER				18
+#define SLJIT_SET_F_GREATER			SLJIT_SET(SLJIT_F_GREATER)
+#define SLJIT_F_LESS_EQUAL			19
+#define SLJIT_SET_F_LESS_EQUAL			SLJIT_SET(SLJIT_F_LESS_EQUAL)
+
+/* Jumps when either argument contains a NaN value. */
+#define SLJIT_UNORDERED				20
+#define SLJIT_SET_UNORDERED			SLJIT_SET(SLJIT_UNORDERED)
+/* Jumps when neither argument contains a NaN value. */
+#define SLJIT_ORDERED				21
+#define SLJIT_SET_ORDERED			SLJIT_SET(SLJIT_ORDERED)
+
+/* Ordered / unordered floating point comparison types.
+
+   Note: each comparison type has an ordered and unordered form. Some
+         architectures supports only either of them (see: sljit_cmp_info). */
+
+#define SLJIT_ORDERED_EQUAL			22
+#define SLJIT_SET_ORDERED_EQUAL			SLJIT_SET(SLJIT_ORDERED_EQUAL)
+#define SLJIT_UNORDERED_OR_NOT_EQUAL		23
+#define SLJIT_SET_UNORDERED_OR_NOT_EQUAL	SLJIT_SET(SLJIT_UNORDERED_OR_NOT_EQUAL)
+#define SLJIT_ORDERED_LESS			24
+#define SLJIT_SET_ORDERED_LESS			SLJIT_SET(SLJIT_ORDERED_LESS)
+#define SLJIT_UNORDERED_OR_GREATER_EQUAL	25
+#define SLJIT_SET_UNORDERED_OR_GREATER_EQUAL	SLJIT_SET(SLJIT_UNORDERED_OR_GREATER_EQUAL)
+#define SLJIT_ORDERED_GREATER			26
+#define SLJIT_SET_ORDERED_GREATER		SLJIT_SET(SLJIT_ORDERED_GREATER)
+#define SLJIT_UNORDERED_OR_LESS_EQUAL		27
+#define SLJIT_SET_UNORDERED_OR_LESS_EQUAL	SLJIT_SET(SLJIT_UNORDERED_OR_LESS_EQUAL)
+
+#define SLJIT_UNORDERED_OR_EQUAL		28
+#define SLJIT_SET_UNORDERED_OR_EQUAL		SLJIT_SET(SLJIT_UNORDERED_OR_EQUAL)
+#define SLJIT_ORDERED_NOT_EQUAL			29
+#define SLJIT_SET_ORDERED_NOT_EQUAL		SLJIT_SET(SLJIT_ORDERED_NOT_EQUAL)
+#define SLJIT_UNORDERED_OR_LESS			30
+#define SLJIT_SET_UNORDERED_OR_LESS		SLJIT_SET(SLJIT_UNORDERED_OR_LESS)
+#define SLJIT_ORDERED_GREATER_EQUAL		31
+#define SLJIT_SET_ORDERED_GREATER_EQUAL		SLJIT_SET(SLJIT_ORDERED_GREATER_EQUAL)
+#define SLJIT_UNORDERED_OR_GREATER		32
+#define SLJIT_SET_UNORDERED_OR_GREATER		SLJIT_SET(SLJIT_UNORDERED_OR_GREATER)
+#define SLJIT_ORDERED_LESS_EQUAL		33
+#define SLJIT_SET_ORDERED_LESS_EQUAL		SLJIT_SET(SLJIT_ORDERED_LESS_EQUAL)
 
 /* Unconditional jump types. */
-#define SLJIT_JUMP			22
-	/* Fast calling method. See sljit_emit_fast_enter / SLJIT_FAST_RETURN. */
-#define SLJIT_FAST_CALL			23
-	/* Called function must be declared with the SLJIT_FUNC attribute. */
-#define SLJIT_CALL			24
-	/* Called function must be declared with cdecl attribute.
-	   This is the default attribute for C functions. */
-#define SLJIT_CALL_CDECL		25
+#define SLJIT_JUMP			34
+/* Fast calling method. See sljit_emit_fast_enter / SLJIT_FAST_RETURN. */
+#define SLJIT_FAST_CALL			35
+/* Default C calling convention. */
+#define SLJIT_CALL			36
+/* Called function must be compiled by SLJIT.
+   See SLJIT_ENTER_REG_ARG option. */
+#define SLJIT_CALL_REG_ARG		37
 
 /* The target can be changed during runtime (see: sljit_set_jump_addr). */
 #define SLJIT_REWRITABLE_JUMP		0x1000
@@ -1249,11 +1415,8 @@
    the called function returns to the caller of the current function. The
    stack usage is reduced before the call, but it is not necessarily reduced
    to zero. In the latter case the compiler needs to allocate space for some
-   arguments and the return register must be kept as well.
-
-   This feature is highly experimental and not supported on SPARC platform
-   at the moment. */
-#define SLJIT_CALL_RETURN			0x2000
+   arguments and the return address must be stored on the stack as well. */
+#define SLJIT_CALL_RETURN		0x2000
 
 /* Emit a jump instruction. The destination is not set, only the type of the jump.
     type must be between SLJIT_EQUAL and SLJIT_FAST_CALL
@@ -1263,18 +1426,18 @@
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type);
 
 /* Emit a C compiler (ABI) compatible function call.
-    type must be SLJIT_CALL or SLJIT_CALL_CDECL
-    type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP and SLJIT_CALL_RETURN
-    arg_types is the combination of SLJIT_RET / SLJIT_ARGx (SLJIT_DEF_RET / SLJIT_DEF_ARGx) macros
+    type must be SLJIT_CALL or SLJIT_CALL_REG_ARG
+    type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP and/or SLJIT_CALL_RETURN
+    arg_types can be specified by SLJIT_ARGSx (SLJIT_ARG_RETURN / SLJIT_ARG_VALUE) macros
 
    Flags: destroy all flags. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types);
 
 /* Basic arithmetic comparison. In most architectures it is implemented as
-   an compare operation followed by a sljit_emit_jump. However some
-   architectures (i.e: ARM64 or MIPS) may employ special optimizations here.
-   It is suggested to use this comparison form when appropriate.
-    type must be between SLJIT_EQUAL and SLJIT_I_SIG_LESS_EQUAL
+   a compare operation followed by a sljit_emit_jump. However some
+   architectures (i.e: ARM64 or MIPS) may employ special optimizations
+   here. It is suggested to use this comparison form when appropriate.
+    type must be between SLJIT_EQUAL and SLJIT_SIG_LESS_EQUAL
     type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP
 
    Flags: may destroy flags. */
@@ -1283,15 +1446,14 @@
 	sljit_s32 src2, sljit_sw src2w);
 
 /* Basic floating point comparison. In most architectures it is implemented as
-   an SLJIT_FCMP operation (setting appropriate flags) followed by a
+   a SLJIT_CMP_F32/64 operation (setting appropriate flags) followed by a
    sljit_emit_jump. However some architectures (i.e: MIPS) may employ
    special optimizations here. It is suggested to use this comparison form
    when appropriate.
-    type must be between SLJIT_EQUAL_F64 and SLJIT_ORDERED_F32
+    type must be between SLJIT_F_EQUAL and SLJIT_ORDERED_LESS_EQUAL
     type can be combined (or'ed) with SLJIT_REWRITABLE_JUMP
    Flags: destroy flags.
-   Note: if either operand is NaN, the behaviour is undefined for
-         types up to SLJIT_S_LESS_EQUAL. */
+   Note: when an operand is NaN the behaviour depends on the comparison type. */
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_fcmp(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);
@@ -1312,22 +1474,22 @@
 /* Emit a C compiler (ABI) compatible function call.
    Direct form: set src to SLJIT_IMM() and srcw to the address
    Indirect form: any other valid addressing mode
-    type must be SLJIT_CALL or SLJIT_CALL_CDECL
+    type must be SLJIT_CALL or SLJIT_CALL_REG_ARG
     type can be combined (or'ed) with SLJIT_CALL_RETURN
-    arg_types is the combination of SLJIT_RET / SLJIT_ARGx (SLJIT_DEF_RET / SLJIT_DEF_ARGx) macros
+    arg_types can be specified by SLJIT_ARGSx (SLJIT_ARG_RETURN / SLJIT_ARG_VALUE) macros
 
    Flags: destroy all flags. */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types, sljit_s32 src, sljit_sw srcw);
 
-/* Perform the operation using the conditional flags as the second argument.
-   Type must always be between SLJIT_EQUAL and SLJIT_ORDERED_F64. The value
-   represented by the type is 1, if the condition represented by the type
-   is fulfilled, and 0 otherwise.
+/* Perform an operation using the conditional flags as the second argument.
+   Type must always be between SLJIT_EQUAL and SLJIT_ORDERED_LESS_EQUAL.
+   The value represented by the type is 1, if the condition represented
+   by the type is fulfilled, and 0 otherwise.
 
-   If op == SLJIT_MOV, SLJIT_MOV32:
+   When op is SLJIT_MOV or SLJIT_MOV32:
      Set dst to the value represented by the type (0 or 1).
      Flags: - (does not modify flags)
-   If op == SLJIT_OR, op == SLJIT_AND, op == SLJIT_XOR
+   When op is SLJIT_AND, SLJIT_AND32, SLJIT_OR, SLJIT_OR32, SLJIT_XOR, or SLJIT_XOR32
      Performs the binary operation using dst as the first, and the value
      represented by type as the second argument. Result is written into dst.
      Flags: Z (may destroy flags) */
@@ -1339,69 +1501,139 @@
    if the condition is satisfied. Unlike other arithmetic operations this
    instruction does not support memory access.
 
-   type must be between SLJIT_EQUAL and SLJIT_ORDERED_F64
-   dst_reg must be a valid register and it can be combined
-      with SLJIT_32 to perform a 32 bit arithmetic operation
-   src must be register or immediate (SLJIT_IMM)
+   type must be between SLJIT_EQUAL and SLJIT_ORDERED_LESS_EQUAL
+   type can be combined (or'ed) with SLJIT_32
+   dst_reg must be a valid register
+   src must be a valid register or immediate (SLJIT_IMM)
 
    Flags: - (does not modify flags) */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw);
 
+/* The following flags are used by sljit_emit_mem(), sljit_emit_mem_update(),
+   sljit_emit_fmem(), and sljit_emit_fmem_update(). */
+
+/* Memory load operation. This is the default. */
+#define SLJIT_MEM_LOAD		0x000000
+/* Memory store operation. */
+#define SLJIT_MEM_STORE		0x000200
+
 /* The following flags are used by sljit_emit_mem() and sljit_emit_fmem(). */
 
+/* Load or stora data from an unaligned (byte aligned) address. */
+#define SLJIT_MEM_UNALIGNED	0x000400
+/* Load or stora data from a 16 bit aligned address. */
+#define SLJIT_MEM_UNALIGNED_16	0x000800
+/* Load or stora data from a 32 bit aligned address. */
+#define SLJIT_MEM_UNALIGNED_32	0x001000
+
+/* The following flags are used by sljit_emit_mem_update(),
+   and sljit_emit_fmem_update(). */
+
+/* Base register is updated before the memory access (default). */
+#define SLJIT_MEM_PRE		0x000000
+/* Base register is updated after the memory access. */
+#define SLJIT_MEM_POST		0x000400
+
 /* When SLJIT_MEM_SUPP is passed, no instructions are emitted.
    Instead the function returns with SLJIT_SUCCESS if the instruction
    form is supported and SLJIT_ERR_UNSUPPORTED otherwise. This flag
    allows runtime checking of available instruction forms. */
-#define SLJIT_MEM_SUPP		0x0200
-/* Memory load operation. This is the default. */
-#define SLJIT_MEM_LOAD		0x0000
-/* Memory store operation. */
-#define SLJIT_MEM_STORE		0x0400
-/* Base register is updated before the memory access. */
-#define SLJIT_MEM_PRE		0x0800
-/* Base register is updated after the memory access. */
-#define SLJIT_MEM_POST		0x1000
+#define SLJIT_MEM_SUPP		0x000800
 
-/* Emit a single memory load or store with update instruction. When the
-   requested instruction form is not supported by the CPU, it returns
-   with SLJIT_ERR_UNSUPPORTED instead of emulating the instruction. This
-   allows specializing tight loops based on the supported instruction
-   forms (see SLJIT_MEM_SUPP flag).
+/* The sljit_emit_mem emits instructions for various memory operations:
+
+   When SLJIT_MEM_UNALIGNED / SLJIT_MEM_UNALIGNED_16 /
+        SLJIT_MEM_UNALIGNED_32 is set in type argument:
+     Emit instructions for unaligned memory loads or stores. When
+     SLJIT_UNALIGNED is not defined, the only way to access unaligned
+     memory data is using sljit_emit_mem. Otherwise all operations (e.g.
+     sljit_emit_op1/2, or sljit_emit_fop1/2) supports unaligned access.
+     In general, the performance of unaligned memory accesses are often
+     lower than aligned and should be avoided.
+
+   When a pair of registers is passed in reg argument:
+     Emit instructions for moving data between a register pair and
+     memory. The register pair can be specified by the SLJIT_REG_PAIR
+     macro. The first register is loaded from or stored into the
+     location specified by the mem/memw arguments, and the end address
+     of this operation is the starting address of the data transfer
+     between the second register and memory. The type argument must
+     be SLJIT_MOV. The SLJIT_MEM_UNALIGNED* options are allowed for
+     this operation.
 
    type must be between SLJIT_MOV and SLJIT_MOV_P and can be
-     combined with SLJIT_MEM_* flags. Either SLJIT_MEM_PRE
-     or SLJIT_MEM_POST must be specified.
-   reg is the source or destination register, and must be
-     different from the base register of the mem operand
-   mem must be a SLJIT_MEM1() or SLJIT_MEM2() operand
+     combined (or'ed) with SLJIT_MEM_* flags
+   reg is a register or register pair, which is the source or
+     destination of the operation
+   mem must be a memory operand
 
    Flags: - (does not modify flags) */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw);
 
+/* Emit a single memory load or store with update instruction.
+   When the requested instruction form is not supported by the CPU,
+   it returns with SLJIT_ERR_UNSUPPORTED instead of emulating the
+   instruction. This allows specializing tight loops based on
+   the supported instruction forms (see SLJIT_MEM_SUPP flag).
+   Absolute address (SLJIT_MEM0) forms are never supported
+   and the base (first) register specified by the mem argument
+   must not be SLJIT_SP and must also be different from the
+   register specified by the reg argument.
+
+   type must be between SLJIT_MOV and SLJIT_MOV_P and can be
+     combined (or'ed) with SLJIT_MEM_* flags
+   reg is the source or destination register of the operation
+   mem must be a memory operand
+
+   Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw);
+
 /* Same as sljit_emit_mem except the followings:
 
+   Loading or storing a pair of registers is not supported.
+
    type must be SLJIT_MOV_F64 or SLJIT_MOV_F32 and can be
-     combined with SLJIT_MEM_* flags. Either SLJIT_MEM_PRE
-     or SLJIT_MEM_POST must be specified.
-   freg is the source or destination floating point register */
+     combined (or'ed) with SLJIT_MEM_* flags.
+   freg is the source or destination floating point register
+     of the operation
+   mem must be a memory operand
+
+   Flags: - (does not modify flags) */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw);
 
-/* Copies the base address of SLJIT_SP + offset to dst. The offset can be
-   anything to negate the effect of relative addressing. For example if an
-   array of sljit_sw values is stored on the stack from offset 0x40, and R0
-   contains the offset of an array item plus 0x120, this item can be
-   overwritten by two SLJIT instructions:
+/* Same as sljit_emit_mem_update except the followings:
+
+   type must be SLJIT_MOV_F64 or SLJIT_MOV_F32 and can be
+     combined (or'ed) with SLJIT_MEM_* flags
+   freg is the source or destination floating point register
+     of the operation
+   mem must be a memory operand
+
+   Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw);
+
+/* Copies the base address of SLJIT_SP + offset to dst. The offset can
+   represent the starting address of a value in the local data (stack).
+   The offset is not limited by the local data limits, it can be any value.
+   For example if an array of bytes are stored on the stack from
+   offset 0x40, and R0 contains the offset of an array item plus 0x120,
+   this item can be changed by two SLJIT instructions:
 
    sljit_get_local_base(compiler, SLJIT_R1, 0, 0x40 - 0x120);
-   sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0, SLJIT_IMM, 0x5);
+   sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(SLJIT_R1, SLJIT_R0), 0, SLJIT_IMM, 0x5);
 
    Flags: - (may destroy flags) */
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset);
@@ -1430,15 +1662,67 @@
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset);
 
 /* --------------------------------------------------------------------- */
+/*  CPU specific functions                                               */
+/* --------------------------------------------------------------------- */
+
+/* The following function is a helper function for sljit_emit_op_custom.
+   It returns with the real machine register index ( >=0 ) of any SLJIT_R,
+   SLJIT_S and SLJIT_SP registers.
+
+   Note: it returns with -1 for virtual registers (only on x86-32). */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg);
+
+/* The following function is a helper function for sljit_emit_op_custom.
+   It returns with the real machine register ( >= 0 ) index of any SLJIT_FR,
+   and SLJIT_FS register.
+
+   Note: the index is always an even number on ARM-32, MIPS. */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg);
+
+/* Any instruction can be inserted into the instruction stream by
+   sljit_emit_op_custom. It has a similar purpose as inline assembly.
+   The size parameter must match to the instruction size of the target
+   architecture:
+
+         x86: 0 < size <= 15. The instruction argument can be byte aligned.
+      Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
+              if size == 4, the instruction argument must be 4 byte aligned.
+   Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
+	void *instruction, sljit_u32 size);
+
+/* Flags were set by a 32 bit operation. */
+#define SLJIT_CURRENT_FLAGS_32			SLJIT_32
+
+/* Flags were set by an ADD or ADDC operations. */
+#define SLJIT_CURRENT_FLAGS_ADD			0x01
+/* Flags were set by a SUB, SUBC, or NEG operation. */
+#define SLJIT_CURRENT_FLAGS_SUB			0x02
+
+/* Flags were set by sljit_emit_op2u with SLJIT_SUB opcode.
+   Must be combined with SLJIT_CURRENT_FLAGS_SUB. */
+#define SLJIT_CURRENT_FLAGS_COMPARE		0x04
+
+/* Define the currently available CPU status flags. It is usually used after
+   an sljit_emit_label or sljit_emit_op_custom operations to define which CPU
+   status flags are available.
+
+   The current_flags must be a valid combination of SLJIT_SET_* and
+   SLJIT_CURRENT_FLAGS_* constants. */
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler,
+	sljit_s32 current_flags);
+
+/* --------------------------------------------------------------------- */
 /*  Miscellaneous utility functions                                      */
 /* --------------------------------------------------------------------- */
 
-#define SLJIT_MAJOR_VERSION	0
-#define SLJIT_MINOR_VERSION	94
-
 /* Get the human readable name of the platform. Can be useful on platforms
-   like ARM, where ARM and Thumb2 functions can be mixed, and
-   it is useful to know the type of the code generator. */
+   like ARM, where ARM and Thumb2 functions can be mixed, and it is useful
+   to know the type of the code generator. */
 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void);
 
 /* Portable helper function to get an offset of a member. */
@@ -1532,60 +1816,6 @@
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_unused_memory_exec(void);
 #endif
 
-/* --------------------------------------------------------------------- */
-/*  CPU specific functions                                               */
-/* --------------------------------------------------------------------- */
-
-/* The following function is a helper function for sljit_emit_op_custom.
-   It returns with the real machine register index ( >=0 ) of any SLJIT_R,
-   SLJIT_S and SLJIT_SP registers.
-
-   Note: it returns with -1 for virtual registers (only on x86-32). */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg);
-
-/* The following function is a helper function for sljit_emit_op_custom.
-   It returns with the real machine register index of any SLJIT_FLOAT register.
-
-   Note: the index is always an even number on ARM (except ARM-64), MIPS, and SPARC. */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg);
-
-/* Any instruction can be inserted into the instruction stream by
-   sljit_emit_op_custom. It has a similar purpose as inline assembly.
-   The size parameter must match to the instruction size of the target
-   architecture:
-
-         x86: 0 < size <= 15. The instruction argument can be byte aligned.
-      Thumb2: if size == 2, the instruction argument must be 2 byte aligned.
-              if size == 4, the instruction argument must be 4 byte aligned.
-   Otherwise: size must be 4 and instruction argument must be 4 byte aligned. */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
-	void *instruction, sljit_u32 size);
-
-/* Flags were set by a 32 bit operation. */
-#define SLJIT_CURRENT_FLAGS_32			SLJIT_32
-
-/* Flags were set by an ADD or ADDC operations. */
-#define SLJIT_CURRENT_FLAGS_ADD			0x01
-/* Flags were set by a SUB, SUBC, or NEG operation. */
-#define SLJIT_CURRENT_FLAGS_SUB			0x02
-
-/* Flags were set by sljit_emit_op2u with SLJIT_SUB opcode.
-   Must be combined with SLJIT_CURRENT_FLAGS_SUB. */
-#define SLJIT_CURRENT_FLAGS_COMPARE		0x04
-
-/* Define the currently available CPU status flags. It is usually used after
-   an sljit_emit_label or sljit_emit_op_custom operations to define which CPU
-   status flags are available.
-
-   The current_flags must be a valid combination of SLJIT_SET_* and
-   SLJIT_CURRENT_FLAGS_* constants. */
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler,
-	sljit_s32 current_flags);
-
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/sljit/sljitNativeARM_32.c b/src/sljit/sljitNativeARM_32.c
index 7b87f59..54b8ade 100644
--- a/src/sljit/sljitNativeARM_32.c
+++ b/src/sljit/sljitNativeARM_32.c
@@ -100,6 +100,8 @@
 #define CMP		0xe1400000
 #define BKPT		0xe1200070
 #define EOR		0xe0200000
+#define LDR		0xe5100000
+#define LDR_POST	0xe4100000
 #define MOV		0xe1a00000
 #define MUL		0xe0000090
 #define MVN		0xe1e00000
@@ -107,10 +109,12 @@
 #define ORR		0xe1800000
 #define PUSH		0xe92d0000
 #define POP		0xe8bd0000
+#define RBIT		0xe6ff0f30
 #define RSB		0xe0600000
 #define RSC		0xe0e00000
 #define SBC		0xe0c00000
 #define SMULL		0xe0c00090
+#define STR		0xe5000000
 #define SUB		0xe0400000
 #define TST		0xe1000000
 #define UMULL		0xe0800090
@@ -564,6 +568,7 @@
 
 static sljit_uw get_imm(sljit_uw imm);
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg, sljit_uw imm);
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg);
 
 static SLJIT_INLINE void inline_set_const(sljit_uw addr, sljit_sw executable_offset, sljit_uw new_constant, sljit_s32 flush_cache)
 {
@@ -955,12 +960,19 @@
 #endif
 
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
+	case SLJIT_HAS_CTZ:
 	case SLJIT_HAS_PREFETCH:
 #endif
 		return 1;
 
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	case SLJIT_HAS_CTZ:
+		return 2;
+#endif
+
 	default:
 		return 0;
 	}
@@ -1049,7 +1061,8 @@
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
 	sljit_uw imm, offset;
-	sljit_s32 i, tmp, size, word_arg_count, saved_arg_count;
+	sljit_s32 i, tmp, size, word_arg_count;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 #ifdef __SOFTFP__
 	sljit_u32 float_arg_count;
 #else
@@ -1065,7 +1078,7 @@
 	imm = 0;
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--)
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--)
 		imm |= (sljit_uw)1 << reg_map[i];
 
 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--)
@@ -1082,7 +1095,7 @@
 		FAIL_IF(push_inst(compiler, 0xe52d0004 | RD(TMP_REG2)));
 
 	/* Stack must be aligned to 8 bytes: */
-	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1);
 
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((size & SSIZE_OF(sw)) != 0) {
@@ -1103,6 +1116,9 @@
 	local_size = ((size + local_size + 0x7) & ~0x7) - size;
 	compiler->local_size = local_size;
 
+	if (options & SLJIT_ENTER_REG_ARG)
+		arg_types = 0;
+
 	arg_types >>= SLJIT_ARG_SHIFT;
 	word_arg_count = 0;
 	saved_arg_count = 0;
@@ -1148,8 +1164,7 @@
 			if (offset < 4 * sizeof(sljit_sw))
 				FAIL_IF(push_inst(compiler, MOV | RD(tmp) | (offset >> 2)));
 			else
-				FAIL_IF(push_inst(compiler, data_transfer_insts[WORD_SIZE | LOAD_DATA] | 0x800000
-						| RN(SLJIT_SP) | RD(tmp) | (offset + (sljit_uw)size - 4 * sizeof(sljit_sw))));
+				FAIL_IF(push_inst(compiler, LDR | 0x800000 | RN(SLJIT_SP) | RD(tmp) | (offset + (sljit_uw)size - 4 * sizeof(sljit_sw))));
 			break;
 		}
 
@@ -1217,7 +1232,7 @@
 	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
 
 	if ((size & SSIZE_OF(sw)) != 0 && (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG))
 		size += SSIZE_OF(sw);
@@ -1231,8 +1246,11 @@
 	sljit_uw imm2 = get_imm(imm);
 
 	if (imm2 == 0) {
-		FAIL_IF(load_immediate(compiler, TMP_REG2, imm));
-		imm2 = RM(TMP_REG2);
+		imm2 = (imm & ~(sljit_uw)0x3ff) >> 10;
+		imm = (imm & 0x3ff) >> 2;
+
+		FAIL_IF(push_inst(compiler, ADD | SRC2_IMM | RD(SLJIT_SP) | RN(SLJIT_SP) | 0xb00 | imm2));
+		return push_inst(compiler, ADD | SRC2_IMM | RD(SLJIT_SP) | RN(SLJIT_SP) | 0xf00 | (imm & 0xff));
 	}
 
 	return push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | imm2);
@@ -1241,10 +1259,11 @@
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size)
 {
 	sljit_s32 local_size, fscratches, fsaveds, i, tmp;
+	sljit_s32 restored_reg = 0;
 	sljit_s32 lr_dst = TMP_PC;
-	sljit_uw reg_list;
+	sljit_uw reg_list = 0;
 
-	SLJIT_ASSERT(reg_map[TMP_REG2] == 14);
+	SLJIT_ASSERT(reg_map[TMP_REG2] == 14 && frame_size <= 128);
 
 	local_size = compiler->local_size;
 	fscratches = compiler->fscratches;
@@ -1269,45 +1288,84 @@
 	if (frame_size < 0) {
 		lr_dst = TMP_REG2;
 		frame_size = 0;
-	} else if (frame_size > 0)
+	} else if (frame_size > 0) {
+		SLJIT_ASSERT(frame_size == 1 || (frame_size & 0x7) == 0);
 		lr_dst = 0;
+		frame_size &= ~0x7;
+	}
 
-	reg_list = 0;
 	if (lr_dst != 0)
 		reg_list |= (sljit_uw)1 << reg_map[lr_dst];
 
 	tmp = SLJIT_S0 - compiler->saveds;
-	for (i = SLJIT_S0; i > tmp; i--)
-		reg_list |= (sljit_uw)1 << reg_map[i];
+	i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	if (tmp < i) {
+		restored_reg = i;
+		do {
+			reg_list |= (sljit_uw)1 << reg_map[i];
+		} while (--i > tmp);
+	}
 
-	for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--)
-		reg_list |= (sljit_uw)1 << reg_map[i];
+	i = compiler->scratches;
+	if (i >= SLJIT_FIRST_SAVED_REG) {
+		restored_reg = i;
+		do {
+			reg_list |= (sljit_uw)1 << reg_map[i];
+		} while (--i >= SLJIT_FIRST_SAVED_REG);
+	}
+
+	if (lr_dst == TMP_REG2 && reg_list == 0) {
+		restored_reg = TMP_REG2;
+		lr_dst = 0;
+	}
 
 	if (lr_dst == 0 && (reg_list & (reg_list - 1)) == 0) {
 		/* The local_size does not include the saved registers. */
-		local_size += SSIZE_OF(sw);
+		tmp = 0;
+		if (reg_list != 0) {
+			tmp = 2;
+			if (local_size <= 0xfff) {
+				if (local_size == 0) {
+					SLJIT_ASSERT(restored_reg != TMP_REG2);
+					if (frame_size == 0)
+						return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(restored_reg) | 0x800008);
+					if (frame_size > 2 * SSIZE_OF(sw))
+						return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)(frame_size - (2 * SSIZE_OF(sw))));
+				}
 
-		if (reg_list != 0)
-			local_size += SSIZE_OF(sw);
+				FAIL_IF(push_inst(compiler, LDR | 0x800000 | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)local_size));
+				tmp = 1;
+			} else if (frame_size == 0) {
+				frame_size = (restored_reg == TMP_REG2) ? SSIZE_OF(sw) : 2 * SSIZE_OF(sw);
+				tmp = 3;
+			}
+
+			/* Place for the saved register. */
+			if (restored_reg != TMP_REG2)
+				local_size += SSIZE_OF(sw);
+		}
+
+		/* Place for the lr register. */
+		local_size += SSIZE_OF(sw);
 
 		if (frame_size > local_size)
 			FAIL_IF(push_inst(compiler, SUB | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 25) | (sljit_uw)(frame_size - local_size)));
 		else if (frame_size < local_size)
 			FAIL_IF(emit_add_sp(compiler, (sljit_uw)(local_size - frame_size)));
 
-		if (reg_list == 0)
+		if (tmp <= 1)
 			return SLJIT_SUCCESS;
 
-		if (compiler->saveds > 0) {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[SLJIT_S0]));
-			lr_dst = SLJIT_S0;
-		} else {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[SLJIT_FIRST_SAVED_REG]));
-			lr_dst = SLJIT_FIRST_SAVED_REG;
+		if (tmp == 2) {
+			frame_size -= SSIZE_OF(sw);
+			if (restored_reg != TMP_REG2)
+				frame_size -= SSIZE_OF(sw);
+
+			return push_inst(compiler, LDR | 0x800000 | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)frame_size);
 		}
 
-		return push_inst(compiler, data_transfer_insts[WORD_SIZE | LOAD_DATA] | 0x800000
-			| RN(SLJIT_SP) | RD(lr_dst) | (sljit_uw)(frame_size - 2 * SSIZE_OF(sw)));
+		tmp = (restored_reg == TMP_REG2) ? 0x800004 : 0x800008;
+		return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(restored_reg) | (sljit_uw)tmp);
 	}
 
 	if (local_size > 0)
@@ -1320,13 +1378,18 @@
 		SLJIT_ASSERT(lr_dst != 0);
 		SLJIT_ASSERT(reg_list == (sljit_uw)1 << reg_map[lr_dst]);
 
-		return push_inst(compiler, 0xe49d0004 | RD(lr_dst));
+		return push_inst(compiler, LDR_POST | RN(SLJIT_SP) | RD(lr_dst) | 0x800004);
 	}
 
 	FAIL_IF(push_inst(compiler, POP | reg_list));
+
 	if (frame_size > 0)
 		return push_inst(compiler, SUB | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 25) | ((sljit_uw)frame_size - sizeof(sljit_sw)));
-	return SLJIT_SUCCESS;
+
+	if (lr_dst != 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 25) | sizeof(sljit_sw));
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
@@ -1337,28 +1400,38 @@
 	return emit_stack_frame_release(compiler, 0);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, src, srcw, TMP_REG1));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | RM(src)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
 
-#define EMIT_SHIFT_INS_AND_RETURN(opcode) \
-	SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM)); \
-	if (compiler->shift_imm != 0x20) { \
-		SLJIT_ASSERT(src1 == TMP_REG1); \
-		SLJIT_ASSERT(!(flags & ARGS_SWAPPED)); \
-		\
-		if (compiler->shift_imm != 0) \
-			return push_inst(compiler, MOV | (flags & SET_FLAGS) | \
-				RD(dst) | (compiler->shift_imm << 7) | (opcode << 5) | RM(src2)); \
-		return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) | RM(src2)); \
-	} \
-	return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) \
-		| RM8((flags & ARGS_SWAPPED) ? src1 : src2) | (sljit_uw)(opcode << 5) \
-		| 0x10 | RM((flags & ARGS_SWAPPED) ? src2 : src1));
-
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_uw dst, sljit_uw src1, sljit_uw src2)
 {
+	sljit_s32 is_masked;
+	sljit_uw shift_type;
+
 	switch (GET_OPCODE(op)) {
 	case SLJIT_MOV:
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
@@ -1413,11 +1486,24 @@
 		return push_inst(compiler, MVN | (flags & SET_FLAGS) | RD(dst) | RM(src2));
 
 	case SLJIT_CLZ:
-		SLJIT_ASSERT(!(flags & INV_IMM));
-		SLJIT_ASSERT(!(src2 & SRC2_IMM));
+		SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM));
 		FAIL_IF(push_inst(compiler, CLZ | RD(dst) | RM(src2)));
 		return SLJIT_SUCCESS;
 
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM));
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+		FAIL_IF(push_inst(compiler, RSB | SRC2_IMM | RD(TMP_REG1) | RN(src2) | 0));
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG2) | RN(src2) | RM(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, CLZ | RD(dst) | RM(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, CMP | SET_FLAGS | SRC2_IMM | RN(dst) | 32));
+		return push_inst(compiler, (EOR ^ 0xf0000000) | SRC2_IMM | RD(dst) | RN(dst) | 0x1f);
+#else /* !SLJIT_CONFIG_ARM_V5 */
+		FAIL_IF(push_inst(compiler, RBIT | RD(dst) | RM(src2)));
+		return push_inst(compiler, CLZ | RD(dst) | RM(dst));
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
 	case SLJIT_ADD:
 		SLJIT_ASSERT(!(flags & INV_IMM));
 
@@ -1471,17 +1557,61 @@
 		return push_inst(compiler, EOR | (flags & SET_FLAGS) | RD(dst) | RN(src1) | ((src2 & SRC2_IMM) ? src2 : RM(src2)));
 
 	case SLJIT_SHL:
-		EMIT_SHIFT_INS_AND_RETURN(0);
+	case SLJIT_MSHL:
+		shift_type = 0;
+		is_masked = GET_OPCODE(op) == SLJIT_MSHL;
+		break;
 
 	case SLJIT_LSHR:
-		EMIT_SHIFT_INS_AND_RETURN(1);
+	case SLJIT_MLSHR:
+		shift_type = 1;
+		is_masked = GET_OPCODE(op) == SLJIT_MLSHR;
+		break;
 
 	case SLJIT_ASHR:
-		EMIT_SHIFT_INS_AND_RETURN(2);
+	case SLJIT_MASHR:
+		shift_type = 2;
+		is_masked = GET_OPCODE(op) == SLJIT_MASHR;
+		break;
+
+	case SLJIT_ROTL:
+		if (compiler->shift_imm == 0x20) {
+			FAIL_IF(push_inst(compiler, RSB | SRC2_IMM | RD(TMP_REG2) | RN(src2) | 0));
+			src2 = TMP_REG2;
+		} else
+			compiler->shift_imm = (sljit_uw)(-(sljit_sw)compiler->shift_imm) & 0x1f;
+		/* fallthrough */
+
+	case SLJIT_ROTR:
+		shift_type = 3;
+		is_masked = 0;
+		break;
+
+	default:
+		SLJIT_UNREACHABLE();
+		return SLJIT_SUCCESS;
 	}
 
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
+	SLJIT_ASSERT(!(flags & ARGS_SWAPPED) && !(flags & INV_IMM) && !(src2 & SRC2_IMM));
+
+	if (compiler->shift_imm != 0x20) {
+		SLJIT_ASSERT(src1 == TMP_REG1);
+
+		if (compiler->shift_imm != 0)
+			return push_inst(compiler, MOV | (flags & SET_FLAGS) |
+				RD(dst) | (compiler->shift_imm << 7) | (shift_type << 5) | RM(src2));
+		return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst) | RM(src2));
+	}
+
+	SLJIT_ASSERT(src1 != TMP_REG2);
+
+	if (is_masked) {
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG2) | RN(src2) | SRC2_IMM | 0x1f));
+		src2 = TMP_REG2;
+	}
+
+	return push_inst(compiler, MOV | (flags & SET_FLAGS) | RD(dst)
+		| RM8(src2) | (sljit_uw)(shift_type << 5) | 0x10 | RM(src1));
 }
 
 #undef EMIT_SHIFT_INS_AND_RETURN
@@ -1670,27 +1800,32 @@
 #endif
 }
 
-static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg,
 	sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
-	sljit_uw imm, offset_reg;
-	sljit_uw is_type1_transfer = IS_TYPE1_TRANSFER(flags);
+	sljit_uw imm, offset_reg, tmp;
+	sljit_sw mask = IS_TYPE1_TRANSFER(flags) ? 0xfff : 0xff;
+	sljit_sw sign = IS_TYPE1_TRANSFER(flags) ? 0x1000 : 0x100;
 
-	SLJIT_ASSERT (arg & SLJIT_MEM);
-	SLJIT_ASSERT((arg & REG_MASK) != tmp_reg);
+	SLJIT_ASSERT(arg & SLJIT_MEM);
+	SLJIT_ASSERT((arg & REG_MASK) != tmp_reg || (arg == SLJIT_MEM1(tmp_reg) && argw >= -mask && argw <= mask));
 
-	if (!(arg & REG_MASK)) {
-		if (is_type1_transfer) {
-			FAIL_IF(load_immediate(compiler, tmp_reg, (sljit_uw)argw & ~(sljit_uw)0xfff));
-			argw &= 0xfff;
-		}
-		else {
-			FAIL_IF(load_immediate(compiler, tmp_reg, (sljit_uw)argw & ~(sljit_uw)0xff));
-			argw &= 0xff;
+	if (SLJIT_UNLIKELY(!(arg & REG_MASK))) {
+		tmp = (sljit_uw)(argw & (sign | mask));
+		tmp = (sljit_uw)((argw + (tmp <= (sljit_uw)sign ? 0 : sign)) & ~mask);
+
+		FAIL_IF(load_immediate(compiler, tmp_reg, tmp));
+
+		argw -= (sljit_sw)tmp;
+		tmp = 1;
+
+		if (argw < 0) {
+			argw = -argw;
+			tmp = 0;
 		}
 
-		return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, tmp_reg,
-			is_type1_transfer ? argw : TYPE2_TRANSFER_IMM(argw)));
+		return push_inst(compiler, EMIT_DATA_TRANSFER(flags, tmp, reg, tmp_reg,
+			(mask == 0xff) ? TYPE2_TRANSFER_IMM(argw) : argw));
 	}
 
 	if (arg & OFFS_REG_MASK) {
@@ -1698,72 +1833,62 @@
 		arg &= REG_MASK;
 		argw &= 0x3;
 
-		if (argw != 0 && !is_type1_transfer) {
+		if (argw != 0 && (mask == 0xff)) {
 			FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | RM(offset_reg) | ((sljit_uw)argw << 7)));
 			return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, tmp_reg, TYPE2_TRANSFER_IMM(0)));
 		}
 
 		/* Bit 25: RM is offset. */
 		return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg,
-			RM(offset_reg) | (is_type1_transfer ? (1 << 25) : 0) | ((sljit_uw)argw << 7)));
+			RM(offset_reg) | (mask == 0xff ? 0 : (1 << 25)) | ((sljit_uw)argw << 7)));
 	}
 
 	arg &= REG_MASK;
 
-	if (is_type1_transfer) {
-		if (argw > 0xfff) {
-			imm = get_imm((sljit_uw)argw & ~(sljit_uw)0xfff);
-			if (imm) {
-				FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | imm));
-				argw = argw & 0xfff;
-				arg = tmp_reg;
-			}
-		}
-		else if (argw < -0xfff) {
-			imm = get_imm((sljit_uw)-argw & ~(sljit_uw)0xfff);
-			if (imm) {
-				FAIL_IF(push_inst(compiler, SUB | RD(tmp_reg) | RN(arg) | imm));
-				argw = -(-argw & 0xfff);
-				arg = tmp_reg;
-			}
-		}
+	if (argw > mask) {
+		tmp = (sljit_uw)(argw & (sign | mask));
+		tmp = (sljit_uw)((argw + (tmp <= (sljit_uw)sign ? 0 : sign)) & ~mask);
+		imm = get_imm(tmp);
 
-		if (argw >= 0 && argw <= 0xfff)
-			return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg, argw));
+		if (imm) {
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | imm));
+			argw -= (sljit_sw)tmp;
+			arg = tmp_reg;
 
-		if (argw < 0 && argw >= -0xfff)
-			return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, reg, arg, -argw));
+			SLJIT_ASSERT(argw >= -mask && argw <= mask);
+		}
+	} else if (argw < -mask) {
+		tmp = (sljit_uw)(-argw & (sign | mask));
+		tmp = (sljit_uw)((-argw + (tmp <= (sljit_uw)sign ? 0 : sign)) & ~mask);
+		imm = get_imm(tmp);
+
+		if (imm) {
+			FAIL_IF(push_inst(compiler, SUB | RD(tmp_reg) | RN(arg) | imm));
+			argw += (sljit_sw)tmp;
+			arg = tmp_reg;
+
+			SLJIT_ASSERT(argw >= -mask && argw <= mask);
+		}
 	}
-	else {
-		if (argw > 0xff) {
-			imm = get_imm((sljit_uw)argw & ~(sljit_uw)0xff);
-			if (imm) {
-				FAIL_IF(push_inst(compiler, ADD | RD(tmp_reg) | RN(arg) | imm));
-				argw = argw & 0xff;
-				arg = tmp_reg;
-			}
-		}
-		else if (argw < -0xff) {
-			imm = get_imm((sljit_uw)-argw & ~(sljit_uw)0xff);
-			if (imm) {
-				FAIL_IF(push_inst(compiler, SUB | RD(tmp_reg) | RN(arg) | imm));
-				argw = -(-argw & 0xff);
-				arg = tmp_reg;
-			}
+
+	if (argw <= mask && argw >= -mask) {
+		if (argw >= 0) {
+			if (mask == 0xff)
+				argw = TYPE2_TRANSFER_IMM(argw);
+			return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg, argw));
 		}
 
-		if (argw >= 0 && argw <= 0xff)
-			return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg, TYPE2_TRANSFER_IMM(argw)));
+		argw = -argw;
 
-		if (argw < 0 && argw >= -0xff) {
-			argw = -argw;
-			return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, reg, arg, TYPE2_TRANSFER_IMM(argw)));
-		}
+		if (mask == 0xff)
+			argw = TYPE2_TRANSFER_IMM(argw);
+
+		return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 0, reg, arg, argw));
 	}
 
 	FAIL_IF(load_immediate(compiler, tmp_reg, (sljit_uw)argw));
 	return push_inst(compiler, EMIT_DATA_TRANSFER(flags, 1, reg, arg,
-		RM(tmp_reg) | (is_type1_transfer ? (1 << 25) : 0)));
+		RM(tmp_reg) | (mask == 0xff ? 0 : (1 << 25))));
 }
 
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 inp_flags,
@@ -1961,15 +2086,15 @@
 			saved_reg_list[saved_reg_count++] = 1;
 
 		if (saved_reg_count > 0) {
-			FAIL_IF(push_inst(compiler, 0xe52d0000 | (saved_reg_count >= 3 ? 16 : 8)
+			FAIL_IF(push_inst(compiler, STR | 0x2d0000 | (saved_reg_count >= 3 ? 16 : 8)
 						| (saved_reg_list[0] << 12) /* str rX, [sp, #-8/-16]! */));
 			if (saved_reg_count >= 2) {
 				SLJIT_ASSERT(saved_reg_list[1] < 8);
-				FAIL_IF(push_inst(compiler, 0xe58d0004 | (saved_reg_list[1] << 12) /* str rX, [sp, #4] */));
+				FAIL_IF(push_inst(compiler, STR | 0x8d0004 | (saved_reg_list[1] << 12) /* str rX, [sp, #4] */));
 			}
 			if (saved_reg_count >= 3) {
 				SLJIT_ASSERT(saved_reg_list[2] < 8);
-				FAIL_IF(push_inst(compiler, 0xe58d0008 | (saved_reg_list[2] << 12) /* str rX, [sp, #8] */));
+				FAIL_IF(push_inst(compiler, STR | 0x8d0008 | (saved_reg_list[2] << 12) /* str rX, [sp, #8] */));
 			}
 		}
 
@@ -1983,13 +2108,13 @@
 		if (saved_reg_count > 0) {
 			if (saved_reg_count >= 3) {
 				SLJIT_ASSERT(saved_reg_list[2] < 8);
-				FAIL_IF(push_inst(compiler, 0xe59d0008 | (saved_reg_list[2] << 12) /* ldr rX, [sp, #8] */));
+				FAIL_IF(push_inst(compiler, LDR | 0x8d0008 | (saved_reg_list[2] << 12) /* ldr rX, [sp, #8] */));
 			}
 			if (saved_reg_count >= 2) {
 				SLJIT_ASSERT(saved_reg_list[1] < 8);
-				FAIL_IF(push_inst(compiler, 0xe59d0004 | (saved_reg_list[1] << 12) /* ldr rX, [sp, #4] */));
+				FAIL_IF(push_inst(compiler, LDR | 0x8d0004 | (saved_reg_list[1] << 12) /* ldr rX, [sp, #4] */));
 			}
-			return push_inst(compiler, 0xe49d0000 | (sljit_uw)(saved_reg_count >= 3 ? 16 : 8)
+			return push_inst(compiler, (LDR ^ (1 << 24)) | 0x8d0000 | (sljit_uw)(saved_reg_count >= 3 ? 16 : 8)
 						| (saved_reg_list[0] << 12) /* ldr rX, [sp], #8/16 */);
 		}
 		return SLJIT_SUCCESS;
@@ -2034,6 +2159,7 @@
 		return emit_op(compiler, op, ALLOW_ANY_IMM, dst, dstw, TMP_REG1, 0, src, srcw);
 
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
 		return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src, srcw);
 	}
 
@@ -2069,13 +2195,17 @@
 		return emit_op(compiler, op, ALLOW_ANY_IMM, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 		if (src2 & SLJIT_IMM) {
 			compiler->shift_imm = src2w & 0x1f;
 			return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src1, src1w);
-		}
-		else {
+		} else {
 			compiler->shift_imm = 0x20;
 			return emit_op(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w);
 		}
@@ -2091,13 +2221,67 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 is_left;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	op = GET_OPCODE(op);
+	is_left = (op == SLJIT_SHL || op == SLJIT_MSHL);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, is_left ? SLJIT_ROTL : SLJIT_ROTR, src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	/* Shift type of ROR is 3. */
+	if (src2 & SLJIT_IMM) {
+		src2w &= 0x1f;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG2, src2, src2w, TMP_REG2));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, src1, src1w, TMP_REG1));
+		src1 = TMP_REG1;
+	} else if (src1 & SLJIT_IMM) {
+		FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)src1w));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_IMM) {
+		FAIL_IF(push_inst(compiler, MOV | RD(src_dst) | RM(src_dst) | ((sljit_uw)(is_left ? 0 : 1) << 5) | ((sljit_uw)src2w << 7)));
+		src2w = (src2w ^ 0x1f) + 1;
+		return push_inst(compiler, ORR | RD(src_dst) | RN(src_dst) | RM(src1) | ((sljit_uw)(is_left ? 1 : 0) << 5) | ((sljit_uw)src2w << 7));
+	}
+
+	if (op == SLJIT_MSHL || op == SLJIT_MLSHR) {
+		FAIL_IF(push_inst(compiler, AND | SRC2_IMM | RD(TMP_REG2) | RN(src2) | 0x1f));
+		src2 = TMP_REG2;
+	}
+
+	FAIL_IF(push_inst(compiler, MOV | RD(src_dst) | RM8(src2) | ((sljit_uw)(is_left ? 0 : 1) << 5) | 0x10 | RM(src_dst)));
+	FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | RM(src1) | ((sljit_uw)(is_left ? 1 : 0) << 5) | (1 << 7)));
+	FAIL_IF(push_inst(compiler, EOR | SRC2_IMM | RD(TMP_REG2) | RN(src2) | 0x1f));
+	return push_inst(compiler, ORR | RD(src_dst) | RN(src_dst) | RM(TMP_REG1) | ((sljit_uw)(is_left ? 1 : 0) << 5) | 0x10 | RM8(TMP_REG2));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2370,7 +2554,6 @@
 	return SLJIT_SUCCESS;
 }
 
-#undef FPU_LOAD
 #undef EMIT_FPU_DATA_TRANSFER
 
 /* --------------------------------------------------------------------- */
@@ -2400,11 +2583,15 @@
 {
 	switch (type) {
 	case SLJIT_EQUAL:
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
 		return 0x00000000;
 
 	case SLJIT_NOT_EQUAL:
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL: /* Not supported. */
 		return 0x10000000;
 
 	case SLJIT_CARRY:
@@ -2413,7 +2600,6 @@
 		/* fallthrough */
 
 	case SLJIT_LESS:
-	case SLJIT_LESS_F64:
 		return 0x30000000;
 
 	case SLJIT_NOT_CARRY:
@@ -2422,27 +2608,33 @@
 		/* fallthrough */
 
 	case SLJIT_GREATER_EQUAL:
-	case SLJIT_GREATER_EQUAL_F64:
 		return 0x20000000;
 
 	case SLJIT_GREATER:
-	case SLJIT_GREATER_F64:
+	case SLJIT_UNORDERED_OR_GREATER:
 		return 0x80000000;
 
 	case SLJIT_LESS_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
 		return 0x90000000;
 
 	case SLJIT_SIG_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
 		return 0xb0000000;
 
 	case SLJIT_SIG_GREATER_EQUAL:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
 		return 0xa0000000;
 
 	case SLJIT_SIG_GREATER:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
 		return 0xc0000000;
 
 	case SLJIT_SIG_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		return 0xd0000000;
 
 	case SLJIT_OVERFLOW:
@@ -2450,7 +2642,7 @@
 			return 0x10000000;
 		/* fallthrough */
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		return 0x60000000;
 
 	case SLJIT_NOT_OVERFLOW:
@@ -2458,11 +2650,18 @@
 			return 0x00000000;
 		/* fallthrough */
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		return 0x70000000;
 
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+		return 0x40000000;
+
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		return 0x50000000;
+
 	default:
-		SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL_CDECL);
+		SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL_REG_ARG);
 		return 0xe0000000;
 	}
 }
@@ -2639,7 +2838,7 @@
 					}
 					FAIL_IF(push_inst(compiler, MOV | (offset << 10) | (word_arg_offset >> 2)));
 				} else
-					FAIL_IF(push_inst(compiler, data_transfer_insts[WORD_SIZE] | 0x800000 | RN(SLJIT_SP) | (word_arg_offset << 10) | (offset - 4 * sizeof(sljit_sw))));
+					FAIL_IF(push_inst(compiler, STR | 0x800000 | RN(SLJIT_SP) | (word_arg_offset << 10) | (offset - 4 * sizeof(sljit_sw))));
 			}
 			break;
 		}
@@ -2718,51 +2917,48 @@
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 #ifdef __SOFTFP__
-	PTR_FAIL_IF(softfloat_call_with_args(compiler, arg_types, NULL, &extra_space));
-	SLJIT_ASSERT((extra_space & 0x7) == 0);
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG) {
+		PTR_FAIL_IF(softfloat_call_with_args(compiler, arg_types, NULL, &extra_space));
+		SLJIT_ASSERT((extra_space & 0x7) == 0);
 
-	if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
-		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
+		if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
+			type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
+		jump = sljit_emit_jump(compiler, type);
+		PTR_FAIL_IF(jump == NULL);
 
-	jump = sljit_emit_jump(compiler, type);
-	PTR_FAIL_IF(jump == NULL);
+		if (extra_space > 0) {
+			if (type & SLJIT_CALL_RETURN)
+				PTR_FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1,
+					TMP_REG2, SLJIT_SP, extra_space - sizeof(sljit_sw))));
 
-	if (extra_space > 0) {
-		if (type & SLJIT_CALL_RETURN)
-			PTR_FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1,
-				TMP_REG2, SLJIT_SP, extra_space - sizeof(sljit_sw))));
+			PTR_FAIL_IF(push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | SRC2_IMM | extra_space));
 
-		PTR_FAIL_IF(push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | SRC2_IMM | extra_space));
-
-		if (type & SLJIT_CALL_RETURN) {
-			PTR_FAIL_IF(push_inst(compiler, BX | RM(TMP_REG2)));
-			return jump;
+			if (type & SLJIT_CALL_RETURN) {
+				PTR_FAIL_IF(push_inst(compiler, BX | RM(TMP_REG2)));
+				return jump;
+			}
 		}
-	}
 
-	SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
-	PTR_FAIL_IF(softfloat_post_call_with_args(compiler, arg_types));
-	return jump;
-#else /* !__SOFTFP__ */
+		SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
+		PTR_FAIL_IF(softfloat_post_call_with_args(compiler, arg_types));
+		return jump;
+	}
+#endif /* __SOFTFP__ */
+
 	if (type & SLJIT_CALL_RETURN) {
 		PTR_FAIL_IF(emit_stack_frame_release(compiler, -1));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
-	PTR_FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#ifndef __SOFTFP__
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		PTR_FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#endif /* !__SOFTFP__ */
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
-#endif /* __SOFTFP__ */
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
@@ -2822,55 +3018,79 @@
 		src = TMP_REG1;
 	}
 
-	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0)) {
+	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
 		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG1) | RM(src)));
 		src = TMP_REG1;
 	}
 
 #ifdef __SOFTFP__
-	FAIL_IF(softfloat_call_with_args(compiler, arg_types, &src, &extra_space));
-	SLJIT_ASSERT((extra_space & 0x7) == 0);
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG) {
+		FAIL_IF(softfloat_call_with_args(compiler, arg_types, &src, &extra_space));
+		SLJIT_ASSERT((extra_space & 0x7) == 0);
 
-	if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
-		type = SLJIT_JUMP;
+		if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
+			type = SLJIT_JUMP;
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
+		FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
 
-	FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
+		if (extra_space > 0) {
+			if (type & SLJIT_CALL_RETURN)
+				FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1,
+					TMP_REG2, SLJIT_SP, extra_space - sizeof(sljit_sw))));
 
-	if (extra_space > 0) {
-		if (type & SLJIT_CALL_RETURN)
-			FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(WORD_SIZE | LOAD_DATA, 1,
-				TMP_REG2, SLJIT_SP, extra_space - sizeof(sljit_sw))));
+			FAIL_IF(push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | SRC2_IMM | extra_space));
 
-		FAIL_IF(push_inst(compiler, ADD | RD(SLJIT_SP) | RN(SLJIT_SP) | SRC2_IMM | extra_space));
+			if (type & SLJIT_CALL_RETURN)
+				return push_inst(compiler, BX | RM(TMP_REG2));
+		}
 
-		if (type & SLJIT_CALL_RETURN)
-			return push_inst(compiler, BX | RM(TMP_REG2));
+		SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
+		return softfloat_post_call_with_args(compiler, arg_types);
 	}
+#endif /* __SOFTFP__ */
 
-	SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
-	return softfloat_post_call_with_args(compiler, arg_types);
-#else /* !__SOFTFP__ */
 	if (type & SLJIT_CALL_RETURN) {
 		FAIL_IF(emit_stack_frame_release(compiler, -1));
 		type = SLJIT_JUMP;
 	}
 
-	FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#ifndef __SOFTFP__
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#endif /* !__SOFTFP__ */
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_ijump(compiler, type, src, srcw);
-#endif /* __SOFTFP__ */
 }
 
+#ifdef __SOFTFP__
+
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	if (compiler->options & SLJIT_ENTER_REG_ARG) {
+		if (src == SLJIT_FR0)
+			return SLJIT_SUCCESS;
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+	}
+
+	if (FAST_IS_REG(src)) {
+		if (op & SLJIT_32)
+			return push_inst(compiler, VMOV | (1 << 20) | RD(SLJIT_R0) | VN(src));
+		return push_inst(compiler, VMOV2 | (1 << 20) | RD(SLJIT_R0) | RN(SLJIT_R1) | VM(src));
+	}
+
+	SLJIT_SKIP_CHECKS(compiler);
+
+	if (op & SLJIT_32)
+		return sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw);
+	return sljit_emit_mem(compiler, SLJIT_MOV, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R1), src, srcw);
+}
+
+#endif /* __SOFTFP__ */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
@@ -2883,7 +3103,7 @@
 	ADJUST_LOCAL_OFFSET(dst, dstw);
 
 	op = GET_OPCODE(op);
-	cc = get_cc(compiler, type & 0xff);
+	cc = get_cc(compiler, type);
 	dst_reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
 	if (op < SLJIT_ADD) {
@@ -2921,9 +3141,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	dst_reg &= ~SLJIT_32;
-
-	cc = get_cc(compiler, type & 0xff);
+	cc = get_cc(compiler, type & ~SLJIT_32);
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
 		tmp = get_imm((sljit_uw)srcw);
@@ -2949,15 +3167,284 @@
 	return push_inst(compiler, ((MOV | RD(dst_reg) | RM(src)) & ~COND_MASK) | cc);
 }
 
+static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem, sljit_sw *memw, sljit_s32 max_offset)
+{
+	sljit_s32 arg = *mem;
+	sljit_sw argw = *memw;
+	sljit_uw imm, tmp;
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	sljit_sw mask = max_offset >= 0xf00 ? 0xfff : 0xff;
+	sljit_sw sign = max_offset >= 0xf00 ? 0x1000 : 0x100;
+#else /* !SLJIT_CONFIG_ARM_V5 */
+	sljit_sw mask = 0xfff;
+	sljit_sw sign = 0x1000;
+
+	SLJIT_ASSERT(max_offset >= 0xf00);
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
+	*mem = TMP_REG1;
+
+	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+		*memw = 0;
+		return push_inst(compiler, ADD | RD(TMP_REG1) | RN(arg & REG_MASK) | RM(OFFS_REG(arg)) | ((sljit_uw)(argw & 0x3) << 7));
+	}
+
+	arg &= REG_MASK;
+
+	if (arg) {
+		if (argw <= max_offset && argw >= -mask) {
+			*mem = arg;
+			return SLJIT_SUCCESS;
+		}
+
+		if (argw >= 0) {
+			tmp = (sljit_uw)(argw & (sign | mask));
+			tmp = (sljit_uw)((argw + ((tmp <= (sljit_uw)max_offset || tmp == (sljit_uw)sign) ? 0 : sign)) & ~mask);
+			imm = get_imm(tmp);
+
+			if (imm) {
+				*memw = argw - (sljit_sw)tmp;
+				SLJIT_ASSERT(*memw >= -mask && *memw <= max_offset);
+
+				return push_inst(compiler, ADD | RD(TMP_REG1) | RN(arg) | imm);
+			}
+		} else {
+			tmp = (sljit_uw)(-argw & (sign | mask));
+			tmp = (sljit_uw)((-argw + ((tmp <= (sljit_uw)((sign << 1) - max_offset - 1)) ? 0 : sign)) & ~mask);
+			imm = get_imm(tmp);
+
+			if (imm) {
+				*memw = argw + (sljit_sw)tmp;
+				SLJIT_ASSERT(*memw >= -mask && *memw <= max_offset);
+
+				return push_inst(compiler, SUB | RD(TMP_REG1) | RN(arg) | imm);
+			}
+		}
+	}
+
+	tmp = (sljit_uw)(argw & (sign | mask));
+	tmp = (sljit_uw)((argw + ((tmp <= (sljit_uw)max_offset || tmp == (sljit_uw)sign) ? 0 : sign)) & ~mask);
+	*memw = argw - (sljit_sw)tmp;
+
+	FAIL_IF(load_immediate(compiler, TMP_REG1, tmp));
+
+	if (arg == 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ADD | RD(TMP_REG1) | RN(TMP_REG1) | RM(arg));
+}
+
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+
+static sljit_s32 sljit_emit_mem_unaligned(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags, steps, tmp_reg;
+	sljit_uw add, shift;
+
+	switch (type & 0xff) {
+	case SLJIT_MOV_U8:
+	case SLJIT_MOV_S8:
+		flags = BYTE_SIZE;
+		if (!(type & SLJIT_MEM_STORE))
+			flags |= LOAD_DATA;
+		if ((type & 0xff) == SLJIT_MOV_S8)
+			flags |= SIGNED;
+
+		return emit_op_mem(compiler, flags, reg, mem, memw, TMP_REG1);
+
+	case SLJIT_MOV_U16:
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 1));
+		flags = BYTE_SIZE;
+		steps = 1;
+		break;
+
+	case SLJIT_MOV_S16:
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xff - 1));
+		flags = BYTE_SIZE | SIGNED;
+		steps = 1;
+		break;
+
+	default:
+		if (type & SLJIT_MEM_UNALIGNED_32) {
+			flags = WORD_SIZE;
+			if (!(type & SLJIT_MEM_STORE))
+				flags |= LOAD_DATA;
+
+			return emit_op_mem(compiler, flags, reg, mem, memw, TMP_REG1);
+		}
+
+		if (!(type & SLJIT_MEM_UNALIGNED_16)) {
+			FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 3));
+			flags = BYTE_SIZE;
+			steps = 3;
+			break;
+		}
+
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xff - 2));
+
+		add = 1;
+		if (memw < 0) {
+			add = 0;
+			memw = -memw;
+		}
+
+		tmp_reg = reg;
+
+		if (type & SLJIT_MEM_STORE) {
+			FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(HALF_SIZE, add, reg, mem, TYPE2_TRANSFER_IMM(memw))));
+			FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG2) | RM(reg) | (16 << 7) | (2 << 4)));
+		} else {
+			if (reg == mem) {
+				SLJIT_ASSERT(reg != TMP_REG1);
+				tmp_reg = TMP_REG1;
+			}
+
+			FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(HALF_SIZE | LOAD_DATA, add, tmp_reg, mem, TYPE2_TRANSFER_IMM(memw))));
+		}
+
+		if (!add) {
+			memw -= 2;
+			if (memw <= 0) {
+				memw = -memw;
+				add = 1;
+			}
+		} else
+			memw += 2;
+
+		if (type & SLJIT_MEM_STORE)
+			return push_inst(compiler, EMIT_DATA_TRANSFER(HALF_SIZE, add, TMP_REG2, mem, TYPE2_TRANSFER_IMM(memw)));
+
+		FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(HALF_SIZE | LOAD_DATA, add, TMP_REG2, mem, TYPE2_TRANSFER_IMM(memw))));
+		return push_inst(compiler, ORR | RD(reg) | RN(tmp_reg) | RM(TMP_REG2) | (16 << 7));
+	}
+
+	SLJIT_ASSERT(steps > 0);
+
+	add = 1;
+	if (memw < 0) {
+		add = 0;
+		memw = -memw;
+	}
+
+	if (type & SLJIT_MEM_STORE) {
+		FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(BYTE_SIZE, add, reg, mem, memw)));
+		FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG2) | RM(reg) | (8 << 7) | (2 << 4)));
+
+		while (1) {
+			if (!add) {
+				memw -= 1;
+				if (memw == 0)
+					add = 1;
+			} else
+				memw += 1;
+
+			FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(BYTE_SIZE, add, TMP_REG2, mem, memw)));
+
+			if (--steps == 0)
+				return SLJIT_SUCCESS;
+
+			FAIL_IF(push_inst(compiler, MOV | RD(TMP_REG2) | RM(TMP_REG2) | (8 << 7) | (2 << 4)));
+		}
+	}
+
+	tmp_reg = reg;
+
+	if (reg == mem) {
+		SLJIT_ASSERT(reg != TMP_REG1);
+		tmp_reg = TMP_REG1;
+	}
+
+	shift = 8;
+	FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(BYTE_SIZE | LOAD_DATA, add, tmp_reg, mem, memw)));
+
+	do {
+		if (!add) {
+			memw -= 1;
+			if (memw == 0)
+				add = 1;
+		} else
+			memw += 1;
+
+		if (steps > 1) {
+			FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(BYTE_SIZE | LOAD_DATA, add, TMP_REG2, mem, memw)));
+			FAIL_IF(push_inst(compiler, ORR | RD(tmp_reg) | RN(tmp_reg) | RM(TMP_REG2) | (shift << 7)));
+			shift += 8;
+		}
+	} while (--steps != 0);
+
+	flags |= LOAD_DATA;
+
+	if (flags & SIGNED)
+		FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(flags, add, TMP_REG2, mem, TYPE2_TRANSFER_IMM(memw))));
+	else
+		FAIL_IF(push_inst(compiler, EMIT_DATA_TRANSFER(flags, add, TMP_REG2, mem, memw)));
+
+	return push_inst(compiler, ORR | RD(reg) | RN(tmp_reg) | RM(TMP_REG2) | (shift << 7));
+}
+
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_s32 flags;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK)) {
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+		ADJUST_LOCAL_OFFSET(mem, memw);
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+	}
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	if (type & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16)) {
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, (type & SLJIT_MEM_UNALIGNED_16) ? 0xfff - 6 : 0xfff - 7));
+
+		if (!(type & SLJIT_MEM_STORE) && REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
+			FAIL_IF(sljit_emit_mem_unaligned(compiler, type, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw)));
+			return sljit_emit_mem_unaligned(compiler, type, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw);
+		}
+
+		FAIL_IF(sljit_emit_mem_unaligned(compiler, type, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw));
+		return sljit_emit_mem_unaligned(compiler, type, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw));
+	}
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
+
+	flags = WORD_SIZE;
+
+	if (!(type & SLJIT_MEM_STORE)) {
+		if (REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
+			FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw), TMP_REG1));
+			return emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw, TMP_REG1);
+		}
+
+		flags = WORD_SIZE | LOAD_DATA;
+	}
+
+	FAIL_IF(emit_op_mem(compiler, flags, REG_PAIR_FIRST(reg), SLJIT_MEM1(mem), memw, TMP_REG1));
+	return emit_op_mem(compiler, flags, REG_PAIR_SECOND(reg), SLJIT_MEM1(mem), memw + SSIZE_OF(sw), TMP_REG1);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags;
 	sljit_uw is_type1_transfer, inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
 
 	is_type1_transfer = 1;
 
@@ -2999,16 +3486,12 @@
 	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
 		if (!is_type1_transfer && memw != 0)
 			return SLJIT_ERR_UNSUPPORTED;
-	}
-	else {
+	} else {
 		if (is_type1_transfer) {
 			if (memw > 4095 || memw < -4095)
 				return SLJIT_ERR_UNSUPPORTED;
-		}
-		else {
-			if (memw > 255 || memw < -255)
-				return SLJIT_ERR_UNSUPPORTED;
-		}
+		} else if (memw > 255 || memw < -255)
+			return SLJIT_ERR_UNSUPPORTED;
 	}
 
 	if (type & SLJIT_MEM_SUPP)
@@ -3022,20 +3505,20 @@
 		if (is_type1_transfer)
 			inst |= (1 << 25);
 
-		if (type & SLJIT_MEM_PRE)
-			inst |= (1 << 21);
-		else
+		if (type & SLJIT_MEM_POST)
 			inst ^= (1 << 24);
+		else
+			inst |= (1 << 21);
 
 		return push_inst(compiler, inst);
 	}
 
 	inst = EMIT_DATA_TRANSFER(flags, 0, reg, mem & REG_MASK, 0);
 
-	if (type & SLJIT_MEM_PRE)
-		inst |= (1 << 21);
-	else
+	if (type & SLJIT_MEM_POST)
 		inst ^= (1 << 24);
+	else
+		inst |= (1 << 21);
 
 	if (is_type1_transfer) {
 		if (memw >= 0)
@@ -3054,6 +3537,103 @@
 	return push_inst(compiler, inst | TYPE2_TRANSFER_IMM((sljit_uw)memw));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	sljit_s32 max_offset;
+	sljit_s32 dst;
+#endif /* SLJIT_CONFIG_ARM_V5 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+	if (type & SLJIT_MEM_UNALIGNED_32)
+		return emit_fop_mem(compiler, ((type ^ SLJIT_32) & SLJIT_32) | ((type & SLJIT_MEM_STORE) ? 0 : FPU_LOAD), freg, mem, memw);
+
+#if (defined SLJIT_CONFIG_ARM_V5 && SLJIT_CONFIG_ARM_V5)
+	if (type & SLJIT_MEM_STORE) {
+		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(freg) | RD(TMP_REG2)));
+
+		if (type & SLJIT_32)
+			return sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_UNALIGNED_16), TMP_REG2, mem, memw);
+
+		max_offset = 0xfff - 7;
+		if (type & SLJIT_MEM_UNALIGNED_16)
+			max_offset++;
+
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, max_offset));
+		mem |= SLJIT_MEM;
+
+		FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_UNALIGNED_16), TMP_REG2, mem, memw));
+
+		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(freg) | 0x80 | RD(TMP_REG2)));
+		return sljit_emit_mem_unaligned(compiler, SLJIT_MOV | SLJIT_MEM_STORE | (type & SLJIT_MEM_UNALIGNED_16), TMP_REG2, mem, memw + 4);
+	}
+
+	max_offset = (type & SLJIT_32) ? 0xfff - 3 : 0xfff - 7;
+	if (type & SLJIT_MEM_UNALIGNED_16)
+		max_offset++;
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, max_offset));
+
+	dst = TMP_REG1;
+
+	/* Stack offset adjustment is not needed because dst
+	   is not stored on the stack when mem is SLJIT_SP. */
+
+	if (mem == TMP_REG1) {
+		dst = SLJIT_R3;
+
+		if (compiler->scratches >= 4)
+			FAIL_IF(push_inst(compiler, STR | (1 << 21) | RN(SLJIT_SP) | RD(SLJIT_R3) | 8));
+	}
+
+	mem |= SLJIT_MEM;
+
+	FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | (type & SLJIT_MEM_UNALIGNED_16), dst, mem, memw));
+	FAIL_IF(push_inst(compiler, VMOV | VN(freg) | RD(dst)));
+
+	if (!(type & SLJIT_32)) {
+		FAIL_IF(sljit_emit_mem_unaligned(compiler, SLJIT_MOV | (type & SLJIT_MEM_UNALIGNED_16), dst, mem, memw + 4));
+		FAIL_IF(push_inst(compiler, VMOV | VN(freg) | 0x80 | RD(dst)));
+	}
+
+	if (dst == SLJIT_R3 && compiler->scratches >= 4)
+		FAIL_IF(push_inst(compiler, (LDR ^ (0x1 << 24)) | (0x1 << 23) | RN(SLJIT_SP) | RD(SLJIT_R3) | 8));
+	return SLJIT_SUCCESS;
+#else /* !SLJIT_CONFIG_ARM_V5 */
+	if (type & SLJIT_MEM_STORE) {
+		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(freg) | RD(TMP_REG2)));
+
+		if (type & SLJIT_32)
+			return emit_op_mem(compiler, WORD_SIZE, TMP_REG2, mem, memw, TMP_REG1);
+
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
+		mem |= SLJIT_MEM;
+
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, mem, memw, TMP_REG1));
+		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(freg) | 0x80 | RD(TMP_REG2)));
+		return emit_op_mem(compiler, WORD_SIZE, TMP_REG2, mem, memw + 4, TMP_REG1);
+	}
+
+	if (type & SLJIT_32) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG2, mem, memw, TMP_REG1));
+		return push_inst(compiler, VMOV | VN(freg) | RD(TMP_REG2));
+	}
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
+	mem |= SLJIT_MEM;
+
+	FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG2, mem, memw, TMP_REG1));
+	FAIL_IF(emit_op_mem(compiler, WORD_SIZE | LOAD_DATA, TMP_REG1, mem, memw + 4, TMP_REG1));
+	return push_inst(compiler, VMOV2 | VM(freg) | RD(TMP_REG2) | RN(TMP_REG1));
+#endif /* SLJIT_CONFIG_ARM_V5 */
+}
+
+#undef FPU_LOAD
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
 	struct sljit_const *const_;
diff --git a/src/sljit/sljitNativeARM_64.c b/src/sljit/sljitNativeARM_64.c
index 96453b4..89f747e 100644
--- a/src/sljit/sljitNativeARM_64.c
+++ b/src/sljit/sljitNativeARM_64.c
@@ -86,6 +86,7 @@
 #define CSINC 0x9a800400
 #define EOR 0xca000000
 #define EORI 0xd2000000
+#define EXTR 0x93c00000
 #define FABS 0x1e60c000
 #define FADD 0x1e602800
 #define FCMP 0x1e602000
@@ -98,6 +99,7 @@
 #define FSUB 0x1e603800
 #define LDRI 0xf9400000
 #define LDRI_F64 0xfd400000
+#define LDRI_POST 0xf8400400
 #define LDP 0xa9400000
 #define LDP_F64 0x6d400000
 #define LDP_POST 0xa8c00000
@@ -112,7 +114,9 @@
 #define ORN 0xaa200000
 #define ORR 0xaa000000
 #define ORRI 0xb2000000
+#define RBIT 0xdac00000
 #define RET 0xd65f0000
+#define RORV 0x9ac02c00
 #define SBC 0xda000000
 #define SBFM 0x93000000
 #define SCVTF 0x9e620000
@@ -137,8 +141,6 @@
 #define UDIV 0x9ac00800
 #define UMULH 0x9bc03c00
 
-/* dest_reg is the absolute name of the register
-   Useful for reordering instructions in the delay slot. */
 static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
 {
 	sljit_ins *ptr = (sljit_ins*)ensure_buf(compiler, sizeof(sljit_ins));
@@ -296,8 +298,8 @@
 				}
 				next_addr = compute_next_addr(label, jump, const_, put_label);
 			}
-			code_ptr ++;
-			word_count ++;
+			code_ptr++;
+			word_count++;
 		} while (buf_ptr < buf_end);
 
 		buf = buf->next;
@@ -391,6 +393,8 @@
 #endif
 
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_CTZ:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
@@ -631,6 +635,7 @@
 		switch (op) {
 		case SLJIT_MUL:
 		case SLJIT_CLZ:
+		case SLJIT_CTZ:
 		case SLJIT_ADDC:
 		case SLJIT_SUBC:
 			/* No form with immediate operand (except imm 0, which
@@ -701,36 +706,50 @@
 			FAIL_IF(push_inst(compiler, (inst_bits ^ inv_bits) | RD(dst) | RN(reg)));
 			goto set_flags;
 		case SLJIT_SHL:
+		case SLJIT_MSHL:
 			if (flags & ARG1_IMM)
 				break;
+
 			if (flags & INT_OP) {
 				imm &= 0x1f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1)
-					| (((sljit_ins)-imm & 0x1f) << 16) | ((31 - (sljit_ins)imm) << 10)));
-			}
-			else {
+				inst_bits = (((sljit_ins)-imm & 0x1f) << 16) | ((31 - (sljit_ins)imm) << 10);
+			} else {
 				imm &= 0x3f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | (1 << 22)
-					| (((sljit_ins)-imm & 0x3f) << 16) | ((63 - (sljit_ins)imm) << 10)));
+				inst_bits = ((sljit_ins)1 << 22) | (((sljit_ins)-imm & 0x3f) << 16) | ((63 - (sljit_ins)imm) << 10);
 			}
+
+			FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | inst_bits));
 			goto set_flags;
 		case SLJIT_LSHR:
+		case SLJIT_MLSHR:
 		case SLJIT_ASHR:
+		case SLJIT_MASHR:
 			if (flags & ARG1_IMM)
 				break;
-			if (op == SLJIT_ASHR)
+
+			if (op >= SLJIT_ASHR)
 				inv_bits |= 1 << 30;
+
 			if (flags & INT_OP) {
 				imm &= 0x1f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1)
-					| ((sljit_ins)imm << 16) | (31 << 10)));
-			}
-			else {
+				inst_bits = ((sljit_ins)imm << 16) | (31 << 10);
+			} else {
 				imm &= 0x3f;
-				FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1)
-					| (1 << 22) | ((sljit_ins)imm << 16) | (63 << 10)));
+				inst_bits = ((sljit_ins)1 << 22) | ((sljit_ins)imm << 16) | (63 << 10);
 			}
+
+			FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(dst) | RN(arg1) | inst_bits));
 			goto set_flags;
+		case SLJIT_ROTL:
+		case SLJIT_ROTR:
+			if (flags & ARG1_IMM)
+				break;
+
+			if (op == SLJIT_ROTL)
+				imm = -imm;
+
+			imm &= (flags & INT_OP) ? 0x1f : 0x3f;
+			return push_inst(compiler, (EXTR ^ (inv_bits | (inv_bits >> 9))) | RD(dst) | RN(arg1) | RM(arg1) | ((sljit_ins)imm << 10));
 		default:
 			SLJIT_UNREACHABLE();
 			break;
@@ -796,6 +815,10 @@
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(arg1 == TMP_REG1);
 		return push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | RN(arg2));
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(arg1 == TMP_REG1);
+		FAIL_IF(push_inst(compiler, (RBIT ^ inv_bits) | RD(dst) | RN(arg2)));
+		return push_inst(compiler, (CLZ ^ inv_bits) | RD(dst) | RN(dst));
 	case SLJIT_ADD:
 		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
 		CHECK_FLAGS(1 << 29);
@@ -834,14 +857,23 @@
 		FAIL_IF(push_inst(compiler, (EOR ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		FAIL_IF(push_inst(compiler, (LSLV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		FAIL_IF(push_inst(compiler, (LSRV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		FAIL_IF(push_inst(compiler, (ASRV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2)));
 		break; /* Set flags. */
+	case SLJIT_ROTL:
+		FAIL_IF(push_inst(compiler, (SUB ^ inv_bits) | RD(TMP_REG2) | RN(TMP_ZERO) | RM(arg2)));
+		arg2 = TMP_REG2;
+		/* fallthrough */
+	case SLJIT_ROTR:
+		return push_inst(compiler, (RORV ^ inv_bits) | RD(dst) | RN(arg1) | RM(arg2));
 	default:
 		SLJIT_UNREACHABLE();
 		return SLJIT_SUCCESS;
@@ -895,21 +927,37 @@
 		return push_inst(compiler, STRBI | type | RT(reg) | RN(tmp_reg) | ((sljit_ins)argw << 10));
 	}
 
-	if (argw >= 0 && (argw & ((1 << shift) - 1)) == 0) {
-		if ((argw >> shift) <= 0xfff)
-			return push_inst(compiler, STRBI | type | RT(reg) | RN(arg) | ((sljit_ins)argw << (10 - shift)));
+	if ((argw & ((1 << shift) - 1)) == 0) {
+		if (argw >= 0) {
+			if ((argw >> shift) <= 0xfff)
+				return push_inst(compiler, STRBI | type | RT(reg) | RN(arg) | ((sljit_ins)argw << (10 - shift)));
 
-		if (argw <= 0xffffff) {
-			FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)argw >> 12) << 10)));
+			if (argw <= 0xffffff) {
+				FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)argw >> 12) << 10)));
 
-			argw = ((argw & 0xfff) >> shift);
+				argw = ((argw & 0xfff) >> shift);
+				return push_inst(compiler, STRBI | type | RT(reg) | RN(tmp_reg) | ((sljit_ins)argw << 10));
+			}
+		} else if (argw < -256 && argw >= -0xfff000) {
+			FAIL_IF(push_inst(compiler, SUBI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)(-argw + 0xfff) >> 12) << 10)));
+			argw = ((0x1000 + argw) & 0xfff) >> shift;
 			return push_inst(compiler, STRBI | type | RT(reg) | RN(tmp_reg) | ((sljit_ins)argw << 10));
 		}
 	}
 
-	if (argw <= 255 && argw >= -256)
+	if (argw <= 0xff && argw >= -0x100)
 		return push_inst(compiler, STURBI | type | RT(reg) | RN(arg) | (((sljit_ins)argw & 0x1ff) << 12));
 
+	if (argw >= 0) {
+		if (argw <= 0xfff0ff && ((argw + 0x100) & 0xfff) <= 0x1ff) {
+			FAIL_IF(push_inst(compiler, ADDI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)argw >> 12) << 10)));
+			return push_inst(compiler, STURBI | type | RT(reg) | RN(tmp_reg) | (((sljit_ins)argw & 0x1ff) << 12));
+		}
+	} else if (argw >= -0xfff100 && ((-argw + 0xff) & 0xfff) <= 0x1ff) {
+		FAIL_IF(push_inst(compiler, SUBI | (1 << 22) | RD(tmp_reg) | RN(arg) | (((sljit_ins)-argw >> 12) << 10)));
+		return push_inst(compiler, STURBI | type | RT(reg) | RN(tmp_reg) | (((sljit_ins)argw & 0x1ff) << 12));
+	}
+
 	FAIL_IF(load_immediate(compiler, tmp_reg, argw));
 
 	return push_inst(compiler, STRB | type | RT(reg) | RN(arg) | RM(tmp_reg));
@@ -924,14 +972,14 @@
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
 	sljit_s32 prev, fprev, saved_regs_size, i, tmp;
-	sljit_s32 word_arg_count = 0;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 	sljit_ins offs;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 2);
+	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 2);
 	saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, SSIZE_OF(f64));
 
 	local_size = (local_size + saved_regs_size + 0xf) & ~0xf;
@@ -954,7 +1002,7 @@
 	prev = -1;
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) {
 		if (prev == -1) {
 			prev = i;
 			continue;
@@ -1003,23 +1051,27 @@
 	if (prev != -1)
 		FAIL_IF(push_inst(compiler, STRI | RT(prev) | RN(SLJIT_SP) | (offs >> 5) | ((fprev == -1) ? (1 << 10) : 0)));
 
-	arg_types >>= SLJIT_ARG_SHIFT;
 
 #ifdef _WIN32
 	if (local_size > 4096)
 		FAIL_IF(push_inst(compiler, SUBI | RD(SLJIT_SP) | RN(SLJIT_SP) | (1 << 10) | (1 << 22)));
 #endif /* _WIN32 */
 
-	tmp = 0;
-	while (arg_types > 0) {
-		if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64) {
-			if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
-				FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_S0 - tmp) | RN(TMP_ZERO) | RM(SLJIT_R0 + word_arg_count)));
+	if (!(options & SLJIT_ENTER_REG_ARG)) {
+		arg_types >>= SLJIT_ARG_SHIFT;
+		saved_arg_count = 0;
+		tmp = SLJIT_R0;
+
+		while (arg_types) {
+			if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64) {
+				if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
+					FAIL_IF(push_inst(compiler, ORR | RD(SLJIT_S0 - saved_arg_count) | RN(TMP_ZERO) | RM(tmp)));
+					saved_arg_count++;
+				}
 				tmp++;
 			}
-			word_arg_count++;
+			arg_types >>= SLJIT_ARG_SHIFT;
 		}
-		arg_types >>= SLJIT_ARG_SHIFT;
 	}
 
 #ifdef _WIN32
@@ -1100,26 +1152,34 @@
 	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 2);
+	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 2);
 	saved_regs_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, SSIZE_OF(f64));
 
 	compiler->local_size = (local_size + saved_regs_size + 0xf) & ~0xf;
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_s32 local_size, prev, fprev, i, tmp;
 	sljit_ins offs;
 
 	local_size = compiler->local_size;
 
-	if (local_size > 512 && local_size <= 512 + 496) {
-		FAIL_IF(push_inst(compiler, LDP_POST | RT(TMP_FP) | RT2(TMP_LR)
-			| RN(SLJIT_SP) | ((sljit_ins)(local_size - 512) << (15 - 3))));
-		local_size = 512;
-	} else
-		FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
+	if (!is_return_to) {
+		if (local_size > 512 && local_size <= 512 + 496) {
+			FAIL_IF(push_inst(compiler, LDP_POST | RT(TMP_FP) | RT2(TMP_LR)
+				| RN(SLJIT_SP) | ((sljit_ins)(local_size - 512) << (15 - 3))));
+			local_size = 512;
+		} else
+			FAIL_IF(push_inst(compiler, LDP | RT(TMP_FP) | RT2(TMP_LR) | RN(SLJIT_SP)));
+	} else {
+		if (local_size > 512 && local_size <= 512 + 248) {
+			FAIL_IF(push_inst(compiler, LDRI_POST | RT(TMP_FP) | RN(SLJIT_SP) | ((sljit_ins)(local_size - 512) << 12)));
+			local_size = 512;
+		} else
+			FAIL_IF(push_inst(compiler, LDRI | RT(TMP_FP) | RN(SLJIT_SP) | 0));
+	}
 
 	if (local_size > 512) {
 		local_size -= 512;
@@ -1137,7 +1197,7 @@
 	prev = -1;
 
 	tmp = SLJIT_S0 - compiler->saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options); i > tmp; i--) {
 		if (prev == -1) {
 			prev = i;
 			continue;
@@ -1195,11 +1255,34 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 	return push_inst(compiler, RET | RN(TMP_LR));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(src)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1392,13 +1475,84 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_ins inv_bits, imm;
+	sljit_s32 is_left;
+	sljit_sw mask;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_left ? SLJIT_ROTL : SLJIT_ROTR) | (op & SLJIT_32), src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	inv_bits = (op & SLJIT_32) ? W_OP : 0;
+	mask = inv_bits ? 0x1f : 0x3f;
+
+	if (src2 & SLJIT_IMM) {
+		src2w &= mask;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inv_bits ? INT_SIZE : WORD_SIZE, TMP_REG2, src2, src2w, TMP_REG2));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inv_bits ? INT_SIZE : WORD_SIZE, TMP_REG1, src1, src1w, TMP_REG1));
+		src1 = TMP_REG1;
+	} else if (src1 & SLJIT_IMM) {
+		FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_IMM) {
+		if (is_left)
+			src2w = (src2w ^ mask) + 1;
+
+		return push_inst(compiler, (EXTR ^ (inv_bits | (inv_bits >> 9))) | RD(src_dst)
+			| RN(is_left ? src_dst : src1) | RM(is_left ? src1 : src_dst) | ((sljit_ins)src2w << 10));
+	}
+
+	FAIL_IF(push_inst(compiler, ((is_left ? LSLV : LSRV) ^ inv_bits) | RD(src_dst) | RN(src_dst) | RM(src2)));
+
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) {
+		/* Shift left/right by 1. */
+		if (is_left)
+			imm = (sljit_ins)(inv_bits ? ((1 << 16) | (31 << 10)) : ((1 << 16) | (63 << 10) | (1 << 22)));
+		else
+			imm = (sljit_ins)(inv_bits ? ((31 << 16) | (30 << 10)) : ((63 << 16) | (62 << 10) | (1 << 22)));
+
+		FAIL_IF(push_inst(compiler, (UBFM ^ inv_bits) | RD(TMP_REG1) | RN(src1) | imm));
+
+		/* Set imm to mask. */
+		imm = (sljit_ins)(inv_bits ? (4 << 10) : ((5 << 10) | (1 << 22)));
+		FAIL_IF(push_inst(compiler, (EORI ^ inv_bits) | RD(TMP_REG2) | RN(src2) | imm));
+
+		src1 = TMP_REG1;
+	} else
+		FAIL_IF(push_inst(compiler, (SUB ^ inv_bits) | RD(TMP_REG2) | RN(TMP_ZERO) | RM(src2)));
+
+	FAIL_IF(push_inst(compiler, ((is_left ? LSRV : LSLV) ^ inv_bits) | RD(TMP_REG1) | RN(src1) | RM(TMP_REG2)));
+	return push_inst(compiler, (ORR ^ inv_bits) | RD(src_dst) | RN(src_dst) | RM(TMP_REG1));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1550,10 +1704,9 @@
 		emit_op_mem(compiler, ((GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) ? INT_SIZE : WORD_SIZE), TMP_REG1, src, srcw, TMP_REG1);
 		src = TMP_REG1;
 	} else if (src & SLJIT_IMM) {
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 			srcw = (sljit_s32)srcw;
-#endif
+
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 	}
@@ -1699,11 +1852,15 @@
 {
 	switch (type) {
 	case SLJIT_EQUAL:
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
 		return 0x1;
 
 	case SLJIT_NOT_EQUAL:
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL: /* Not supported. */
 		return 0x0;
 
 	case SLJIT_CARRY:
@@ -1712,7 +1869,6 @@
 		/* fallthrough */
 
 	case SLJIT_LESS:
-	case SLJIT_LESS_F64:
 		return 0x2;
 
 	case SLJIT_NOT_CARRY:
@@ -1721,27 +1877,33 @@
 		/* fallthrough */
 
 	case SLJIT_GREATER_EQUAL:
-	case SLJIT_GREATER_EQUAL_F64:
 		return 0x3;
 
 	case SLJIT_GREATER:
-	case SLJIT_GREATER_F64:
+	case SLJIT_UNORDERED_OR_GREATER:
 		return 0x9;
 
 	case SLJIT_LESS_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
 		return 0x8;
 
 	case SLJIT_SIG_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
 		return 0xa;
 
 	case SLJIT_SIG_GREATER_EQUAL:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
 		return 0xb;
 
 	case SLJIT_SIG_GREATER:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
 		return 0xd;
 
 	case SLJIT_SIG_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		return 0xc;
 
 	case SLJIT_OVERFLOW:
@@ -1749,7 +1911,7 @@
 			return 0x0;
 		/* fallthrough */
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		return 0x7;
 
 	case SLJIT_NOT_OVERFLOW:
@@ -1757,9 +1919,16 @@
 			return 0x1;
 		/* fallthrough */
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		return 0x6;
 
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+		return 0x5;
+
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		return 0x4;
+
 	default:
 		SLJIT_UNREACHABLE();
 		return 0xe;
@@ -1816,15 +1985,11 @@
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
 }
 
@@ -1869,10 +2034,10 @@
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (!(src & SLJIT_IMM)) {
 		if (src & SLJIT_MEM) {
+			ADJUST_LOCAL_OFFSET(src, srcw);
 			FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
 			src = TMP_REG1;
 		}
@@ -1897,28 +2062,24 @@
 	SLJIT_UNUSED_ARG(arg_types);
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
 		src = TMP_REG1;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, ORR | RD(TMP_REG1) | RN(TMP_ZERO) | RM(src)));
 			src = TMP_REG1;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP;
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
@@ -1933,7 +2094,7 @@
 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
 	ADJUST_LOCAL_OFFSET(dst, dstw);
 
-	cc = get_cc(compiler, type & 0xff);
+	cc = get_cc(compiler, type);
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
 	if (GET_OPCODE(op) < SLJIT_ADD) {
@@ -1974,22 +2135,21 @@
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_ins inv_bits = (dst_reg & SLJIT_32) ? W_OP : 0;
+	sljit_ins inv_bits = (type & SLJIT_32) ? W_OP : 0;
 	sljit_ins cc;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
-		if (dst_reg & SLJIT_32)
+		if (type & SLJIT_32)
 			srcw = (sljit_s32)srcw;
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 		srcw = 0;
 	}
 
-	cc = get_cc(compiler, type & 0xff);
-	dst_reg &= ~SLJIT_32;
+	cc = get_cc(compiler, type & ~SLJIT_32);
 
 	return push_inst(compiler, (CSEL ^ inv_bits) | (cc << 12) | RD(dst_reg) | RN(dst_reg) | RM(src));
 }
@@ -1998,11 +2158,82 @@
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
-	sljit_u32 sign = 0, inst;
+	sljit_u32 inst;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	if (!(mem & REG_MASK)) {
+		FAIL_IF(load_immediate(compiler, TMP_REG1, memw & ~0x1f8));
+
+		mem = SLJIT_MEM1(TMP_REG1);
+		memw &= 0x1f8;
+	} else if (mem & OFFS_REG_MASK) {
+		FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RN(mem & REG_MASK) | RM(OFFS_REG(mem)) | ((sljit_ins)(memw & 0x3) << 10)));
+
+		mem = SLJIT_MEM1(TMP_REG1);
+		memw = 0;
+	} else if ((memw & 0x7) != 0 || memw > 0x1f8 || memw < -0x200) {
+		inst = ADDI;
+
+		if (memw < 0) {
+			/* Remains negative for integer min. */
+			memw = -memw;
+			inst = SUBI;
+		} else if ((memw & 0x7) == 0 && memw <= 0x7ff0) {
+			if (!(type & SLJIT_MEM_STORE) && (mem & REG_MASK) == REG_PAIR_FIRST(reg)) {
+				FAIL_IF(push_inst(compiler, LDRI | RD(REG_PAIR_SECOND(reg)) | RN(mem & REG_MASK) | ((sljit_ins)memw << 7)));
+				return push_inst(compiler, LDRI | RD(REG_PAIR_FIRST(reg)) | RN(mem & REG_MASK) | ((sljit_ins)(memw + 0x8) << 7));
+			}
+
+			inst = (type & SLJIT_MEM_STORE) ? STRI : LDRI;
+
+			FAIL_IF(push_inst(compiler, inst | RD(REG_PAIR_FIRST(reg)) | RN(mem & REG_MASK) | ((sljit_ins)memw << 7)));
+			return push_inst(compiler, inst | RD(REG_PAIR_SECOND(reg)) | RN(mem & REG_MASK) | ((sljit_ins)(memw + 0x8) << 7));
+		}
+
+		if ((sljit_uw)memw <= 0xfff) {
+			FAIL_IF(push_inst(compiler, inst | RD(TMP_REG1) | RN(mem & REG_MASK) | ((sljit_ins)memw << 10)));
+			memw = 0;
+		} else if ((sljit_uw)memw <= 0xffffff) {
+			FAIL_IF(push_inst(compiler, inst | (1 << 22) | RD(TMP_REG1) | RN(mem & REG_MASK) | (((sljit_ins)memw >> 12) << 10)));
+
+			if ((memw & 0xe07) != 0) {
+				FAIL_IF(push_inst(compiler, inst | RD(TMP_REG1) | RN(TMP_REG1) | (((sljit_ins)memw & 0xfff) << 10)));
+				memw = 0;
+			} else {
+				memw &= 0xfff;
+			}
+		} else {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, memw));
+			FAIL_IF(push_inst(compiler, (inst == ADDI ? ADD : SUB) | RD(TMP_REG1) | RN(mem & REG_MASK) | RM(TMP_REG1)));
+			memw = 0;
+		}
+
+		mem = SLJIT_MEM1(TMP_REG1);
+
+		if (inst == SUBI)
+			memw = -memw;
+	}
+
+	SLJIT_ASSERT((memw & 0x7) == 0 && memw <= 0x1f8 && memw >= -0x200);
+	return push_inst(compiler, ((type & SLJIT_MEM_STORE) ? STP : LDP) | RT(REG_PAIR_FIRST(reg)) | RT2(REG_PAIR_SECOND(reg)) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x3f8) << 12));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_u32 sign = 0, inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
+
 	if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256))
 		return SLJIT_ERR_UNSUPPORTED;
 
@@ -2042,20 +2273,20 @@
 	if (!(type & SLJIT_MEM_STORE))
 		inst |= sign ? 0x00800000 : 0x00400000;
 
-	if (type & SLJIT_MEM_PRE)
+	if (!(type & SLJIT_MEM_POST))
 		inst |= 0x800;
 
 	return push_inst(compiler, inst | RT(reg) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x1ff) << 12));
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_u32 inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+	CHECK(check_sljit_emit_fmem_update(compiler, type, freg, mem, memw));
 
 	if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -256))
 		return SLJIT_ERR_UNSUPPORTED;
@@ -2071,7 +2302,7 @@
 	if (!(type & SLJIT_MEM_STORE))
 		inst |= 0x00400000;
 
-	if (type & SLJIT_MEM_PRE)
+	if (!(type & SLJIT_MEM_POST))
 		inst |= 0x800;
 
 	return push_inst(compiler, inst | VT(freg) | RN(mem & REG_MASK) | (sljit_ins)((memw & 0x1ff) << 12));
diff --git a/src/sljit/sljitNativeARM_T2_32.c b/src/sljit/sljitNativeARM_T2_32.c
index ed21ea7..7d6bac0 100644
--- a/src/sljit/sljitNativeARM_T2_32.c
+++ b/src/sljit/sljitNativeARM_T2_32.c
@@ -100,7 +100,6 @@
 #define ADDS		0x1800
 #define ADDSI3		0x1c00
 #define ADDSI8		0x3000
-#define ADD_W		0xeb000000
 #define ADDWI		0xf2000000
 #define ADD_SP		0x4485
 #define ADD_SP_I	0xb000
@@ -131,6 +130,7 @@
 #define IT		0xbf00
 #define LDR_SP		0x9800
 #define LDR		0xf8d00000
+#define LDRD		0xe9500000
 #define LDRI		0xf8500800
 #define LSLS		0x4080
 #define LSLSI		0x0000
@@ -160,6 +160,10 @@
 #define POP_W		0xe8bd0000
 #define PUSH		0xb400
 #define PUSH_W		0xe92d0000
+#define RBIT		0xfa90f0a0
+#define RORS		0x41c0
+#define ROR_W		0xfa60f000
+#define ROR_WI		0xea4f0030
 #define RSB_WI		0xf1c00000
 #define RSBSI		0x4240
 #define SBCI		0xf1600000
@@ -167,6 +171,7 @@
 #define SBC_W		0xeb600000
 #define SDIV		0xfb90f0f0
 #define SMULL		0xfb800000
+#define STRD		0xe9400000
 #define STR_SP		0x9000
 #define SUBS		0x1a00
 #define SUBSI3		0x1e00
@@ -434,8 +439,8 @@
 				}
 				next_addr = compute_next_addr(label, jump, const_, put_label);
 			}
-			code_ptr ++;
-			half_count ++;
+			code_ptr++;
+			half_count++;
 		} while (buf_ptr < buf_end);
 
 		buf = buf->next;
@@ -491,6 +496,8 @@
 #endif
 
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_CTZ:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
@@ -592,7 +599,7 @@
 	   arg1 must be register, imm
 	   arg2 must be register, imm */
 	sljit_s32 reg;
-	sljit_uw imm, nimm;
+	sljit_uw imm, imm2;
 
 	if (SLJIT_UNLIKELY((flags & (ARG1_IMM | ARG2_IMM)) == (ARG1_IMM | ARG2_IMM))) {
 		/* Both are immediates, no temporaries are used. */
@@ -607,6 +614,7 @@
 
 		switch (flags & 0xffff) {
 		case SLJIT_CLZ:
+		case SLJIT_CTZ:
 		case SLJIT_MUL:
 			/* No form with immediate operand. */
 			break;
@@ -621,31 +629,31 @@
 			break;
 		case SLJIT_ADD:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
-			nimm = NEGATE(imm);
+			imm2 = NEGATE(imm);
 			if (IS_2_LO_REGS(reg, dst)) {
 				if (imm <= 0x7)
 					return push_inst16(compiler, ADDSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
-				if (nimm <= 0x7)
-					return push_inst16(compiler, SUBSI3 | IMM3(nimm) | RD3(dst) | RN3(reg));
+				if (imm2 <= 0x7)
+					return push_inst16(compiler, SUBSI3 | IMM3(imm2) | RD3(dst) | RN3(reg));
 				if (reg == dst) {
 					if (imm <= 0xff)
 						return push_inst16(compiler, ADDSI8 | IMM8(imm) | RDN3(dst));
-					if (nimm <= 0xff)
-						return push_inst16(compiler, SUBSI8 | IMM8(nimm) | RDN3(dst));
+					if (imm2 <= 0xff)
+						return push_inst16(compiler, SUBSI8 | IMM8(imm2) | RDN3(dst));
 				}
 			}
 			if (!(flags & SET_FLAGS)) {
 				if (imm <= 0xfff)
 					return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(imm));
-				if (nimm <= 0xfff)
-					return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(nimm));
+				if (imm2 <= 0xfff)
+					return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(imm2));
 			}
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
-			nimm = get_imm(NEGATE(imm));
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
+			imm = get_imm(NEGATE(imm));
+			if (imm != INVALID_IMM)
+				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_ADDC:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
@@ -666,39 +674,39 @@
 			if (flags & UNUSED_RETURN) {
 				if (imm <= 0xff && reg_map[reg] <= 7)
 					return push_inst16(compiler, CMPI | IMM8(imm) | RDN3(reg));
-				nimm = get_imm(imm);
-				if (nimm != INVALID_IMM)
-					return push_inst32(compiler, CMPI_W | RN4(reg) | nimm);
-				nimm = get_imm(NEGATE(imm));
-				if (nimm != INVALID_IMM)
-					return push_inst32(compiler, CMNI_W | RN4(reg) | nimm);
+				imm2 = get_imm(imm);
+				if (imm2 != INVALID_IMM)
+					return push_inst32(compiler, CMPI_W | RN4(reg) | imm2);
+				imm = get_imm(NEGATE(imm));
+				if (imm != INVALID_IMM)
+					return push_inst32(compiler, CMNI_W | RN4(reg) | imm);
 				break;
 			}
-			nimm = NEGATE(imm);
+			imm2 = NEGATE(imm);
 			if (IS_2_LO_REGS(reg, dst)) {
 				if (imm <= 0x7)
 					return push_inst16(compiler, SUBSI3 | IMM3(imm) | RD3(dst) | RN3(reg));
-				if (nimm <= 0x7)
-					return push_inst16(compiler, ADDSI3 | IMM3(nimm) | RD3(dst) | RN3(reg));
+				if (imm2 <= 0x7)
+					return push_inst16(compiler, ADDSI3 | IMM3(imm2) | RD3(dst) | RN3(reg));
 				if (reg == dst) {
 					if (imm <= 0xff)
 						return push_inst16(compiler, SUBSI8 | IMM8(imm) | RDN3(dst));
-					if (nimm <= 0xff)
-						return push_inst16(compiler, ADDSI8 | IMM8(nimm) | RDN3(dst));
+					if (imm2 <= 0xff)
+						return push_inst16(compiler, ADDSI8 | IMM8(imm2) | RDN3(dst));
 				}
 			}
 			if (!(flags & SET_FLAGS)) {
 				if (imm <= 0xfff)
 					return push_inst32(compiler, SUBWI | RD4(dst) | RN4(reg) | IMM12(imm));
-				if (nimm <= 0xfff)
-					return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(nimm));
+				if (imm2 <= 0xfff)
+					return push_inst32(compiler, ADDWI | RD4(dst) | RN4(reg) | IMM12(imm2));
 			}
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
-			nimm = get_imm(NEGATE(imm));
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, SUB_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
+			imm = get_imm(NEGATE(imm));
+			if (imm != INVALID_IMM)
+				return push_inst32(compiler, ADD_WI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_SUBC:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
@@ -709,17 +717,17 @@
 				return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_AND:
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ((flags & UNUSED_RETURN) ? TSTI : ANDI) | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ((flags & UNUSED_RETURN) ? TSTI : ANDI) | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
 			imm = get_imm(~imm);
 			if (imm != INVALID_IMM)
 				return push_inst32(compiler, BICI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_OR:
-			nimm = get_imm(imm);
-			if (nimm != INVALID_IMM)
-				return push_inst32(compiler, ORRI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | nimm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ORRI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
 			imm = get_imm(~imm);
 			if (imm != INVALID_IMM)
 				return push_inst32(compiler, ORNI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
@@ -730,11 +738,17 @@
 				return push_inst32(compiler, EORI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_SHL:
+		case SLJIT_MSHL:
 		case SLJIT_LSHR:
+		case SLJIT_MLSHR:
 		case SLJIT_ASHR:
+		case SLJIT_MASHR:
+		case SLJIT_ROTL:
+		case SLJIT_ROTR:
 			if (flags & ARG1_IMM)
 				break;
 			imm &= 0x1f;
+
 			if (imm == 0) {
 				if (!(flags & SET_FLAGS))
 					return push_inst16(compiler, MOV | SET_REGS44(dst, reg));
@@ -742,19 +756,28 @@
 					return push_inst16(compiler, MOVS | RD3(dst) | RN3(reg));
 				return push_inst32(compiler, MOV_W | SET_FLAGS | RD4(dst) | RM4(reg));
 			}
+
 			switch (flags & 0xffff) {
 			case SLJIT_SHL:
+			case SLJIT_MSHL:
 				if (IS_2_LO_REGS(dst, reg))
 					return push_inst16(compiler, LSLSI | RD3(dst) | RN3(reg) | (imm << 6));
 				return push_inst32(compiler, LSL_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
 			case SLJIT_LSHR:
+			case SLJIT_MLSHR:
 				if (IS_2_LO_REGS(dst, reg))
 					return push_inst16(compiler, LSRSI | RD3(dst) | RN3(reg) | (imm << 6));
 				return push_inst32(compiler, LSR_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
-			default: /* SLJIT_ASHR */
+			case SLJIT_ASHR:
+			case SLJIT_MASHR:
 				if (IS_2_LO_REGS(dst, reg))
 					return push_inst16(compiler, ASRSI | RD3(dst) | RN3(reg) | (imm << 6));
 				return push_inst32(compiler, ASR_WI | (flags & SET_FLAGS) | RD4(dst) | RM4(reg) | IMM5(imm));
+			case SLJIT_ROTL:
+				imm = (imm ^ 0x1f) + 1;
+				/* fallthrough */
+			default: /* SLJIT_ROTR */
+				return push_inst32(compiler, ROR_WI | RD4(dst) | RM4(reg) | IMM5(imm));
 			}
 		default:
 			SLJIT_UNREACHABLE();
@@ -813,8 +836,11 @@
 		return push_inst32(compiler, MVN_W | (flags & SET_FLAGS) | RD4(dst) | RM4(arg2));
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(arg1 == TMP_REG2);
-		FAIL_IF(push_inst32(compiler, CLZ | RN4(arg2) | RD4(dst) | RM4(arg2)));
-		return SLJIT_SUCCESS;
+		return push_inst32(compiler, CLZ | RN4(arg2) | RD4(dst) | RM4(arg2));
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(arg1 == TMP_REG2);
+		FAIL_IF(push_inst32(compiler, RBIT | RN4(arg2) | RD4(dst) | RM4(arg2)));
+		return push_inst32(compiler, CLZ | RN4(dst) | RD4(dst) | RM4(dst));
 	case SLJIT_ADD:
 		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
 		if (IS_3_LO_REGS(dst, arg1, arg2))
@@ -864,18 +890,38 @@
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, EORS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, EOR_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_MSHL:
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(arg2) | 0x1f));
+		arg2 = TMP_REG2;
+		/* fallthrough */
 	case SLJIT_SHL:
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, LSLS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, LSL_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_MLSHR:
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(arg2) | 0x1f));
+		arg2 = TMP_REG2;
+		/* fallthrough */
 	case SLJIT_LSHR:
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, LSRS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, LSR_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_MASHR:
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(arg2) | 0x1f));
+		arg2 = TMP_REG2;
+		/* fallthrough */
 	case SLJIT_ASHR:
 		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
 			return push_inst16(compiler, ASRS | RD3(dst) | RN3(arg2));
 		return push_inst32(compiler, ASR_W | (flags & SET_FLAGS) | RD4(dst) | RN4(arg1) | RM4(arg2));
+	case SLJIT_ROTL:
+		FAIL_IF(push_inst32(compiler, RSB_WI | RD4(TMP_REG2) | RN4(arg2) | 0));
+		arg2 = TMP_REG2;
+		/* fallthrough */
+	case SLJIT_ROTR:
+		if (dst == (sljit_s32)arg1 && IS_2_LO_REGS(dst, arg2))
+			return push_inst16(compiler, RORS | RD3(dst) | RN3(arg2));
+		return push_inst32(compiler, ROR_W | RD4(dst) | RN4(arg1) | RM4(arg2));
 	}
 
 	SLJIT_UNREACHABLE();
@@ -890,8 +936,8 @@
 #define HALF_SIZE	0x08
 #define PRELOAD		0x0c
 
-#define IS_WORD_SIZE(flags)		(!(flags & (BYTE_SIZE | HALF_SIZE)))
-#define OFFSET_CHECK(imm, shift)	(!(argw & ~(imm << shift)))
+#define IS_WORD_SIZE(flags)		(!((flags) & (BYTE_SIZE | HALF_SIZE)))
+#define ALIGN_CHECK(argw, imm, shift)	(!((argw) & ~((imm) << (shift))))
 
 /*
   1st letter:
@@ -990,16 +1036,15 @@
 	sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg)
 {
 	sljit_s32 other_r;
-	sljit_uw tmp;
+	sljit_uw imm, tmp;
 
 	SLJIT_ASSERT(arg & SLJIT_MEM);
-	SLJIT_ASSERT((arg & REG_MASK) != tmp_reg);
-	arg &= ~SLJIT_MEM;
+	SLJIT_ASSERT((arg & REG_MASK) != tmp_reg || (arg == SLJIT_MEM1(tmp_reg) && argw >= -0xff && argw <= 0xfff));
 
 	if (SLJIT_UNLIKELY(!(arg & REG_MASK))) {
-		tmp = get_imm((sljit_uw)argw & ~(sljit_uw)0xfff);
-		if (tmp != INVALID_IMM) {
-			FAIL_IF(push_inst32(compiler, MOV_WI | RD4(tmp_reg) | tmp));
+		imm = get_imm((sljit_uw)argw & ~(sljit_uw)0xfff);
+		if (imm != INVALID_IMM) {
+			FAIL_IF(push_inst32(compiler, MOV_WI | RD4(tmp_reg) | imm));
 			return push_inst32(compiler, sljit_mem32[flags] | MEM_IMM12 | RT4(reg) | RN4(tmp_reg) | (argw & 0xfff));
 		}
 
@@ -1012,51 +1057,59 @@
 	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
 		argw &= 0x3;
 		other_r = OFFS_REG(arg);
-		arg &= 0xf;
+		arg &= REG_MASK;
 
 		if (!argw && IS_3_LO_REGS(reg, arg, other_r))
 			return push_inst16(compiler, sljit_mem16[flags] | RD3(reg) | RN3(arg) | RM3(other_r));
 		return push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(other_r) | ((sljit_ins)argw << 4));
 	}
 
+	arg &= REG_MASK;
+
 	if (argw > 0xfff) {
-		tmp = get_imm((sljit_uw)argw & ~(sljit_uw)0xfff);
-		if (tmp != INVALID_IMM) {
-			push_inst32(compiler, ADD_WI | RD4(tmp_reg) | RN4(arg) | tmp);
+		imm = get_imm((sljit_uw)(argw & ~0xfff));
+		if (imm != INVALID_IMM) {
+			push_inst32(compiler, ADD_WI | RD4(tmp_reg) | RN4(arg) | imm);
 			arg = tmp_reg;
 			argw = argw & 0xfff;
 		}
 	}
 	else if (argw < -0xff) {
-		tmp = get_imm((sljit_uw)-argw & ~(sljit_uw)0xff);
-		if (tmp != INVALID_IMM) {
-			push_inst32(compiler, SUB_WI | RD4(tmp_reg) | RN4(arg) | tmp);
+		tmp = (sljit_uw)((-argw + 0xfff) & ~0xfff);
+		SLJIT_ASSERT(tmp >= (sljit_uw)-argw);
+		imm = get_imm(tmp);
+
+		if (imm != INVALID_IMM) {
+			push_inst32(compiler, SUB_WI | RD4(tmp_reg) | RN4(arg) | imm);
 			arg = tmp_reg;
-			argw = -(-argw & 0xff);
+			argw += (sljit_sw)tmp;
+
+			SLJIT_ASSERT(argw >= 0 && argw <= 0xfff);
 		}
 	}
 
+	/* 16 bit instruction forms. */
 	if (IS_2_LO_REGS(reg, arg) && sljit_mem16_imm5[flags]) {
 		tmp = 3;
 		if (IS_WORD_SIZE(flags)) {
-			if (OFFSET_CHECK(0x1f, 2))
+			if (ALIGN_CHECK(argw, 0x1f, 2))
 				tmp = 2;
 		}
 		else if (flags & BYTE_SIZE)
 		{
-			if (OFFSET_CHECK(0x1f, 0))
+			if (ALIGN_CHECK(argw, 0x1f, 0))
 				tmp = 0;
 		}
 		else {
 			SLJIT_ASSERT(flags & HALF_SIZE);
-			if (OFFSET_CHECK(0x1f, 1))
+			if (ALIGN_CHECK(argw, 0x1f, 1))
 				tmp = 1;
 		}
 
 		if (tmp < 3)
 			return push_inst16(compiler, sljit_mem16_imm5[flags] | RD3(reg) | RN3(arg) | ((sljit_ins)argw << (6 - tmp)));
 	}
-	else if (SLJIT_UNLIKELY(arg == SLJIT_SP) && IS_WORD_SIZE(flags) && OFFSET_CHECK(0xff, 2) && reg_map[reg] <= 7) {
+	else if (SLJIT_UNLIKELY(arg == SLJIT_SP) && IS_WORD_SIZE(flags) && ALIGN_CHECK(argw, 0xff, 2) && reg_map[reg] <= 7) {
 		/* SP based immediate. */
 		return push_inst16(compiler, STR_SP | (sljit_ins)((flags & STORE) ? 0 : 0x800) | RDN3(reg) | ((sljit_ins)argw >> 2));
 	}
@@ -1074,6 +1127,9 @@
 	return push_inst32(compiler, sljit_mem32[flags] | RT4(reg) | RN4(arg) | RM4(tmp_reg));
 }
 
+#undef ALIGN_CHECK
+#undef IS_WORD_SIZE
+
 /* --------------------------------------------------------------------- */
 /*  Entry, exit                                                          */
 /* --------------------------------------------------------------------- */
@@ -1082,7 +1138,8 @@
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-	sljit_s32 size, i, tmp, word_arg_count, saved_arg_count;
+	sljit_s32 size, i, tmp, word_arg_count;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 	sljit_uw offset;
 	sljit_uw imm = 0;
 #ifdef __SOFTFP__
@@ -1098,7 +1155,7 @@
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--)
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--)
 		imm |= (sljit_uw)1 << reg_map[i];
 
 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--)
@@ -1110,7 +1167,7 @@
 		: push_inst16(compiler, PUSH | (1 << 8) | imm));
 
 	/* Stack must be aligned to 8 bytes: (LR, R4) */
-	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1);
 
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((size & SSIZE_OF(sw)) != 0) {
@@ -1131,6 +1188,9 @@
 	local_size = ((size + local_size + 0x7) & ~0x7) - size;
 	compiler->local_size = local_size;
 
+	if (options & SLJIT_ENTER_REG_ARG)
+		arg_types = 0;
+
 	arg_types >>= SLJIT_ARG_SHIFT;
 	word_arg_count = 0;
 	saved_arg_count = 0;
@@ -1173,13 +1233,14 @@
 			else
 				break;
 
-			SLJIT_ASSERT(reg_map[tmp] <= 7);
-
 			if (offset < 4 * sizeof(sljit_sw))
-				FAIL_IF(push_inst16(compiler, MOV | RD3(tmp) | (offset << 1)));
-			else
+				FAIL_IF(push_inst16(compiler, MOV | ((sljit_ins)reg_map[tmp] & 0x7) | (((sljit_ins)reg_map[tmp] & 0x8) << 4) | (offset << 1)));
+			else if (reg_map[tmp] <= 7)
 				FAIL_IF(push_inst16(compiler, LDR_SP | RDN3(tmp)
 					| ((offset + (sljit_uw)size - 4 * sizeof(sljit_sw)) >> 2)));
+			else
+				FAIL_IF(push_inst32(compiler, LDR | RT4(tmp) | RN4(SLJIT_SP)
+					| ((offset + (sljit_uw)size - 4 * sizeof(sljit_sw)))));
 			break;
 		}
 
@@ -1293,7 +1354,7 @@
 	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
 
 	if ((size & SSIZE_OF(sw)) != 0 && (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG))
 		size += SSIZE_OF(sw);
@@ -1325,8 +1386,9 @@
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size)
 {
 	sljit_s32 local_size, fscratches, fsaveds, i, tmp;
+	sljit_s32 restored_reg = 0;
 	sljit_s32 lr_dst = TMP_PC;
-	sljit_uw reg_list;
+	sljit_uw reg_list = 0;
 
 	SLJIT_ASSERT(reg_map[TMP_REG2] == 14 && frame_size <= 128);
 
@@ -1353,46 +1415,88 @@
 	if (frame_size < 0) {
 		lr_dst = TMP_REG2;
 		frame_size = 0;
-	} else if (frame_size > 0)
+	} else if (frame_size > 0) {
+		SLJIT_ASSERT(frame_size == 1 || (frame_size & 0x7) == 0);
 		lr_dst = 0;
+		frame_size &= ~0x7;
+	}
 
-	reg_list = 0;
 	tmp = SLJIT_S0 - compiler->saveds;
-	for (i = SLJIT_S0; i > tmp; i--)
-		reg_list |= (sljit_uw)1 << reg_map[i];
+	i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	if (tmp < i) {
+		restored_reg = i;
+		do {
+			reg_list |= (sljit_uw)1 << reg_map[i];
+		} while (--i > tmp);
+	}
 
-	for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--)
-		reg_list |= (sljit_uw)1 << reg_map[i];
+	i = compiler->scratches;
+	if (i >= SLJIT_FIRST_SAVED_REG) {
+		restored_reg = i;
+		do {
+			reg_list |= (sljit_uw)1 << reg_map[i];
+		} while (--i >= SLJIT_FIRST_SAVED_REG);
+	}
+
+	if (lr_dst == TMP_REG2 && reg_list == 0) {
+		reg_list |= (sljit_uw)1 << reg_map[TMP_REG2];
+		restored_reg = TMP_REG2;
+		lr_dst = 0;
+	}
 
 	if (lr_dst == 0 && (reg_list & (reg_list - 1)) == 0) {
 		/* The local_size does not include the saved registers. */
+		tmp = 0;
+		if (reg_list != 0) {
+			tmp = 2;
+			if (local_size <= 0xfff) {
+				if (local_size == 0) {
+					SLJIT_ASSERT(restored_reg != TMP_REG2);
+					if (frame_size == 0)
+						return push_inst32(compiler, LDRI | RT4(restored_reg) | RN4(SLJIT_SP) | 0x308);
+					if (frame_size > 2 * SSIZE_OF(sw))
+						return push_inst32(compiler, LDRI | RT4(restored_reg) | RN4(SLJIT_SP) | 0x100 | (sljit_ins)(frame_size - (2 * SSIZE_OF(sw))));
+				}
+
+				if (reg_map[restored_reg] <= 7 && local_size <= 0x3fc)
+					FAIL_IF(push_inst16(compiler, STR_SP | 0x800 | RDN3(restored_reg) | (sljit_ins)(local_size >> 2)));
+				else
+					FAIL_IF(push_inst32(compiler, LDR | RT4(restored_reg) | RN4(SLJIT_SP) | (sljit_ins)local_size));
+				tmp = 1;
+			} else if (frame_size == 0) {
+				frame_size = (restored_reg == TMP_REG2) ? SSIZE_OF(sw) : 2 * SSIZE_OF(sw);
+				tmp = 3;
+			}
+
+			/* Place for the saved register. */
+			if (restored_reg != TMP_REG2)
+				local_size += SSIZE_OF(sw);
+		}
+
+		/* Place for the lr register. */
 		local_size += SSIZE_OF(sw);
 
-		if (reg_list != 0)
-			local_size += SSIZE_OF(sw);
-
 		if (frame_size > local_size)
-			FAIL_IF(push_inst16(compiler, SUB_SP_I | ((sljit_uw)(frame_size - local_size) >> 2)));
+			FAIL_IF(push_inst16(compiler, SUB_SP_I | ((sljit_ins)(frame_size - local_size) >> 2)));
 		else if (frame_size < local_size)
 			FAIL_IF(emit_add_sp(compiler, (sljit_uw)(local_size - frame_size)));
 
-		if (reg_list == 0)
+		if (tmp <= 1)
 			return SLJIT_SUCCESS;
 
-		if (compiler->saveds > 0) {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[SLJIT_S0]));
-			lr_dst = SLJIT_S0;
-		} else {
-			SLJIT_ASSERT(reg_list == ((sljit_uw)1 << reg_map[SLJIT_FIRST_SAVED_REG]));
-			lr_dst = SLJIT_FIRST_SAVED_REG;
+		if (tmp == 2) {
+			frame_size -= SSIZE_OF(sw);
+			if (restored_reg != TMP_REG2)
+				frame_size -= SSIZE_OF(sw);
+
+			if (reg_map[restored_reg] <= 7)
+				return push_inst16(compiler, STR_SP | 0x800 | RDN3(restored_reg) | (sljit_ins)(frame_size >> 2));
+
+			return push_inst32(compiler, LDR | RT4(restored_reg) | RN4(SLJIT_SP) | (sljit_ins)frame_size);
 		}
 
-		frame_size -= 2 * SSIZE_OF(sw);
-
-		if (reg_map[lr_dst] <= 7)
-			return push_inst16(compiler, STR_SP | 0x800 | RDN3(lr_dst) | (sljit_uw)(frame_size >> 2));
-
-		return push_inst32(compiler, LDR | RT4(lr_dst) | RN4(SLJIT_SP) | (sljit_uw)frame_size);
+		tmp = (restored_reg == TMP_REG2) ? 0x304 : 0x308;
+		return push_inst32(compiler, LDRI | RT4(restored_reg) | RN4(SLJIT_SP) | (sljit_ins)tmp);
 	}
 
 	if (local_size > 0)
@@ -1407,12 +1511,8 @@
 
 		FAIL_IF(push_inst16(compiler, POP | reg_list));
 	} else {
-		if (lr_dst != 0) {
-			if (reg_list == 0)
-				return push_inst32(compiler, 0xf85d0b04 | RT4(lr_dst));
-
+		if (lr_dst != 0)
 			reg_list |= (sljit_uw)1 << reg_map[lr_dst];
-		}
 
 		/* At least two registers must be set for POP_W instruction. */
 		SLJIT_ASSERT((reg_list & (reg_list - 1)) != 0);
@@ -1421,8 +1521,12 @@
 	}
 
 	if (frame_size > 0)
-		return push_inst16(compiler, SUB_SP_I | (((sljit_uw)frame_size - sizeof(sljit_sw)) >> 2));
-	return SLJIT_SUCCESS;
+		return push_inst16(compiler, SUB_SP_I | (((sljit_ins)frame_size - sizeof(sljit_sw)) >> 2));
+
+	if (lr_dst != 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst16(compiler, ADD_SP_I | 1);
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
@@ -1433,6 +1537,28 @@
 	return emit_stack_frame_release(compiler, 0);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, src)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1685,13 +1811,75 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 is_left;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	op = GET_OPCODE(op);
+	is_left = (op == SLJIT_SHL || op == SLJIT_MSHL);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, is_left ? SLJIT_ROTL : SLJIT_ROTR, src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	if (src2 & SLJIT_IMM) {
+		src2w &= 0x1f;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, src2, src2w, TMP_REG2));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src1, src1w, TMP_REG1));
+		src1 = TMP_REG1;
+	} else if (src1 & SLJIT_IMM) {
+		FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)src1w));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_IMM) {
+		if (reg_map[src_dst] <= 7)
+			FAIL_IF(push_inst16(compiler, (is_left ? LSLSI : LSRSI) | RD3(src_dst) | RN3(src_dst) | ((sljit_ins)src2w << 6)));
+		else
+			FAIL_IF(push_inst32(compiler, (is_left ? LSL_WI : LSR_WI) | RD4(src_dst) | RM4(src_dst) | IMM5(src2w)));
+
+		src2w = (src2w ^ 0x1f) + 1;
+		return push_inst32(compiler, ORR_W | RD4(src_dst) | RN4(src_dst) | RM4(src1) | (is_left ? 0x10 : 0x0) | IMM5(src2w));
+	}
+
+	if (op == SLJIT_MSHL || op == SLJIT_MLSHR) {
+		FAIL_IF(push_inst32(compiler, ANDI | RD4(TMP_REG2) | RN4(src2) | 0x1f));
+		src2 = TMP_REG2;
+	}
+
+	if (IS_2_LO_REGS(src_dst, src2))
+		FAIL_IF(push_inst16(compiler, (is_left ? LSLS : LSRS) | RD3(src_dst) | RN3(src2)));
+	else
+		FAIL_IF(push_inst32(compiler, (is_left ? LSL_W : LSR_W) | RD4(src_dst) | RN4(src_dst) | RM4(src2)));
+
+	FAIL_IF(push_inst32(compiler, (is_left ? LSR_WI : LSL_WI) | RD4(TMP_REG1) | RM4(src1) | (1 << 6)));
+	FAIL_IF(push_inst32(compiler, EORI | RD4(TMP_REG2) | RN4(src2) | 0x1f));
+	FAIL_IF(push_inst32(compiler, (is_left ? LSR_W : LSL_W) | RD4(TMP_REG1) | RN4(TMP_REG1) | RM4(TMP_REG2)));
+	return push_inst32(compiler, ORR_W | RD4(src_dst) | RN4(src_dst) | RM4(TMP_REG1));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1955,8 +2143,6 @@
 	return emit_fop_mem(compiler, (op & SLJIT_32), TMP_FREG1, dst, dstw);
 }
 
-#undef FPU_LOAD
-
 /* --------------------------------------------------------------------- */
 /*  Other instructions                                                   */
 /* --------------------------------------------------------------------- */
@@ -1984,11 +2170,15 @@
 {
 	switch (type) {
 	case SLJIT_EQUAL:
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
 		return 0x0;
 
 	case SLJIT_NOT_EQUAL:
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL: /* Not supported. */
 		return 0x1;
 
 	case SLJIT_CARRY:
@@ -1997,7 +2187,6 @@
 		/* fallthrough */
 
 	case SLJIT_LESS:
-	case SLJIT_LESS_F64:
 		return 0x3;
 
 	case SLJIT_NOT_CARRY:
@@ -2006,27 +2195,33 @@
 		/* fallthrough */
 
 	case SLJIT_GREATER_EQUAL:
-	case SLJIT_GREATER_EQUAL_F64:
 		return 0x2;
 
 	case SLJIT_GREATER:
-	case SLJIT_GREATER_F64:
+	case SLJIT_UNORDERED_OR_GREATER:
 		return 0x8;
 
 	case SLJIT_LESS_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
 		return 0x9;
 
 	case SLJIT_SIG_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
 		return 0xb;
 
 	case SLJIT_SIG_GREATER_EQUAL:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
 		return 0xa;
 
 	case SLJIT_SIG_GREATER:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
 		return 0xc;
 
 	case SLJIT_SIG_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		return 0xd;
 
 	case SLJIT_OVERFLOW:
@@ -2034,7 +2229,7 @@
 			return 0x1;
 		/* fallthrough */
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		return 0x6;
 
 	case SLJIT_NOT_OVERFLOW:
@@ -2042,9 +2237,16 @@
 			return 0x0;
 		/* fallthrough */
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		return 0x7;
 
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+		return 0x4;
+
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		return 0x5;
+
 	default: /* SLJIT_JUMP */
 		SLJIT_UNREACHABLE();
 		return 0xe;
@@ -2289,52 +2491,49 @@
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 #ifdef __SOFTFP__
-	PTR_FAIL_IF(softfloat_call_with_args(compiler, arg_types, NULL, &extra_space));
-	SLJIT_ASSERT((extra_space & 0x7) == 0);
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG) {
+		PTR_FAIL_IF(softfloat_call_with_args(compiler, arg_types, NULL, &extra_space));
+		SLJIT_ASSERT((extra_space & 0x7) == 0);
 
-	if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
-		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
+		if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
+			type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
+		jump = sljit_emit_jump(compiler, type);
+		PTR_FAIL_IF(jump == NULL);
 
-	jump = sljit_emit_jump(compiler, type);
-	PTR_FAIL_IF(jump == NULL);
+		if (extra_space > 0) {
+			if (type & SLJIT_CALL_RETURN)
+				PTR_FAIL_IF(push_inst32(compiler, LDR | RT4(TMP_REG2)
+					| RN4(SLJIT_SP) | (extra_space - sizeof(sljit_sw))));
 
-	if (extra_space > 0) {
-		if (type & SLJIT_CALL_RETURN)
-			PTR_FAIL_IF(push_inst32(compiler, LDR | RT4(TMP_REG2)
-				| RN4(SLJIT_SP) | (extra_space - sizeof(sljit_sw))));
+			PTR_FAIL_IF(push_inst16(compiler, ADD_SP_I | (extra_space >> 2)));
 
-		PTR_FAIL_IF(push_inst16(compiler, ADD_SP_I | (extra_space >> 2)));
-
-		if (type & SLJIT_CALL_RETURN) {
-			PTR_FAIL_IF(push_inst16(compiler, BX | RN3(TMP_REG2)));
-			return jump;
+			if (type & SLJIT_CALL_RETURN) {
+				PTR_FAIL_IF(push_inst16(compiler, BX | RN3(TMP_REG2)));
+				return jump;
+			}
 		}
-	}
 
-	SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
-	PTR_FAIL_IF(softfloat_post_call_with_args(compiler, arg_types));
-	return jump;
-#else
+		SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
+		PTR_FAIL_IF(softfloat_post_call_with_args(compiler, arg_types));
+		return jump;
+	}
+#endif /* __SOFTFP__ */
+
 	if (type & SLJIT_CALL_RETURN) {
 		/* ldmia sp!, {..., lr} */
 		PTR_FAIL_IF(emit_stack_frame_release(compiler, -1));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
-	PTR_FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#ifndef __SOFTFP__
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		PTR_FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#endif /* !__SOFTFP__ */
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
-#endif
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
@@ -2385,56 +2584,80 @@
 		src = TMP_REG1;
 	}
 
-	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0)) {
+	if ((type & SLJIT_CALL_RETURN) && (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
 		FAIL_IF(push_inst16(compiler, MOV | SET_REGS44(TMP_REG1, src)));
 		src = TMP_REG1;
 	}
 
 #ifdef __SOFTFP__
-	FAIL_IF(softfloat_call_with_args(compiler, arg_types, &src, &extra_space));
-	SLJIT_ASSERT((extra_space & 0x7) == 0);
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG) {
+		FAIL_IF(softfloat_call_with_args(compiler, arg_types, &src, &extra_space));
+		SLJIT_ASSERT((extra_space & 0x7) == 0);
 
-	if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
-		type = SLJIT_JUMP;
+		if ((type & SLJIT_CALL_RETURN) && extra_space == 0)
+			type = SLJIT_JUMP;
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
+		FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
 
-	FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
+		if (extra_space > 0) {
+			if (type & SLJIT_CALL_RETURN)
+				FAIL_IF(push_inst32(compiler, LDR | RT4(TMP_REG2)
+					| RN4(SLJIT_SP) | (extra_space - sizeof(sljit_sw))));
 
-	if (extra_space > 0) {
-		if (type & SLJIT_CALL_RETURN)
-			FAIL_IF(push_inst32(compiler, LDR | RT4(TMP_REG2)
-				| RN4(SLJIT_SP) | (extra_space - sizeof(sljit_sw))));
+			FAIL_IF(push_inst16(compiler, ADD_SP_I | (extra_space >> 2)));
 
-		FAIL_IF(push_inst16(compiler, ADD_SP_I | (extra_space >> 2)));
+			if (type & SLJIT_CALL_RETURN)
+				return push_inst16(compiler, BX | RN3(TMP_REG2));
+		}
 
-		if (type & SLJIT_CALL_RETURN)
-			return push_inst16(compiler, BX | RN3(TMP_REG2));
+		SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
+		return softfloat_post_call_with_args(compiler, arg_types);
 	}
+#endif /* __SOFTFP__ */
 
-	SLJIT_ASSERT(!(type & SLJIT_CALL_RETURN));
-	return softfloat_post_call_with_args(compiler, arg_types);
-#else /* !__SOFTFP__ */
 	if (type & SLJIT_CALL_RETURN) {
 		/* ldmia sp!, {..., lr} */
 		FAIL_IF(emit_stack_frame_release(compiler, -1));
 		type = SLJIT_JUMP;
 	}
 
-	FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#ifndef __SOFTFP__
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		FAIL_IF(hardfloat_call_with_args(compiler, arg_types));
+#endif /* !__SOFTFP__ */
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_ijump(compiler, type, src, srcw);
-#endif /* __SOFTFP__ */
 }
 
+#ifdef __SOFTFP__
+
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	if (compiler->options & SLJIT_ENTER_REG_ARG) {
+		if (src == SLJIT_FR0)
+			return SLJIT_SUCCESS;
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+	}
+
+	if (FAST_IS_REG(src)) {
+		if (op & SLJIT_32)
+			return push_inst32(compiler, VMOV | (1 << 20) | DN4(src) | RT4(SLJIT_R0));
+		return push_inst32(compiler, VMOV2 | (1 << 20) | DM4(src) | RT4(SLJIT_R0) | RN4(SLJIT_R1));
+	}
+
+	SLJIT_SKIP_CHECKS(compiler);
+
+	if (op & SLJIT_32)
+		return sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw);
+	return sljit_emit_mem(compiler, SLJIT_MOV, SLJIT_REG_PAIR(SLJIT_R0, SLJIT_R1), src, srcw);
+}
+
+#endif /* __SOFTFP__ */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
@@ -2447,7 +2670,7 @@
 	ADJUST_LOCAL_OFFSET(dst, dstw);
 
 	op = GET_OPCODE(op);
-	cc = get_cc(compiler, type & 0xff);
+	cc = get_cc(compiler, type);
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
 	if (op < SLJIT_ADD) {
@@ -2497,9 +2720,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	dst_reg &= ~SLJIT_32;
-
-	cc = get_cc(compiler, type & 0xff);
+	cc = get_cc(compiler, type & ~SLJIT_32);
 
 	if (!(src & SLJIT_IMM)) {
 		FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
@@ -2541,11 +2762,186 @@
 	sljit_s32 mem, sljit_sw memw)
 {
 	sljit_s32 flags;
-	sljit_ins inst;
+	sljit_uw imm, tmp;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
 
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	if (type & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32)) {
+		if ((mem & REG_MASK) == 0) {
+			if ((memw & 0xfff) >= (0x1000 - SSIZE_OF(sw))) {
+				imm = get_imm((sljit_uw)((memw + 0x1000) & ~0xfff));
+
+				if (imm != INVALID_IMM)
+					memw = (memw & 0xfff) - 0x1000;
+			} else {
+				imm = get_imm((sljit_uw)(memw & ~0xfff));
+
+				if (imm != INVALID_IMM)
+					memw &= 0xff;
+			}
+
+			if (imm == INVALID_IMM) {
+				FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)memw));
+				memw = 0;
+			} else
+				FAIL_IF(push_inst32(compiler, MOV_WI | RD4(TMP_REG1) | imm));
+
+			mem = SLJIT_MEM1(TMP_REG1);
+		} else if (mem & OFFS_REG_MASK) {
+			FAIL_IF(push_inst32(compiler, ADD_W | RD4(TMP_REG1) | RN4(mem & REG_MASK) | RM4(OFFS_REG(mem)) | ((sljit_uw)(memw & 0x3) << 6)));
+			memw = 0;
+			mem = SLJIT_MEM1(TMP_REG1);
+		} else if (memw < -0xff) {
+			/* Zero value can be included in the first case. */
+			if ((-memw & 0xfff) <= SSIZE_OF(sw))
+				tmp = (sljit_uw)((-memw + 0x7ff) & ~0x7ff);
+			else
+				tmp = (sljit_uw)((-memw + 0xfff) & ~0xfff);
+
+			SLJIT_ASSERT(tmp >= (sljit_uw)-memw);
+			imm = get_imm(tmp);
+
+			if (imm != INVALID_IMM) {
+				FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(mem & REG_MASK) | imm));
+				memw += (sljit_sw)tmp;
+				SLJIT_ASSERT(memw >= 0 && memw <= 0xfff - SSIZE_OF(sw));
+			} else {
+				FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)memw));
+				FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG1, mem & REG_MASK)));
+				memw = 0;
+			}
+
+			mem = SLJIT_MEM1(TMP_REG1);
+		} else if (memw >= (0x1000 - SSIZE_OF(sw))) {
+			if ((memw & 0xfff) >= (0x1000 - SSIZE_OF(sw))) {
+				imm = get_imm((sljit_uw)((memw + 0x1000) & ~0xfff));
+
+				if (imm != INVALID_IMM)
+					memw = (memw & 0xfff) - 0x1000;
+			} else {
+				imm = get_imm((sljit_uw)(memw & ~0xfff));
+
+				if (imm != INVALID_IMM)
+					memw &= 0xfff;
+			}
+
+			if (imm != INVALID_IMM) {
+				SLJIT_ASSERT(memw >= -0xff && memw <= 0xfff);
+				FAIL_IF(push_inst32(compiler, ADD_WI | RD4(TMP_REG1) | RN4(mem & REG_MASK) | imm));
+			} else {
+				FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)memw));
+				FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG1, mem & REG_MASK)));
+				memw = 0;
+			}
+
+			mem = SLJIT_MEM1(TMP_REG1);
+		}
+
+		flags = WORD_SIZE;
+
+		SLJIT_ASSERT(memw <= 0xfff - SSIZE_OF(sw) && memw >= -0xff);
+
+		if (type & SLJIT_MEM_STORE) {
+			flags |= STORE;
+		} else if (REG_PAIR_FIRST(reg) == (mem & REG_MASK)) {
+			FAIL_IF(emit_op_mem(compiler, WORD_SIZE, REG_PAIR_SECOND(reg), mem, memw + SSIZE_OF(sw), TMP_REG2));
+			return emit_op_mem(compiler, WORD_SIZE, REG_PAIR_FIRST(reg), mem, memw, TMP_REG2);
+		}
+
+		FAIL_IF(emit_op_mem(compiler, flags, REG_PAIR_FIRST(reg), mem, memw, TMP_REG2));
+		return emit_op_mem(compiler, flags, REG_PAIR_SECOND(reg), mem, memw + SSIZE_OF(sw), TMP_REG2);
+	}
+
+	flags = 1 << 23;
+
+	if ((mem & REG_MASK) == 0) {
+		tmp = (sljit_uw)(memw & 0x7fc);
+		imm = get_imm((sljit_uw)((memw + (tmp <= 0x400 ? 0 : 0x400)) & ~0x3fc));
+
+		if (imm == INVALID_IMM) {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)memw));
+			memw = 0;
+		} else {
+			FAIL_IF(push_inst32(compiler, MOV_WI | RD4(TMP_REG1) | imm));
+			memw = (memw & 0x3fc) >> 2;
+
+			if (tmp > 0x400) {
+				memw = 0x100 - memw;
+				flags = 0;
+			}
+
+			SLJIT_ASSERT(memw >= 0 && memw <= 0xff);
+		}
+
+		mem = SLJIT_MEM1(TMP_REG1);
+	} else if (mem & OFFS_REG_MASK) {
+		FAIL_IF(push_inst32(compiler, ADD_W | RD4(TMP_REG1) | RN4(mem & REG_MASK) | RM4(OFFS_REG(mem)) | ((sljit_uw)(memw & 0x3) << 6)));
+		memw = 0;
+		mem = SLJIT_MEM1(TMP_REG1);
+	} else if (memw < 0) {
+		if ((-memw & ~0x3fc) == 0) {
+			flags = 0;
+			memw = -memw >> 2;
+		} else {
+			tmp = (sljit_uw)(-memw & 0x7fc);
+			imm = get_imm((sljit_uw)((-memw + (tmp <= 0x400 ? 0 : 0x400)) & ~0x3fc));
+
+			if (imm != INVALID_IMM) {
+				FAIL_IF(push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(mem & REG_MASK) | imm));
+				memw = (-memw & 0x3fc) >> 2;
+
+				if (tmp <= 0x400)
+					flags = 0;
+				else
+					memw = 0x100 - memw;
+			} else {
+				FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)memw));
+				FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG1, mem & REG_MASK)));
+				memw = 0;
+			}
+
+			mem = SLJIT_MEM1(TMP_REG1);
+		}
+	} else if ((memw & ~0x3fc) != 0) {
+		tmp = (sljit_uw)(memw & 0x7fc);
+		imm = get_imm((sljit_uw)((memw + (tmp <= 0x400 ? 0 : 0x400)) & ~0x3fc));
+
+		if (imm != INVALID_IMM) {
+			FAIL_IF(push_inst32(compiler, ADD_WI | RD4(TMP_REG1) | RN4(mem & REG_MASK) | imm));
+			memw = (memw & 0x3fc) >> 2;
+
+			if (tmp > 0x400) {
+				memw = 0x100 - memw;
+				flags = 0;
+			}
+		} else {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_uw)memw));
+			FAIL_IF(push_inst16(compiler, ADD | SET_REGS44(TMP_REG1, mem & REG_MASK)));
+			memw = 0;
+		}
+
+		mem = SLJIT_MEM1(TMP_REG1);
+	} else
+		memw >>= 2;
+
+	SLJIT_ASSERT(memw >= 0 && memw <= 0xff);
+	return push_inst32(compiler, ((type & SLJIT_MEM_STORE) ? STRD : LDRD) | (sljit_ins)flags | RN4(mem & REG_MASK) | RT4(REG_PAIR_FIRST(reg)) | RD4(REG_PAIR_SECOND(reg)) | (sljit_ins)memw);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags;
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
+
 	if ((mem & OFFS_REG_MASK) || (memw > 255 || memw < -255))
 		return SLJIT_ERR_UNSUPPORTED;
 
@@ -2583,7 +2979,7 @@
 
 	inst = sljit_mem32[flags] | 0x900;
 
-	if (type & SLJIT_MEM_PRE)
+	if (!(type & SLJIT_MEM_POST))
 		inst |= 0x400;
 
 	if (memw >= 0)
@@ -2594,6 +2990,106 @@
 	return push_inst32(compiler, inst | RT4(reg) | RN4(mem & REG_MASK) | (sljit_ins)memw);
 }
 
+static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem, sljit_sw *memw, sljit_s32 max_offset)
+{
+	sljit_s32 arg = *mem;
+	sljit_sw argw = *memw;
+	sljit_uw imm;
+
+	*mem = TMP_REG1;
+
+	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+		*memw = 0;
+		return push_inst32(compiler, ADD_W | RD4(TMP_REG1) | RN4(arg & REG_MASK) | RM4(OFFS_REG(arg)) | ((sljit_uw)(argw & 0x3) << 6));
+	}
+
+	arg &= REG_MASK;
+
+	if (arg) {
+		if (argw <= max_offset && argw >= -0xff) {
+			*mem = arg;
+			return SLJIT_SUCCESS;
+		}
+
+		if (argw < 0) {
+			imm = get_imm((sljit_uw)(-argw & ~0xff));
+
+			if (imm) {
+				*memw = -(-argw & 0xff);
+				return push_inst32(compiler, SUB_WI | RD4(TMP_REG1) | RN4(arg) | imm);
+			}
+		} else if ((argw & 0xfff) <= max_offset) {
+			imm = get_imm((sljit_uw)(argw & ~0xfff));
+
+			if (imm) {
+				*memw = argw & 0xfff;
+				return push_inst32(compiler, ADD_WI | RD4(TMP_REG1) | RN4(arg) | imm);
+			}
+		} else {
+			imm = get_imm((sljit_uw)((argw | 0xfff) + 1));
+
+			if (imm) {
+				*memw = (argw & 0xfff) - 0x1000;
+				return push_inst32(compiler, ADD_WI | RD4(TMP_REG1) | RN4(arg) | imm);
+			}
+		}
+	}
+
+	imm = (sljit_uw)(argw & ~0xfff);
+
+	if ((argw & 0xfff) > max_offset) {
+		imm += 0x1000;
+		*memw = (argw & 0xfff) - 0x1000;
+	} else
+		*memw = argw & 0xfff;
+
+	FAIL_IF(load_immediate(compiler, TMP_REG1, imm));
+
+	if (arg == 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst16(compiler, ADD | SET_REGS44(TMP_REG1, arg));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+	if (type & SLJIT_MEM_UNALIGNED_32)
+		return emit_fop_mem(compiler, ((type ^ SLJIT_32) & SLJIT_32) | ((type & SLJIT_MEM_STORE) ? 0 : FPU_LOAD), freg, mem, memw);
+
+	if (type & SLJIT_MEM_STORE) {
+		FAIL_IF(push_inst32(compiler, VMOV | (1 << 20) | DN4(freg) | RT4(TMP_REG2)));
+
+		if (type & SLJIT_32)
+			return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, mem, memw, TMP_REG1);
+
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
+		mem |= SLJIT_MEM;
+
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, mem, memw, TMP_REG1));
+		FAIL_IF(push_inst32(compiler, VMOV | (1 << 20) | DN4(freg) | 0x80 | RT4(TMP_REG2)));
+		return emit_op_mem(compiler, WORD_SIZE | STORE, TMP_REG2, mem, memw + 4, TMP_REG1);
+	}
+
+	if (type & SLJIT_32) {
+		FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, mem, memw, TMP_REG1));
+		return push_inst32(compiler, VMOV | DN4(freg) | RT4(TMP_REG2));
+	}
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, 0xfff - 4));
+	mem |= SLJIT_MEM;
+
+	FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG2, mem, memw, TMP_REG1));
+	FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, mem, memw + 4, TMP_REG1));
+	return push_inst32(compiler, VMOV2 | DM4(freg) | RT4(TMP_REG2) | RN4(TMP_REG1));
+}
+
+#undef FPU_LOAD
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
 	struct sljit_const *const_;
diff --git a/src/sljit/sljitNativeMIPS_32.c b/src/sljit/sljitNativeMIPS_32.c
index 1a06b17..e6853c9 100644
--- a/src/sljit/sljitNativeMIPS_32.c
+++ b/src/sljit/sljitNativeMIPS_32.c
@@ -38,383 +38,6 @@
 	return (imm & 0xffff) ? push_inst(compiler, ORI | SA(dst_ar) | TA(dst_ar) | IMM(imm), dst_ar) : SLJIT_SUCCESS;
 }
 
-#define EMIT_LOGICAL(op_imm, op_norm) \
-	if (flags & SRC2_IMM) { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_imm | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_imm | S(src1) | T(dst) | IMM(src2), DR(dst))); \
-	} \
-	else { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_norm | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_norm | S(src1) | T(src2) | D(dst), DR(dst))); \
-	}
-
-#define EMIT_SHIFT(op_imm, op_v) \
-	if (flags & SRC2_IMM) { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_imm | T(src1) | D(dst) | SH_IMM(src2), DR(dst))); \
-	} \
-	else { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | D(dst), DR(dst))); \
-	}
-
-static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
-	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
-{
-	sljit_s32 is_overflow, is_carry, is_handled;
-
-	switch (GET_OPCODE(op)) {
-	case SLJIT_MOV:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if (dst != src2)
-			return push_inst(compiler, ADDU | S(src2) | TA(0) | D(dst), DR(dst));
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U8:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
-			return push_inst(compiler, ANDI | S(src2) | T(dst) | IMM(0xff), DR(dst));
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_S8:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-			return push_inst(compiler, SEB | T(src2) | D(dst), DR(dst));
-#else /* SLJIT_MIPS_REV < 1 */
-			FAIL_IF(push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(24), DR(dst)));
-			return push_inst(compiler, SRA | T(dst) | D(dst) | SH_IMM(24), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U16:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
-			return push_inst(compiler, ANDI | S(src2) | T(dst) | IMM(0xffff), DR(dst));
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_S16:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-			return push_inst(compiler, SEH | T(src2) | D(dst), DR(dst));
-#else /* SLJIT_MIPS_REV < 1 */
-			FAIL_IF(push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(16), DR(dst)));
-			return push_inst(compiler, SRA | T(dst) | D(dst) | SH_IMM(16), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_NOT:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		if (!(flags & UNUSED_DEST))
-			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | D(dst), DR(dst)));
-		return SLJIT_SUCCESS;
-
-	case SLJIT_CLZ:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, CLZ | S(src2) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		if (!(flags & UNUSED_DEST))
-			FAIL_IF(push_inst(compiler, CLZ | S(src2) | T(dst) | D(dst), DR(dst)));
-#else /* SLJIT_MIPS_REV < 1 */
-		if (SLJIT_UNLIKELY(flags & UNUSED_DEST)) {
-			FAIL_IF(push_inst(compiler, SRL | T(src2) | DA(EQUAL_FLAG) | SH_IMM(31), EQUAL_FLAG));
-			return push_inst(compiler, XORI | SA(EQUAL_FLAG) | TA(EQUAL_FLAG) | IMM(1), EQUAL_FLAG);
-		}
-		/* Nearly all instructions are unmovable in the following sequence. */
-		FAIL_IF(push_inst(compiler, ADDU | S(src2) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
-		/* Check zero. */
-		FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG1) | TA(0) | IMM(5), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, ORI | SA(0) | T(dst) | IMM(32), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(dst) | IMM(-1), DR(dst)));
-		/* Loop for searching the highest bit. */
-		FAIL_IF(push_inst(compiler, ADDIU | S(dst) | T(dst) | IMM(1), DR(dst)));
-		FAIL_IF(push_inst(compiler, BGEZ | S(TMP_REG1) | IMM(-2), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, SLL | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(1), UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV >= 1 */
-		return SLJIT_SUCCESS;
-
-	case SLJIT_ADD:
-		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_overflow) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, OR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				else
-					FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			}
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, ADDIU | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-
-			if (is_overflow || is_carry) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, ORI | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-				else {
-					FAIL_IF(push_inst(compiler, ADDIU | SA(0) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-					FAIL_IF(push_inst(compiler, OR | S(src1) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-				}
-			}
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, ADDIU | S(src1) | T(dst) | IMM(src2), DR(dst)));
-		}
-		else {
-			if (is_overflow)
-				FAIL_IF(push_inst(compiler, XOR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, ADDU | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-
-			if (is_overflow || is_carry)
-				FAIL_IF(push_inst(compiler, OR | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, ADDU | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-
-		/* a + b >= a | b (otherwise, the carry should be set to 1). */
-		if (is_overflow || is_carry)
-			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		if (!is_overflow)
-			return SLJIT_SUCCESS;
-		FAIL_IF(push_inst(compiler, SLL | TA(OTHER_FLAG) | D(TMP_REG1) | SH_IMM(31), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, XOR | S(TMP_REG1) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		FAIL_IF(push_inst(compiler, XOR | S(dst) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, ADDU | S(dst) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		return push_inst(compiler, SRL | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG);
-
-	case SLJIT_ADDC:
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_carry) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, ORI | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-				else {
-					FAIL_IF(push_inst(compiler, ADDIU | SA(0) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-					FAIL_IF(push_inst(compiler, OR | S(src1) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				}
-			}
-			FAIL_IF(push_inst(compiler, ADDIU | S(src1) | T(dst) | IMM(src2), DR(dst)));
-		} else {
-			if (is_carry)
-				FAIL_IF(push_inst(compiler, OR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			/* dst may be the same as src1 or src2. */
-			FAIL_IF(push_inst(compiler, ADDU | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-		if (is_carry)
-			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-
-		FAIL_IF(push_inst(compiler, ADDU | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
-		if (!is_carry)
-			return SLJIT_SUCCESS;
-
-		/* Set ULESS_FLAG (dst == 0) && (OTHER_FLAG == 1). */
-		FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		/* Set carry flag. */
-		return push_inst(compiler, OR | SA(OTHER_FLAG) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
-
-	case SLJIT_SUB:
-		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
-			FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
-			src2 = TMP_REG2;
-			flags &= ~SRC2_IMM;
-		}
-
-		is_handled = 0;
-
-		if (flags & SRC2_IMM) {
-			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-				is_handled = 1;
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLTI | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-				is_handled = 1;
-			}
-		}
-
-		if (!is_handled && GET_FLAG_TYPE(op) >= SLJIT_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) {
-			is_handled = 1;
-
-			if (flags & SRC2_IMM) {
-				FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
-				src2 = TMP_REG2;
-				flags &= ~SRC2_IMM;
-			}
-
-			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_GREATER || GET_FLAG_TYPE(op) == SLJIT_LESS_EQUAL)
-			{
-				FAIL_IF(push_inst(compiler, SLTU | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLT | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER || GET_FLAG_TYPE(op) == SLJIT_SIG_LESS_EQUAL)
-			{
-				FAIL_IF(push_inst(compiler, SLT | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-		}
-
-		if (is_handled) {
-			if (flags & SRC2_IMM) {
-				if (op & SLJIT_SET_Z)
-					FAIL_IF(push_inst(compiler, ADDIU | S(src1) | TA(EQUAL_FLAG) | IMM(-src2), EQUAL_FLAG));
-				if (!(flags & UNUSED_DEST))
-					return push_inst(compiler, ADDIU | S(src1) | T(dst) | IMM(-src2), DR(dst));
-			}
-			else {
-				if (op & SLJIT_SET_Z)
-					FAIL_IF(push_inst(compiler, SUBU | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				if (!(flags & UNUSED_DEST))
-					return push_inst(compiler, SUBU | S(src1) | T(src2) | D(dst), DR(dst));
-			}
-			return SLJIT_SUCCESS;
-		}
-
-		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_overflow) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, OR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				else
-					FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			}
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, ADDIU | S(src1) | TA(EQUAL_FLAG) | IMM(-src2), EQUAL_FLAG));
-
-			if (is_overflow || is_carry)
-				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, ADDIU | S(src1) | T(dst) | IMM(-src2), DR(dst)));
-		}
-		else {
-			if (is_overflow)
-				FAIL_IF(push_inst(compiler, XOR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, SUBU | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-
-			if (is_overflow || is_carry)
-				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, SUBU | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-
-		if (!is_overflow)
-			return SLJIT_SUCCESS;
-		FAIL_IF(push_inst(compiler, SLL | TA(OTHER_FLAG) | D(TMP_REG1) | SH_IMM(31), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, XOR | S(TMP_REG1) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		FAIL_IF(push_inst(compiler, XOR | S(dst) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, ADDU | S(dst) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		return push_inst(compiler, SRL | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG);
-
-	case SLJIT_SUBC:
-		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
-			FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
-			src2 = TMP_REG2;
-			flags &= ~SRC2_IMM;
-		}
-
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_carry)
-				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-			/* dst may be the same as src1 or src2. */
-			FAIL_IF(push_inst(compiler, ADDIU | S(src1) | T(dst) | IMM(-src2), DR(dst)));
-		}
-		else {
-			if (is_carry)
-				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			/* dst may be the same as src1 or src2. */
-			FAIL_IF(push_inst(compiler, SUBU | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-
-		if (is_carry)
-			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
-
-		FAIL_IF(push_inst(compiler, SUBU | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
-		return (is_carry) ? push_inst(compiler, OR | SA(EQUAL_FLAG) | T(TMP_REG1) | DA(OTHER_FLAG), OTHER_FLAG) : SLJIT_SUCCESS;
-
-	case SLJIT_MUL:
-		SLJIT_ASSERT(!(flags & SRC2_IMM));
-
-		if (GET_FLAG_TYPE(op) != SLJIT_OVERFLOW) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-			return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));
-#else /* SLJIT_MIPS_REV < 1 */
-			FAIL_IF(push_inst(compiler, MULT | S(src1) | T(src2), MOVABLE_INS));
-			return push_inst(compiler, MFLO | D(dst), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
-		}
-
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
-		FAIL_IF(push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst)));
-		FAIL_IF(push_inst(compiler, MUH | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-#else /* SLJIT_MIPS_REV < 6 */
-		FAIL_IF(push_inst(compiler, MULT | S(src1) | T(src2), MOVABLE_INS));
-		FAIL_IF(push_inst(compiler, MFHI | DA(EQUAL_FLAG), EQUAL_FLAG));
-		FAIL_IF(push_inst(compiler, MFLO | D(dst), DR(dst)));
-#endif /* SLJIT_MIPS_REV >= 6 */
-		FAIL_IF(push_inst(compiler, SRA | T(dst) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG));
-		return push_inst(compiler, SUBU | SA(EQUAL_FLAG) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
-
-	case SLJIT_AND:
-		EMIT_LOGICAL(ANDI, AND);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_OR:
-		EMIT_LOGICAL(ORI, OR);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_XOR:
-		EMIT_LOGICAL(XORI, XOR);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_SHL:
-		EMIT_SHIFT(SLL, SLLV);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_LSHR:
-		EMIT_SHIFT(SRL, SRLV);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_ASHR:
-		EMIT_SHIFT(SRA, SRAV);
-		return SLJIT_SUCCESS;
-	}
-
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
-}
-
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value)
 {
 	FAIL_IF(push_inst(compiler, LUI | T(dst) | IMM(init_value >> 16), DR(dst)));
@@ -573,8 +196,8 @@
 	sljit_s32 arg_types)
 {
 	struct sljit_jump *jump;
-	sljit_u32 extra_space = (sljit_u32)type;
-	sljit_ins ins;
+	sljit_u32 extra_space = 0;
+	sljit_ins ins = NOP;
 
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
@@ -583,14 +206,23 @@
 	PTR_FAIL_IF(!jump);
 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
 
-	PTR_FAIL_IF(call_with_args(compiler, arg_types, &ins, &extra_space));
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG) {
+		extra_space = (sljit_u32)type;
+		PTR_FAIL_IF(call_with_args(compiler, arg_types, &ins, &extra_space));
+	} else if (type & SLJIT_CALL_RETURN)
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0, &ins));
 
 	SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
 
-	PTR_FAIL_IF(emit_const(compiler, PIC_ADDR_REG, 0));
+	if (ins == NOP && compiler->delay_slot != UNMOVABLE_INS)
+		jump->flags |= IS_MOVABLE;
 
 	if (!(type & SLJIT_CALL_RETURN) || extra_space > 0) {
-		jump->flags |= IS_JAL | IS_CALL;
+		jump->flags |= IS_JAL;
+
+		if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+			jump->flags |= IS_CALL;
+
 		PTR_FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
 	} else
 		PTR_FAIL_IF(push_inst(compiler, JR | S(PIC_ADDR_REG), UNMOVABLE_INS));
@@ -598,6 +230,9 @@
 	jump->addr = compiler->size;
 	PTR_FAIL_IF(push_inst(compiler, ins, UNMOVABLE_INS));
 
+	/* Maximum number of instructions required for generating a constant. */
+	compiler->size += 2;
+
 	if (extra_space == 0)
 		return jump;
 
@@ -623,16 +258,37 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
 
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
+		src = PIC_ADDR_REG;
+		srcw = 0;
+	}
+
+	if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
+		if (type & SLJIT_CALL_RETURN) {
+			if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+				FAIL_IF(push_inst(compiler, ADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
+				src = PIC_ADDR_REG;
+				srcw = 0;
+			}
+
+			FAIL_IF(emit_stack_frame_release(compiler, 0, &ins));
+
+			if (ins != NOP)
+				FAIL_IF(push_inst(compiler, ins, MOVABLE_INS));
+		}
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_ijump(compiler, type, src, srcw);
+	}
+
 	SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
 
 	if (src & SLJIT_IMM)
 		FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
-	else if (FAST_IS_REG(src))
+	else if (src != PIC_ADDR_REG)
 		FAIL_IF(push_inst(compiler, ADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
-	else if (src & SLJIT_MEM) {
-		ADJUST_LOCAL_OFFSET(src, srcw);
-		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
-	}
 
 	FAIL_IF(call_with_args(compiler, arg_types, &ins, &extra_space));
 
diff --git a/src/sljit/sljitNativeMIPS_64.c b/src/sljit/sljitNativeMIPS_64.c
index c2b3d83..d2a5924 100644
--- a/src/sljit/sljitNativeMIPS_64.c
+++ b/src/sljit/sljitNativeMIPS_64.c
@@ -118,421 +118,6 @@
 	return !(imm & 0xffff) ? SLJIT_SUCCESS : push_inst(compiler, ORI | SA(dst_ar) | TA(dst_ar) | IMM(imm), dst_ar);
 }
 
-#define SELECT_OP(a, b) \
-	(!(op & SLJIT_32) ? a : b)
-
-#define EMIT_LOGICAL(op_imm, op_norm) \
-	if (flags & SRC2_IMM) { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_imm | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_imm | S(src1) | T(dst) | IMM(src2), DR(dst))); \
-	} \
-	else { \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, op_norm | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, op_norm | S(src1) | T(src2) | D(dst), DR(dst))); \
-	}
-
-#define EMIT_SHIFT(op_dimm, op_dimm32, op_imm, op_dv, op_v) \
-	if (flags & SRC2_IMM) { \
-		if (src2 >= 32) { \
-			SLJIT_ASSERT(!(op & SLJIT_32)); \
-			ins = op_dimm32; \
-			src2 -= 32; \
-		} \
-		else \
-			ins = (op & SLJIT_32) ? op_imm : op_dimm; \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, ins | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, ins | T(src1) | D(dst) | SH_IMM(src2), DR(dst))); \
-	} \
-	else { \
-		ins = (op & SLJIT_32) ? op_v : op_dv; \
-		if (op & SLJIT_SET_Z) \
-			FAIL_IF(push_inst(compiler, ins | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
-		if (!(flags & UNUSED_DEST)) \
-			FAIL_IF(push_inst(compiler, ins | S(src2) | T(src1) | D(dst), DR(dst))); \
-	}
-
-static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
-	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
-{
-	sljit_ins ins;
-	sljit_s32 is_overflow, is_carry, is_handled;
-
-	switch (GET_OPCODE(op)) {
-	case SLJIT_MOV:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if (dst != src2)
-			return push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src2) | TA(0) | D(dst), DR(dst));
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U8:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
-			return push_inst(compiler, ANDI | S(src2) | T(dst) | IMM(0xff), DR(dst));
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_S8:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-			if (op & SLJIT_32)
-				return push_inst(compiler, SEB | T(src2) | D(dst), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
-			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(24), DR(dst)));
-			return push_inst(compiler, DSRA32 | T(dst) | D(dst) | SH_IMM(24), DR(dst));
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U16:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
-			return push_inst(compiler, ANDI | S(src2) | T(dst) | IMM(0xffff), DR(dst));
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_S16:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-			if (op & SLJIT_32)
-				return push_inst(compiler, SEH | T(src2) | D(dst), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
-			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(16), DR(dst)));
-			return push_inst(compiler, DSRA32 | T(dst) | D(dst) | SH_IMM(16), DR(dst));
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U32:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM) && !(op & SLJIT_32));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
-			if (dst == src2)
-				return push_inst(compiler, DINSU | T(src2) | SA(0) | (31 << 11) | (0 << 11), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 2 */
-			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(0), DR(dst)));
-			return push_inst(compiler, DSRL32 | T(dst) | D(dst) | SH_IMM(0), DR(dst));
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_S32:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM) && !(op & SLJIT_32));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-			return push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(0), DR(dst));
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_NOT:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		if (!(flags & UNUSED_DEST))
-			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | D(dst), DR(dst)));
-		return SLJIT_SUCCESS;
-
-	case SLJIT_CLZ:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		if (!(flags & UNUSED_DEST))
-			FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | T(dst) | D(dst), DR(dst)));
-#else /* SLJIT_MIPS_REV < 1 */
-		if (SLJIT_UNLIKELY(flags & UNUSED_DEST)) {
-			FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(src2) | DA(EQUAL_FLAG) | SH_IMM(31), EQUAL_FLAG));
-			return push_inst(compiler, XORI | SA(EQUAL_FLAG) | TA(EQUAL_FLAG) | IMM(1), EQUAL_FLAG);
-		}
-		/* Nearly all instructions are unmovable in the following sequence. */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src2) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
-		/* Check zero. */
-		FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG1) | TA(0) | IMM(5), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, ORI | SA(0) | T(dst) | IMM((op & SLJIT_32) ? 32 : 64), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | T(dst) | IMM(-1), DR(dst)));
-		/* Loop for searching the highest bit. */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(dst) | T(dst) | IMM(1), DR(dst)));
-		FAIL_IF(push_inst(compiler, BGEZ | S(TMP_REG1) | IMM(-2), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSLL, SLL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(1), UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV >= 1 */
-		return SLJIT_SUCCESS;
-
-	case SLJIT_ADD:
-		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_overflow) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, OR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				else
-					FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			}
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-
-			if (is_overflow || is_carry) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, ORI | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-				else {
-					FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-					FAIL_IF(push_inst(compiler, OR | S(src1) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-				}
-			}
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(src2), DR(dst)));
-		}
-		else {
-			if (is_overflow)
-				FAIL_IF(push_inst(compiler, XOR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-
-			if (is_overflow || is_carry)
-				FAIL_IF(push_inst(compiler, OR | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-
-		/* a + b >= a | b (otherwise, the carry should be set to 1). */
-		if (is_overflow || is_carry)
-			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		if (!is_overflow)
-			return SLJIT_SUCCESS;
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSLL32, SLL) | TA(OTHER_FLAG) | D(TMP_REG1) | SH_IMM(31), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, XOR | S(TMP_REG1) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		FAIL_IF(push_inst(compiler, XOR | S(dst) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(dst) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		return push_inst(compiler, SELECT_OP(DSRL32, SRL) | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG);
-
-	case SLJIT_ADDC:
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_carry) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, ORI | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-				else {
-					FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-					FAIL_IF(push_inst(compiler, OR | S(src1) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				}
-			}
-			FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(src2), DR(dst)));
-		} else {
-			if (is_carry)
-				FAIL_IF(push_inst(compiler, OR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			/* dst may be the same as src1 or src2. */
-			FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-		if (is_carry)
-			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-
-		FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
-		if (!is_carry)
-			return SLJIT_SUCCESS;
-
-		/* Set ULESS_FLAG (dst == 0) && (OTHER_FLAG == 1). */
-		FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		/* Set carry flag. */
-		return push_inst(compiler, OR | SA(OTHER_FLAG) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
-
-	case SLJIT_SUB:
-		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
-			FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
-			src2 = TMP_REG2;
-			flags &= ~SRC2_IMM;
-		}
-
-		is_handled = 0;
-
-		if (flags & SRC2_IMM) {
-			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-				is_handled = 1;
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLTI | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-				is_handled = 1;
-			}
-		}
-
-		if (!is_handled && GET_FLAG_TYPE(op) >= SLJIT_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) {
-			is_handled = 1;
-
-			if (flags & SRC2_IMM) {
-				FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
-				src2 = TMP_REG2;
-				flags &= ~SRC2_IMM;
-			}
-
-			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_GREATER || GET_FLAG_TYPE(op) == SLJIT_LESS_EQUAL)
-			{
-				FAIL_IF(push_inst(compiler, SLTU | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
-				FAIL_IF(push_inst(compiler, SLT | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER || GET_FLAG_TYPE(op) == SLJIT_SIG_LESS_EQUAL)
-			{
-				FAIL_IF(push_inst(compiler, SLT | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
-			}
-		}
-
-		if (is_handled) {
-			if (flags & SRC2_IMM) {
-				if (op & SLJIT_SET_Z)
-					FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | TA(EQUAL_FLAG) | IMM(-src2), EQUAL_FLAG));
-				if (!(flags & UNUSED_DEST))
-					return push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(-src2), DR(dst));
-			}
-			else {
-				if (op & SLJIT_SET_Z)
-					FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				if (!(flags & UNUSED_DEST))
-					return push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | D(dst), DR(dst));
-			}
-			return SLJIT_SUCCESS;
-		}
-
-		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_overflow) {
-				if (src2 >= 0)
-					FAIL_IF(push_inst(compiler, OR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-				else
-					FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			}
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | TA(EQUAL_FLAG) | IMM(-src2), EQUAL_FLAG));
-
-			if (is_overflow || is_carry)
-				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(-src2), DR(dst)));
-		}
-		else {
-			if (is_overflow)
-				FAIL_IF(push_inst(compiler, XOR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			else if (op & SLJIT_SET_Z)
-				FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-
-			if (is_overflow || is_carry)
-				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
-			/* dst may be the same as src1 or src2. */
-			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
-				FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-
-		if (!is_overflow)
-			return SLJIT_SUCCESS;
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSLL32, SLL) | TA(OTHER_FLAG) | D(TMP_REG1) | SH_IMM(31), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, XOR | S(TMP_REG1) | TA(EQUAL_FLAG) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		FAIL_IF(push_inst(compiler, XOR | S(dst) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
-		if (op & SLJIT_SET_Z)
-			FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(dst) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
-		return push_inst(compiler, SELECT_OP(DSRL32, SRL) | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG);
-
-	case SLJIT_SUBC:
-		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
-			FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
-			src2 = TMP_REG2;
-			flags &= ~SRC2_IMM;
-		}
-
-		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
-
-		if (flags & SRC2_IMM) {
-			if (is_carry)
-				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
-			/* dst may be the same as src1 or src2. */
-			FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(-src2), DR(dst)));
-		}
-		else {
-			if (is_carry)
-				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-			/* dst may be the same as src1 or src2. */
-			FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | D(dst), DR(dst)));
-		}
-
-		if (is_carry)
-			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
-
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
-		return (is_carry) ? push_inst(compiler, OR | SA(EQUAL_FLAG) | T(TMP_REG1) | DA(OTHER_FLAG), OTHER_FLAG) : SLJIT_SUCCESS;
-
-	case SLJIT_MUL:
-		SLJIT_ASSERT(!(flags & SRC2_IMM));
-
-		if (GET_FLAG_TYPE(op) != SLJIT_OVERFLOW) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
-			return push_inst(compiler, SELECT_OP(DMUL, MUL) | S(src1) | T(src2) | D(dst), DR(dst));
-#elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
-			if (op & SLJIT_32)
-				return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));
-			FAIL_IF(push_inst(compiler, DMULT | S(src1) | T(src2), MOVABLE_INS));
-			return push_inst(compiler, MFLO | D(dst), DR(dst));
-#else /* SLJIT_MIPS_REV < 1 */
-			FAIL_IF(push_inst(compiler, SELECT_OP(DMULT, MULT) | S(src1) | T(src2), MOVABLE_INS));
-			return push_inst(compiler, MFLO | D(dst), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 6 */
-		}
-
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
-		FAIL_IF(push_inst(compiler, SELECT_OP(DMUL, MUL) | S(src1) | T(src2) | D(dst), DR(dst)));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DMUH, MUH) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
-#else /* SLJIT_MIPS_REV < 6 */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DMULT, MULT) | S(src1) | T(src2), MOVABLE_INS));
-		FAIL_IF(push_inst(compiler, MFHI | DA(EQUAL_FLAG), EQUAL_FLAG));
-		FAIL_IF(push_inst(compiler, MFLO | D(dst), DR(dst)));
-#endif /* SLJIT_MIPS_REV >= 6 */
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSRA32, SRA) | T(dst) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG));
-		return push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(EQUAL_FLAG) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
-
-	case SLJIT_AND:
-		EMIT_LOGICAL(ANDI, AND);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_OR:
-		EMIT_LOGICAL(ORI, OR);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_XOR:
-		EMIT_LOGICAL(XORI, XOR);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_SHL:
-		EMIT_SHIFT(DSLL, DSLL32, SLL, DSLLV, SLLV);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_LSHR:
-		EMIT_SHIFT(DSRL, DSRL32, SRL, DSRLV, SRLV);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_ASHR:
-		EMIT_SHIFT(DSRA, DSRA32, SRA, DSRAV, SRAV);
-		return SLJIT_SUCCESS;
-	}
-
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
-}
-
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value)
 {
 	FAIL_IF(push_inst(compiler, LUI | T(dst) | IMM(init_value >> 48), DR(dst)));
@@ -653,14 +238,20 @@
 	if (type & SLJIT_CALL_RETURN)
 		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0, &ins));
 
-	PTR_FAIL_IF(call_with_args(compiler, arg_types, &ins));
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		PTR_FAIL_IF(call_with_args(compiler, arg_types, &ins));
 
 	SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
 
-	PTR_FAIL_IF(emit_const(compiler, PIC_ADDR_REG, 0));
+	if (ins == NOP && compiler->delay_slot != UNMOVABLE_INS)
+		jump->flags |= IS_MOVABLE;
 
 	if (!(type & SLJIT_CALL_RETURN)) {
-		jump->flags |= IS_JAL | IS_CALL;
+		jump->flags |= IS_JAL;
+
+		if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+			jump->flags |= IS_CALL;
+
 		PTR_FAIL_IF(push_inst(compiler, JALR | S(PIC_ADDR_REG) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
 	} else
 		PTR_FAIL_IF(push_inst(compiler, JR | S(PIC_ADDR_REG), UNMOVABLE_INS));
@@ -668,6 +259,8 @@
 	jump->addr = compiler->size;
 	PTR_FAIL_IF(push_inst(compiler, ins, UNMOVABLE_INS));
 
+	/* Maximum number of instructions required for generating a constant. */
+	compiler->size += 6;
 	return jump;
 }
 
@@ -680,16 +273,37 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
 
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
+		src = PIC_ADDR_REG;
+		srcw = 0;
+	}
+
+	if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
+		if (type & SLJIT_CALL_RETURN) {
+			if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+				FAIL_IF(push_inst(compiler, DADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
+				src = PIC_ADDR_REG;
+				srcw = 0;
+			}
+
+			FAIL_IF(emit_stack_frame_release(compiler, 0, &ins));
+
+			if (ins != NOP)
+				FAIL_IF(push_inst(compiler, ins, MOVABLE_INS));
+		}
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_ijump(compiler, type, src, srcw);
+	}
+
 	SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
 
 	if (src & SLJIT_IMM)
 		FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
-	else if (FAST_IS_REG(src))
+	else if (src != PIC_ADDR_REG)
 		FAIL_IF(push_inst(compiler, DADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
-	else if (src & SLJIT_MEM) {
-		ADJUST_LOCAL_OFFSET(src, srcw);
-		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
-	}
 
 	if (type & SLJIT_CALL_RETURN)
 		FAIL_IF(emit_stack_frame_release(compiler, 0, &ins));
diff --git a/src/sljit/sljitNativeMIPS_common.c b/src/sljit/sljitNativeMIPS_common.c
index be5cb22..9afe901 100644
--- a/src/sljit/sljitNativeMIPS_common.c
+++ b/src/sljit/sljitNativeMIPS_common.c
@@ -42,6 +42,14 @@
 	return "MIPS64-R6" SLJIT_CPUINFO;
 #endif /* SLJIT_CONFIG_MIPS_32 */
 
+#elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	return "MIPS32-R2" SLJIT_CPUINFO;
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	return "MIPS64-R2" SLJIT_CPUINFO;
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
 #elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -151,12 +159,18 @@
 #define BREAK		(HI(0) | LO(13))
 #define CFC1		(HI(17) | (2 << 21))
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+#define C_EQ_S		(HI(17) | CMP_FMT_S | LO(2))
+#define C_OLE_S		(HI(17) | CMP_FMT_S | LO(6))
+#define C_OLT_S		(HI(17) | CMP_FMT_S | LO(4))
 #define C_UEQ_S		(HI(17) | CMP_FMT_S | LO(3))
 #define C_ULE_S		(HI(17) | CMP_FMT_S | LO(7))
 #define C_ULT_S		(HI(17) | CMP_FMT_S | LO(5))
 #define C_UN_S		(HI(17) | CMP_FMT_S | LO(1))
 #define C_FD		(FD(TMP_FREG3))
 #else /* SLJIT_MIPS_REV < 6 */
+#define C_EQ_S		(HI(17) | FMT_S | LO(50))
+#define C_OLE_S		(HI(17) | FMT_S | LO(54))
+#define C_OLT_S		(HI(17) | FMT_S | LO(52))
 #define C_UEQ_S		(HI(17) | FMT_S | LO(51))
 #define C_ULE_S		(HI(17) | FMT_S | LO(55))
 #define C_ULT_S		(HI(17) | FMT_S | LO(53))
@@ -187,6 +201,9 @@
 #endif /* SLJIT_MIPS_REV >= 6 */
 #define DIV_S		(HI(17) | FMT_S | LO(3))
 #define DINSU		(HI(31) | LO(6))
+#define DROTR		(HI(0) | (1 << 21) | LO(58))
+#define DROTR32		(HI(0) | (1 << 21) | LO(62))
+#define DROTRV		(HI(0) | (1 << 6) | LO(22))
 #define DSLL		(HI(0) | LO(56))
 #define DSLL32		(HI(0) | LO(60))
 #define DSLLV		(HI(0) | LO(20))
@@ -206,9 +223,13 @@
 #define JR		(HI(0) | LO(8))
 #endif /* SLJIT_MIPS_REV >= 6 */
 #define LD		(HI(55))
+#define LDL		(HI(26))
+#define LDR		(HI(27))
 #define LDC1		(HI(53))
 #define LUI		(HI(15))
 #define LW		(HI(35))
+#define LWL		(HI(34))
+#define LWR		(HI(38))
 #define LWC1		(HI(49))
 #define MFC1		(HI(17))
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
@@ -235,7 +256,11 @@
 #define NOR		(HI(0) | LO(39))
 #define OR		(HI(0) | LO(37))
 #define ORI		(HI(13))
+#define ROTR		(HI(0) | (1 << 21) | LO(2))
+#define ROTRV		(HI(0) | (1 << 6) | LO(6))
 #define SD		(HI(63))
+#define SDL		(HI(44))
+#define SDR		(HI(45))
 #define SDC1		(HI(61))
 #define SLT		(HI(0) | LO(42))
 #define SLTI		(HI(10))
@@ -250,6 +275,8 @@
 #define SUB_S		(HI(17) | FMT_S | LO(1))
 #define SUBU		(HI(0) | LO(35))
 #define SW		(HI(43))
+#define SWL		(HI(42))
+#define SWR		(HI(46))
 #define SWC1		(HI(57))
 #define TRUNC_W_S	(HI(17) | FMT_S | LO(13))
 #define XOR		(HI(0) | LO(38))
@@ -277,12 +304,18 @@
 #define ADDU_W		ADDU
 #define ADDIU_W		ADDIU
 #define SLL_W		SLL
+#define SRA_W		SRA
 #define SUBU_W		SUBU
+#define STORE_W		SW
+#define LOAD_W		LW
 #else
 #define ADDU_W		DADDU
 #define ADDIU_W		DADDIU
 #define SLL_W		DSLL
+#define SRA_W		DSRA
 #define SUBU_W		DSUBU
+#define STORE_W		SD
+#define LOAD_W		LD
 #endif
 
 #define SIMM_MAX	(0x7fff)
@@ -315,19 +348,21 @@
 	return (1 << 16);
 }
 
-static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code, sljit_sw executable_offset)
+static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_ins *code, sljit_sw executable_offset)
 {
 	sljit_sw diff;
 	sljit_uw target_addr;
 	sljit_ins *inst;
 	sljit_ins saved_inst;
 
+	inst = (sljit_ins *)jump->addr;
+
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 	if (jump->flags & (SLJIT_REWRITABLE_JUMP | IS_CALL))
-		return code_ptr;
+		goto exit;
 #else
 	if (jump->flags & SLJIT_REWRITABLE_JUMP)
-		return code_ptr;
+		goto exit;
 #endif
 
 	if (jump->flags & JUMP_ADDR)
@@ -337,13 +372,12 @@
 		target_addr = (sljit_uw)(code + jump->u.label->size) + (sljit_uw)executable_offset;
 	}
 
-	inst = (sljit_ins *)jump->addr;
 	if (jump->flags & IS_COND)
 		inst--;
 
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 	if (jump->flags & IS_CALL)
-		goto keep_address;
+		goto preserve_addr;
 #endif
 
 	/* B instructions. */
@@ -364,15 +398,14 @@
 			jump->addr -= 2 * sizeof(sljit_ins);
 			return inst;
 		}
-	}
-	else {
+	} else {
 		diff = ((sljit_sw)target_addr - (sljit_sw)(inst + 1) - executable_offset) >> 2;
 		if (diff <= SIMM_MAX && diff >= SIMM_MIN) {
 			jump->flags |= PATCH_B;
 
 			if (!(jump->flags & IS_COND)) {
 				inst[0] = (jump->flags & IS_JAL) ? BAL : B;
-				inst[1] = NOP;
+				/* Keep inst[1] */
 				return inst + 1;
 			}
 			inst[0] ^= invert_branch(jump->flags);
@@ -415,36 +448,46 @@
 		if ((target_addr & ~(sljit_uw)0xfffffff) == ((jump->addr + sizeof(sljit_ins)) & ~(sljit_uw)0xfffffff)) {
 			jump->flags |= PATCH_J;
 			inst[0] = (jump->flags & IS_JAL) ? JAL : J;
-			inst[1] = NOP;
+			/* Keep inst[1] */
 			return inst + 1;
 		}
 	}
 
+	if (jump->flags & IS_COND)
+		inst++;
+
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-keep_address:
+preserve_addr:
 	if (target_addr <= 0x7fffffff) {
 		jump->flags |= PATCH_ABS32;
-		if (jump->flags & IS_COND) {
-			inst[0] -= 4;
-			inst++;
-		}
-		inst[2] = inst[6];
-		inst[3] = inst[7];
+		if (jump->flags & IS_COND)
+			inst[-1] -= 4;
+
+		inst[2] = inst[0];
+		inst[3] = inst[1];
 		return inst + 3;
 	}
 	if (target_addr <= 0x7fffffffffffl) {
 		jump->flags |= PATCH_ABS48;
-		if (jump->flags & IS_COND) {
-			inst[0] -= 2;
-			inst++;
-		}
-		inst[4] = inst[6];
-		inst[5] = inst[7];
+		if (jump->flags & IS_COND)
+			inst[-1] -= 2;
+
+		inst[4] = inst[0];
+		inst[5] = inst[1];
 		return inst + 5;
 	}
 #endif
 
-	return code_ptr;
+exit:
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	inst[2] = inst[0];
+	inst[3] = inst[1];
+	return inst + 3;
+#else
+	inst[6] = inst[0];
+	inst[7] = inst[1];
+	return inst + 7;
+#endif
 }
 
 #ifdef __GNUC__
@@ -459,30 +502,52 @@
 static SLJIT_INLINE sljit_sw put_label_get_length(struct sljit_put_label *put_label, sljit_uw max_label)
 {
 	if (max_label < 0x80000000l) {
-		put_label->flags = 0;
+		put_label->flags = PATCH_ABS32;
 		return 1;
 	}
 
 	if (max_label < 0x800000000000l) {
-		put_label->flags = 1;
+		put_label->flags = PATCH_ABS48;
 		return 3;
 	}
 
-	put_label->flags = 2;
+	put_label->flags = 0;
 	return 5;
 }
 
-static SLJIT_INLINE void put_label_set(struct sljit_put_label *put_label)
-{
-	sljit_uw addr = put_label->label->addr;
-	sljit_ins *inst = (sljit_ins *)put_label->addr;
-	sljit_u32 reg = *inst;
+#endif /* SLJIT_CONFIG_MIPS_64 */
 
-	if (put_label->flags == 0) {
+static SLJIT_INLINE void load_addr_to_reg(void *dst, sljit_u32 reg)
+{
+	struct sljit_jump *jump;
+	struct sljit_put_label *put_label;
+	sljit_uw flags;
+	sljit_ins *inst;
+	sljit_uw addr;
+
+	if (reg != 0) {
+		jump = (struct sljit_jump*)dst;
+		flags = jump->flags;
+		inst = (sljit_ins*)jump->addr;
+		addr = (flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target;
+	} else {
+		put_label = (struct sljit_put_label*)dst;
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+		flags = put_label->flags;
+#endif
+		inst = (sljit_ins*)put_label->addr;
+		addr = put_label->label->addr;
+		reg = *inst;
+	}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	inst[0] = LUI | T(reg) | IMM(addr >> 16);
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	if (flags & PATCH_ABS32) {
 		SLJIT_ASSERT(addr < 0x80000000l);
 		inst[0] = LUI | T(reg) | IMM(addr >> 16);
 	}
-	else if (put_label->flags == 1) {
+	else if (flags & PATCH_ABS48) {
 		SLJIT_ASSERT(addr < 0x800000000000l);
 		inst[0] = LUI | T(reg) | IMM(addr >> 32);
 		inst[1] = ORI | S(reg) | T(reg) | IMM((addr >> 16) & 0xffff);
@@ -497,12 +562,11 @@
 		inst[4] = DSLL | T(reg) | D(reg) | SH_IMM(16);
 		inst += 4;
 	}
+#endif /* SLJIT_CONFIG_MIPS_32 */
 
 	inst[1] = ORI | S(reg) | T(reg) | IMM(addr & 0xffff);
 }
 
-#endif
-
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
 {
 	struct sljit_memory_fragment *buf;
@@ -557,11 +621,12 @@
 				}
 				if (jump && jump->addr == word_count) {
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-					jump->addr = (sljit_uw)(code_ptr - 3);
+					word_count += 2;
 #else
-					jump->addr = (sljit_uw)(code_ptr - 7);
+					word_count += 6;
 #endif
-					code_ptr = detect_jump_type(jump, code_ptr, code, executable_offset);
+					jump->addr = (sljit_uw)(code_ptr - 1);
+					code_ptr = detect_jump_type(jump, code, executable_offset);
 					jump = jump->next;
 				}
 				if (const_ && const_->addr == word_count) {
@@ -571,7 +636,10 @@
 				if (put_label && put_label->addr == word_count) {
 					SLJIT_ASSERT(put_label->label);
 					put_label->addr = (sljit_uw)code_ptr;
-#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+					code_ptr += 1;
+					word_count += 1;
+#else
 					code_ptr += put_label_get_length(put_label, (sljit_uw)(SLJIT_ADD_EXEC_OFFSET(code, executable_offset) + put_label->label->size));
 					word_count += 5;
 #endif
@@ -579,8 +647,8 @@
 				}
 				next_addr = compute_next_addr(label, jump, const_, put_label);
 			}
-			code_ptr ++;
-			word_count ++;
+			code_ptr++;
+			word_count++;
 		} while (buf_ptr < buf_end);
 
 		buf = buf->next;
@@ -617,51 +685,14 @@
 				break;
 			}
 
-			/* Set the fields of immediate loads. */
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-			SLJIT_ASSERT(((buf_ptr[0] | buf_ptr[1]) & 0xffff) == 0);
-			buf_ptr[0] |= (sljit_ins)(addr >> 16) & 0xffff;
-			buf_ptr[1] |= (sljit_ins)addr & 0xffff;
-#else
-			if (jump->flags & PATCH_ABS32) {
-				SLJIT_ASSERT(addr <= 0x7fffffff);
-				SLJIT_ASSERT(((buf_ptr[0] | buf_ptr[1]) & 0xffff) == 0);
-				buf_ptr[0] |= (sljit_ins)(addr >> 16) & 0xffff;
-				buf_ptr[1] |= (sljit_ins)addr & 0xffff;
-				break;
-			}
-
-			if (jump->flags & PATCH_ABS48) {
-				SLJIT_ASSERT(addr <= 0x7fffffffffffl);
-				SLJIT_ASSERT(((buf_ptr[0] | buf_ptr[1] | buf_ptr[3]) & 0xffff) == 0);
-				buf_ptr[0] |= (sljit_ins)(addr >> 32) & 0xffff;
-				buf_ptr[1] |= (sljit_ins)(addr >> 16) & 0xffff;
-				buf_ptr[3] |= (sljit_ins)addr & 0xffff;
-				break;
-			}
-
-			SLJIT_ASSERT(((buf_ptr[0] | buf_ptr[1] | buf_ptr[3] | buf_ptr[5]) & 0xffff) == 0);
-			buf_ptr[0] |= (sljit_ins)(addr >> 48) & 0xffff;
-			buf_ptr[1] |= (sljit_ins)(addr >> 32) & 0xffff;
-			buf_ptr[3] |= (sljit_ins)(addr >> 16) & 0xffff;
-			buf_ptr[5] |= (sljit_ins)addr & 0xffff;
-#endif
+			load_addr_to_reg(jump, PIC_ADDR_REG);
 		} while (0);
 		jump = jump->next;
 	}
 
 	put_label = compiler->put_labels;
 	while (put_label) {
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-		addr = put_label->label->addr;
-		buf_ptr = (sljit_ins *)put_label->addr;
-
-		SLJIT_ASSERT((buf_ptr[0] & 0xffe00000) == LUI && (buf_ptr[1] & 0xfc000000) == ORI);
-		buf_ptr[0] |= (addr >> 16) & 0xffff;
-		buf_ptr[1] |= addr & 0xffff;
-#else
-		put_label_set(put_label);
-#endif
+		load_addr_to_reg(put_label, 0);
 		put_label = put_label->next;
 	}
 
@@ -700,19 +731,29 @@
 #endif
 	case SLJIT_HAS_ZERO_REGISTER:
 		return 1;
-
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
 	case SLJIT_HAS_CLZ:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
-#endif /* SLJIT_MIPS_REV >= 1 */
 
+	case SLJIT_HAS_CTZ:
+		return 2;
+#endif /* SLJIT_MIPS_REV >= 1 */
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	case SLJIT_HAS_ROT:
+		return 1;
+#endif /* SLJIT_MIPS_REV >= 2 */
 	default:
 		return 0;
 	}
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	return (type >= SLJIT_ORDERED_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Entry, exit                                                          */
 /* --------------------------------------------------------------------- */
@@ -747,14 +788,6 @@
 #define SLOW_SRC2	0x20000
 #define SLOW_DEST	0x40000
 
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#define STACK_STORE	SW
-#define STACK_LOAD	LW
-#else
-#define STACK_STORE	SD
-#define STACK_LOAD	LD
-#endif
-
 static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw);
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size, sljit_ins *ins_ptr);
 
@@ -770,13 +803,14 @@
 {
 	sljit_ins base;
 	sljit_s32 i, tmp, offset;
-	sljit_s32 arg_count, word_arg_count, saved_arg_count, float_arg_count;
+	sljit_s32 arg_count, word_arg_count, float_arg_count;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1);
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((local_size & SSIZE_OF(sw)) != 0)
@@ -791,27 +825,27 @@
 #endif
 	compiler->local_size = local_size;
 
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-	tmp = arg_types >> SLJIT_ARG_SHIFT;
-	arg_count = 0;
 	offset = 0;
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	if (!(options & SLJIT_ENTER_REG_ARG)) {
+		tmp = arg_types >> SLJIT_ARG_SHIFT;
+		arg_count = 0;
 
-	while (tmp) {
-		offset = arg_count;
-		if ((tmp & SLJIT_ARG_MASK) == SLJIT_ARG_TYPE_F64) {
-			if ((arg_count & 0x1) != 0)
+		while (tmp) {
+			offset = arg_count;
+			if ((tmp & SLJIT_ARG_MASK) == SLJIT_ARG_TYPE_F64) {
+				if ((arg_count & 0x1) != 0)
+					arg_count++;
 				arg_count++;
+			}
+
 			arg_count++;
+			tmp >>= SLJIT_ARG_SHIFT;
 		}
 
-		arg_count++;
-		tmp >>= SLJIT_ARG_SHIFT;
+		compiler->args_size = (sljit_uw)arg_count << 2;
+		offset = (offset >= 4) ? (offset << 2) : 0;
 	}
-
-	compiler->args_size = (sljit_uw)arg_count << 2;
-	offset = (offset >= 4) ? (offset << 2) : 0;
-#else /* !SLJIT_CONFIG_MIPS_32 */
-	offset = 0;
 #endif /* SLJIT_CONFIG_MIPS_32 */
 
 	if (local_size + offset <= -SIMM_MIN) {
@@ -820,9 +854,9 @@
 		base = S(SLJIT_SP);
 		offset = local_size - SSIZE_OF(sw);
 	} else {
-		FAIL_IF(load_immediate(compiler, DR(OTHER_FLAG), local_size));
+		FAIL_IF(load_immediate(compiler, OTHER_FLAG, local_size));
 		FAIL_IF(push_inst(compiler, ADDU_W | S(SLJIT_SP) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, SUBU_W | S(SLJIT_SP) | T(OTHER_FLAG) | D(SLJIT_SP), DR(SLJIT_SP)));
+		FAIL_IF(push_inst(compiler, SUBU_W | S(SLJIT_SP) | TA(OTHER_FLAG) | D(SLJIT_SP), DR(SLJIT_SP)));
 		base = S(TMP_REG2);
 		offset = -SSIZE_OF(sw);
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -830,17 +864,17 @@
 #endif
 	}
 
-	FAIL_IF(push_inst(compiler, STACK_STORE | base | TA(RETURN_ADDR_REG) | IMM(offset), MOVABLE_INS));
+	FAIL_IF(push_inst(compiler, STORE_W | base | TA(RETURN_ADDR_REG) | IMM(offset), UNMOVABLE_INS));
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_STORE | base | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, STORE_W | base | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_STORE | base | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, STORE_W | base | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -860,10 +894,12 @@
 		FAIL_IF(push_inst(compiler, SDC1 | base | FT(i) | IMM(offset), MOVABLE_INS));
 	}
 
+	if (options & SLJIT_ENTER_REG_ARG)
+		return SLJIT_SUCCESS;
+
 	arg_types >>= SLJIT_ARG_SHIFT;
 	arg_count = 0;
 	word_arg_count = 0;
-	saved_arg_count = 0;
 	float_arg_count = 0;
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -970,7 +1006,7 @@
 	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((local_size & SSIZE_OF(sw)) != 0)
@@ -989,14 +1025,19 @@
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size, sljit_ins *ins_ptr)
 {
 	sljit_s32 local_size, i, tmp, offset;
+	sljit_s32 load_return_addr = (frame_size == 0);
 	sljit_s32 scratches = compiler->scratches;
 	sljit_s32 saveds = compiler->saveds;
 	sljit_s32 fsaveds = compiler->fsaveds;
 	sljit_s32 fscratches = compiler->fscratches;
+	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+
+	SLJIT_ASSERT(frame_size == 1 || (frame_size & 0xf) == 0);
+	frame_size &= ~0xf;
 
 	local_size = compiler->local_size;
 
-	tmp = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	tmp = GET_SAVED_REGISTERS_SIZE(scratches, saveds - kept_saveds_count, 1);
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
 		if ((tmp & SSIZE_OF(sw)) != 0)
@@ -1024,18 +1065,18 @@
 	SLJIT_ASSERT(local_size >= frame_size);
 
 	offset = local_size - SSIZE_OF(sw);
-	if (frame_size == 0)
-		FAIL_IF(push_inst(compiler, STACK_LOAD | S(SLJIT_SP) | TA(RETURN_ADDR_REG) | IMM(offset), RETURN_ADDR_REG));
+	if (load_return_addr)
+		FAIL_IF(push_inst(compiler, LOAD_W | S(SLJIT_SP) | TA(RETURN_ADDR_REG) | IMM(offset), RETURN_ADDR_REG));
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - kept_saveds_count; i > tmp; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_LOAD | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, LOAD_W | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
 		offset -= SSIZE_OF(sw);
-		FAIL_IF(push_inst(compiler, STACK_LOAD | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, LOAD_W | S(SLJIT_SP) | T(i) | IMM(offset), MOVABLE_INS));
 	}
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -1076,8 +1117,38 @@
 	return push_inst(compiler, ins, UNMOVABLE_INS);
 }
 
-#undef STACK_STORE
-#undef STACK_LOAD
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_ins ins;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(PIC_ADDR_REG), src, srcw));
+		src = PIC_ADDR_REG;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, ADDU_W | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
+		src = PIC_ADDR_REG;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1, &ins));
+
+	if (!(src & SLJIT_IMM)) {
+		FAIL_IF(push_inst(compiler, JR | S(src), UNMOVABLE_INS));
+		return push_inst(compiler, ins, UNMOVABLE_INS);
+	}
+
+	if (ins != NOP)
+		FAIL_IF(push_inst(compiler, ins, MOVABLE_INS));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
 
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
@@ -1134,9 +1205,10 @@
 	return 0;
 }
 
+#define TO_ARGW_HI(argw) (((argw) & ~0xffff) + (((argw) & 0x8000) ? 0x10000 : 0))
+
 /* See getput_arg below.
-   Note: can_cache is called only for binary operators. Those
-   operators always uses word arguments without write back. */
+   Note: can_cache is called only for binary operators. */
 static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
 {
 	SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));
@@ -1151,7 +1223,8 @@
 	}
 
 	if (arg == next_arg) {
-		if (((next_argw - argw) <= SIMM_MAX && (next_argw - argw) >= SIMM_MIN))
+		if (((next_argw - argw) <= SIMM_MAX && (next_argw - argw) >= SIMM_MIN)
+				|| TO_ARGW_HI(argw) == TO_ARGW_HI(next_argw))
 			return 1;
 		return 0;
 	}
@@ -1163,6 +1236,7 @@
 static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
 {
 	sljit_s32 tmp_ar, base, delay_slot;
+	sljit_sw offset, argw_hi;
 
 	SLJIT_ASSERT(arg & SLJIT_MEM);
 	if (!(next_arg & SLJIT_MEM)) {
@@ -1170,6 +1244,8 @@
 		next_argw = 0;
 	}
 
+	/* Since tmp can be the same as base or offset registers,
+	 * these might be unavailable after modifying tmp. */
 	if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) {
 		tmp_ar = reg_ar;
 		delay_slot = reg_ar;
@@ -1217,35 +1293,39 @@
 		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar), delay_slot);
 	}
 
-	if (compiler->cache_arg == arg && argw - compiler->cache_argw <= SIMM_MAX && argw - compiler->cache_argw >= SIMM_MIN) {
-		if (argw != compiler->cache_argw) {
-			FAIL_IF(push_inst(compiler, ADDIU_W | S(TMP_REG3) | T(TMP_REG3) | IMM(argw - compiler->cache_argw), DR(TMP_REG3)));
-			compiler->cache_argw = argw;
-		}
-		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar), delay_slot);
-	}
+	if (compiler->cache_arg == arg && argw - compiler->cache_argw <= SIMM_MAX && argw - compiler->cache_argw >= SIMM_MIN)
+		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar) | IMM(argw - compiler->cache_argw), delay_slot);
 
-	if (compiler->cache_arg == SLJIT_MEM && argw - compiler->cache_argw <= SIMM_MAX && argw - compiler->cache_argw >= SIMM_MIN) {
-		if (argw != compiler->cache_argw)
-			FAIL_IF(push_inst(compiler, ADDIU_W | S(TMP_REG3) | T(TMP_REG3) | IMM(argw - compiler->cache_argw), DR(TMP_REG3)));
-	}
-	else {
+	if (compiler->cache_arg == SLJIT_MEM && (argw - compiler->cache_argw) <= SIMM_MAX && (argw - compiler->cache_argw) >= SIMM_MIN) {
+		offset = argw - compiler->cache_argw;
+	} else {
 		compiler->cache_arg = SLJIT_MEM;
-		FAIL_IF(load_immediate(compiler, DR(TMP_REG3), argw));
+
+		argw_hi = TO_ARGW_HI(argw);
+
+		if (next_arg && next_argw - argw <= SIMM_MAX && next_argw - argw >= SIMM_MIN && argw_hi != TO_ARGW_HI(next_argw)) {
+			FAIL_IF(load_immediate(compiler, DR(TMP_REG3), argw));
+			compiler->cache_argw = argw;
+			offset = 0;
+		} else {
+			FAIL_IF(load_immediate(compiler, DR(TMP_REG3), argw_hi));
+			compiler->cache_argw = argw_hi;
+			offset = argw & 0xffff;
+			argw = argw_hi;
+		}
 	}
-	compiler->cache_argw = argw;
 
 	if (!base)
-		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar), delay_slot);
+		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar) | IMM(offset), delay_slot);
 
 	if (arg == next_arg && next_argw - argw <= SIMM_MAX && next_argw - argw >= SIMM_MIN) {
 		compiler->cache_arg = arg;
 		FAIL_IF(push_inst(compiler, ADDU_W | S(TMP_REG3) | T(base) | D(TMP_REG3), DR(TMP_REG3)));
-		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar), delay_slot);
+		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | S(TMP_REG3) | TA(reg_ar) | IMM(offset), delay_slot);
 	}
 
 	FAIL_IF(push_inst(compiler, ADDU_W | S(TMP_REG3) | T(base) | DA(tmp_ar), tmp_ar));
-	return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar), delay_slot);
+	return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar) | IMM(offset), delay_slot);
 }
 
 static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg_ar, sljit_s32 arg, sljit_sw argw)
@@ -1270,19 +1350,19 @@
 
 		if (SLJIT_UNLIKELY(argw)) {
 			FAIL_IF(push_inst(compiler, SLL_W | T(OFFS_REG(arg)) | DA(tmp_ar) | SH_IMM(argw), tmp_ar));
-			FAIL_IF(push_inst(compiler, ADDU_W | S(base) | TA(tmp_ar) | DA(tmp_ar), tmp_ar));
+			FAIL_IF(push_inst(compiler, ADDU_W | SA(tmp_ar) | T(base) | DA(tmp_ar), tmp_ar));
 		}
 		else
 			FAIL_IF(push_inst(compiler, ADDU_W | S(base) | T(OFFS_REG(arg)) | DA(tmp_ar), tmp_ar));
 		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar), delay_slot);
 	}
 
-	FAIL_IF(load_immediate(compiler, tmp_ar, argw));
+	FAIL_IF(load_immediate(compiler, tmp_ar, TO_ARGW_HI(argw)));
 
 	if (base != 0)
-		FAIL_IF(push_inst(compiler, ADDU_W | S(base) | TA(tmp_ar) | DA(tmp_ar), tmp_ar));
+		FAIL_IF(push_inst(compiler, ADDU_W | SA(tmp_ar) | T(base) | DA(tmp_ar), tmp_ar));
 
-	return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar), delay_slot);
+	return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | SA(tmp_ar) | TA(reg_ar) | IMM(argw), delay_slot);
 }
 
 static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
@@ -1292,6 +1372,649 @@
 	return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
 }
 
+#define EMIT_LOGICAL(op_imm, op_reg) \
+	if (flags & SRC2_IMM) { \
+		if (op & SLJIT_SET_Z) \
+			FAIL_IF(push_inst(compiler, op_imm | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG)); \
+		if (!(flags & UNUSED_DEST)) \
+			FAIL_IF(push_inst(compiler, op_imm | S(src1) | T(dst) | IMM(src2), DR(dst))); \
+	} \
+	else { \
+		if (op & SLJIT_SET_Z) \
+			FAIL_IF(push_inst(compiler, op_reg | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG)); \
+		if (!(flags & UNUSED_DEST)) \
+			FAIL_IF(push_inst(compiler, op_reg | S(src1) | T(src2) | D(dst), DR(dst))); \
+	}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+
+#define SELECT_OP(a, b) (b)
+
+#define EMIT_SHIFT(dimm, dimm32, imm, dv, v) \
+	op_imm = (imm); \
+	op_v = (v);
+
+#else /* !SLJIT_CONFIG_MIPS_32 */
+
+#define SELECT_OP(a, b) \
+	(!(op & SLJIT_32) ? a : b)
+
+#define EMIT_SHIFT(dimm, dimm32, imm, dv, v) \
+	op_dimm = (dimm); \
+	op_dimm32 = (dimm32); \
+	op_imm = (imm); \
+	op_dv = (dv); \
+	op_v = (v);
+
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV < 1)
+
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
+{
+	sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ);
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	sljit_ins max = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+	sljit_ins max = 32;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+	/* The TMP_REG2 is the next value. */
+	if (src != TMP_REG2)
+		FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src) | TA(0) | D(TMP_REG2), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG2) | TA(0) | IMM(is_clz ? 13 : 14), UNMOVABLE_INS));
+	/* The OTHER_FLAG is the counter. Delay slot. */
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | TA(OTHER_FLAG) | IMM(max), OTHER_FLAG));
+
+	if (!is_clz) {
+		FAIL_IF(push_inst(compiler, ANDI | S(TMP_REG2) | T(TMP_REG1) | IMM(1), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, BNE | S(TMP_REG1) | TA(0) | IMM(11), UNMOVABLE_INS));
+	} else
+		FAIL_IF(push_inst(compiler, BLTZ | S(TMP_REG2) | TA(0) | IMM(11), UNMOVABLE_INS));
+
+	/* Delay slot. */
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | TA(OTHER_FLAG) | IMM(0), OTHER_FLAG));
+
+	/* The TMP_REG1 is the next shift. */
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | SA(0) | T(TMP_REG1) | IMM(max), DR(TMP_REG1)));
+
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(TMP_REG2) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
+	FAIL_IF(push_inst(compiler, SELECT_OP(DSRL, SRL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(1), DR(TMP_REG1)));
+
+	FAIL_IF(push_inst(compiler, (is_clz ? SELECT_OP(DSRLV, SRLV) : SELECT_OP(DSLLV, SLLV)) | S(TMP_REG1) | TA(EQUAL_FLAG) | D(TMP_REG2), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, BNE | S(TMP_REG2) | TA(0) | IMM(-4), UNMOVABLE_INS));
+	/* Delay slot. */
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+
+	FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(TMP_REG1) | T(TMP_REG2) | IMM(-1), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, (is_clz ? SELECT_OP(DSRLV, SRLV) : SELECT_OP(DSLLV, SLLV)) | S(TMP_REG2) | TA(EQUAL_FLAG) | D(TMP_REG2), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, BEQ | S(TMP_REG2) | TA(0) | IMM(-7), UNMOVABLE_INS));
+	/* Delay slot. */
+	FAIL_IF(push_inst(compiler, OR | SA(OTHER_FLAG) | T(TMP_REG1) | DA(OTHER_FLAG), OTHER_FLAG));
+
+	return push_inst(compiler, SELECT_OP(DADDU, ADDU) | SA(OTHER_FLAG) | TA(0) | D(dst), DR(dst));
+}
+
+#endif /* SLJIT_MIPS_REV < 1 */
+
+static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
+	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
+{
+	sljit_s32 is_overflow, is_carry, carry_src_ar, is_handled;
+	sljit_ins op_imm, op_v;
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	sljit_ins ins, op_dimm, op_dimm32, op_dv;
+#endif
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_MOV:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if (dst != src2)
+			return push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src2) | TA(0) | D(dst), DR(dst));
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_U8:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
+			return push_inst(compiler, ANDI | S(src2) | T(dst) | IMM(0xff), DR(dst));
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_S8:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+			return push_inst(compiler, SEB | T(src2) | D(dst), DR(dst));
+#else /* SLJIT_MIPS_REV < 1 */
+			FAIL_IF(push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(24), DR(dst)));
+			return push_inst(compiler, SRA | T(dst) | D(dst) | SH_IMM(24), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 1 */
+#else /* !SLJIT_CONFIG_MIPS_32 */
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+			if (op & SLJIT_32)
+				return push_inst(compiler, SEB | T(src2) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 1 */
+			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(24), DR(dst)));
+			return push_inst(compiler, DSRA32 | T(dst) | D(dst) | SH_IMM(24), DR(dst));
+#endif /* SLJIT_CONFIG_MIPS_32 */
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_U16:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
+			return push_inst(compiler, ANDI | S(src2) | T(dst) | IMM(0xffff), DR(dst));
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_S16:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+			return push_inst(compiler, SEH | T(src2) | D(dst), DR(dst));
+#else /* SLJIT_MIPS_REV < 1 */
+			FAIL_IF(push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(16), DR(dst)));
+			return push_inst(compiler, SRA | T(dst) | D(dst) | SH_IMM(16), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 1 */
+#else /* !SLJIT_CONFIG_MIPS_32 */
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+			if (op & SLJIT_32)
+				return push_inst(compiler, SEH | T(src2) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 1 */
+			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(16), DR(dst)));
+			return push_inst(compiler, DSRA32 | T(dst) | D(dst) | SH_IMM(16), DR(dst));
+#endif /* SLJIT_CONFIG_MIPS_32 */
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	case SLJIT_MOV_U32:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM) && !(op & SLJIT_32));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+			if (dst == src2)
+				return push_inst(compiler, DINSU | T(src2) | SA(0) | (31 << 11) | (0 << 11), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 2 */
+			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(0), DR(dst)));
+			return push_inst(compiler, DSRL32 | T(dst) | D(dst) | SH_IMM(0), DR(dst));
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_S32:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM) && !(op & SLJIT_32));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+			return push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(0), DR(dst));
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+	case SLJIT_NOT:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+		if (!(flags & UNUSED_DEST))
+			FAIL_IF(push_inst(compiler, NOR | S(src2) | T(src2) | D(dst), DR(dst)));
+		return SLJIT_SUCCESS;
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+	case SLJIT_CLZ:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		return push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | D(dst), DR(dst));
+#else /* SLJIT_MIPS_REV < 6 */
+		return push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(src2) | T(dst) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 6 */
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(0) | T(src2) | D(TMP_REG1), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, AND | S(src2) | T(TMP_REG1) | D(dst), DR(dst)));
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(dst) | D(dst), DR(dst)));
+#else /* SLJIT_MIPS_REV < 6 */
+		FAIL_IF(push_inst(compiler, SELECT_OP(DCLZ, CLZ) | S(dst) | T(dst) | D(dst), DR(dst)));
+#endif /* SLJIT_MIPS_REV >= 6 */
+		FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(dst) | T(TMP_REG1) | IMM(SELECT_OP(-64, -32)), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(SELECT_OP(26, 27)), DR(TMP_REG1)));
+		return push_inst(compiler, XOR | S(dst) | T(TMP_REG1) | D(dst), DR(dst));
+#else /* SLJIT_MIPS_REV < 1 */
+	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		return emit_clz_ctz(compiler, op, dst, src2);
+#endif /* SLJIT_MIPS_REV >= 1 */
+
+	case SLJIT_ADD:
+		/* Overflow computation (both add and sub): overflow = src1_sign ^ src2_sign ^ result_sign ^ carry_flag */
+		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
+		carry_src_ar = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			if (is_overflow) {
+				if (src2 >= 0)
+					FAIL_IF(push_inst(compiler, OR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+				else
+					FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+			}
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(src2), DR(dst)));
+		}
+		else {
+			if (is_overflow)
+				FAIL_IF(push_inst(compiler, XOR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+			if (is_overflow || carry_src_ar != 0) {
+				if (src1 != dst)
+					carry_src_ar = DR(src1);
+				else if (src2 != dst)
+					carry_src_ar = DR(src2);
+				else {
+					FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | TA(0) | DA(OTHER_FLAG), OTHER_FLAG));
+					carry_src_ar = OTHER_FLAG;
+				}
+			}
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | T(src2) | D(dst), DR(dst)));
+		}
+
+		/* Carry is zero if a + b >= a or a + b >= b, otherwise it is 1. */
+		if (is_overflow || carry_src_ar != 0) {
+			if (flags & SRC2_IMM)
+				FAIL_IF(push_inst(compiler, SLTIU | S(dst) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
+			else
+				FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(carry_src_ar) | DA(OTHER_FLAG), OTHER_FLAG));
+		}
+
+		if (!is_overflow)
+			return SLJIT_SUCCESS;
+
+		FAIL_IF(push_inst(compiler, XOR | S(dst) | TA(EQUAL_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(dst) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(31), DR(TMP_REG1)));
+		return push_inst(compiler, XOR | S(TMP_REG1) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
+
+	case SLJIT_ADDC:
+		carry_src_ar = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(src2), DR(dst)));
+		} else {
+			if (carry_src_ar != 0) {
+				if (src1 != dst)
+					carry_src_ar = DR(src1);
+				else if (src2 != dst)
+					carry_src_ar = DR(src2);
+				else {
+					FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
+					carry_src_ar = EQUAL_FLAG;
+				}
+			}
+
+			FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | T(src2) | D(dst), DR(dst)));
+		}
+
+		/* Carry is zero if a + b >= a or a + b >= b, otherwise it is 1. */
+		if (carry_src_ar != 0) {
+			if (flags & SRC2_IMM)
+				FAIL_IF(push_inst(compiler, SLTIU | S(dst) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
+			else
+				FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(carry_src_ar) | DA(EQUAL_FLAG), EQUAL_FLAG));
+		}
+
+		FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
+
+		if (carry_src_ar == 0)
+			return SLJIT_SUCCESS;
+
+		/* Set ULESS_FLAG (dst == 0) && (OTHER_FLAG == 1). */
+		FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG));
+		/* Set carry flag. */
+		return push_inst(compiler, OR | SA(OTHER_FLAG) | TA(EQUAL_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
+
+	case SLJIT_SUB:
+		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
+			FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
+			src2 = TMP_REG2;
+			flags &= ~SRC2_IMM;
+		}
+
+		is_handled = 0;
+
+		if (flags & SRC2_IMM) {
+			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
+				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
+				is_handled = 1;
+			}
+			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
+				FAIL_IF(push_inst(compiler, SLTI | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
+				is_handled = 1;
+			}
+		}
+
+		if (!is_handled && GET_FLAG_TYPE(op) >= SLJIT_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) {
+			is_handled = 1;
+
+			if (flags & SRC2_IMM) {
+				FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
+				src2 = TMP_REG2;
+				flags &= ~SRC2_IMM;
+			}
+
+			switch (GET_FLAG_TYPE(op)) {
+			case SLJIT_LESS:
+			case SLJIT_GREATER_EQUAL:
+				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
+				break;
+			case SLJIT_GREATER:
+			case SLJIT_LESS_EQUAL:
+				FAIL_IF(push_inst(compiler, SLTU | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
+				break;
+			case SLJIT_SIG_LESS:
+			case SLJIT_SIG_GREATER_EQUAL:
+				FAIL_IF(push_inst(compiler, SLT | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
+				break;
+			case SLJIT_SIG_GREATER:
+			case SLJIT_SIG_LESS_EQUAL:
+				FAIL_IF(push_inst(compiler, SLT | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
+				break;
+			}
+		}
+
+		if (is_handled) {
+			if (flags & SRC2_IMM) {
+				if (op & SLJIT_SET_Z)
+					FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | TA(EQUAL_FLAG) | IMM(-src2), EQUAL_FLAG));
+				if (!(flags & UNUSED_DEST))
+					return push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(-src2), DR(dst));
+			}
+			else {
+				if (op & SLJIT_SET_Z)
+					FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+				if (!(flags & UNUSED_DEST))
+					return push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | D(dst), DR(dst));
+			}
+			return SLJIT_SUCCESS;
+		}
+
+		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
+		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			if (is_overflow) {
+				if (src2 >= 0)
+					FAIL_IF(push_inst(compiler, OR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+				else
+					FAIL_IF(push_inst(compiler, NOR | S(src1) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+			}
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | TA(EQUAL_FLAG) | IMM(-src2), EQUAL_FLAG));
+
+			if (is_overflow || is_carry)
+				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(OTHER_FLAG) | IMM(src2), OTHER_FLAG));
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(-src2), DR(dst)));
+		}
+		else {
+			if (is_overflow)
+				FAIL_IF(push_inst(compiler, XOR | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+			if (is_overflow || is_carry)
+				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(OTHER_FLAG), OTHER_FLAG));
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | D(dst), DR(dst)));
+		}
+
+		if (!is_overflow)
+			return SLJIT_SUCCESS;
+
+		FAIL_IF(push_inst(compiler, XOR | S(dst) | TA(EQUAL_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(dst) | TA(0) | DA(EQUAL_FLAG), EQUAL_FLAG));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(TMP_REG1) | D(TMP_REG1) | SH_IMM(31), DR(TMP_REG1)));
+		return push_inst(compiler, XOR | S(TMP_REG1) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
+
+	case SLJIT_SUBC:
+		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
+			FAIL_IF(push_inst(compiler, ADDIU | SA(0) | T(TMP_REG2) | IMM(src2), DR(TMP_REG2)));
+			src2 = TMP_REG2;
+			flags &= ~SRC2_IMM;
+		}
+
+		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			if (is_carry)
+				FAIL_IF(push_inst(compiler, SLTIU | S(src1) | TA(EQUAL_FLAG) | IMM(src2), EQUAL_FLAG));
+
+			FAIL_IF(push_inst(compiler, SELECT_OP(DADDIU, ADDIU) | S(src1) | T(dst) | IMM(-src2), DR(dst)));
+		}
+		else {
+			if (is_carry)
+				FAIL_IF(push_inst(compiler, SLTU | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+			FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(src1) | T(src2) | D(dst), DR(dst)));
+		}
+
+		if (is_carry)
+			FAIL_IF(push_inst(compiler, SLTU | S(dst) | TA(OTHER_FLAG) | D(TMP_REG1), DR(TMP_REG1)));
+
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
+
+		if (!is_carry)
+			return SLJIT_SUCCESS;
+
+		return push_inst(compiler, OR | SA(EQUAL_FLAG) | T(TMP_REG1) | DA(OTHER_FLAG), OTHER_FLAG);
+
+	case SLJIT_MUL:
+		SLJIT_ASSERT(!(flags & SRC2_IMM));
+
+		if (GET_FLAG_TYPE(op) != SLJIT_OVERFLOW) {
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+			return push_inst(compiler, SELECT_OP(DMUL, MUL) | S(src1) | T(src2) | D(dst), DR(dst));
+#elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+			return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));
+#else /* !SLJIT_CONFIG_MIPS_32 */
+			if (op & SLJIT_32)
+				return push_inst(compiler, MUL | S(src1) | T(src2) | D(dst), DR(dst));
+			FAIL_IF(push_inst(compiler, DMULT | S(src1) | T(src2), MOVABLE_INS));
+			return push_inst(compiler, MFLO | D(dst), DR(dst));
+#endif /* SLJIT_CONFIG_MIPS_32 */
+#else /* SLJIT_MIPS_REV < 1 */
+			FAIL_IF(push_inst(compiler, SELECT_OP(DMULT, MULT) | S(src1) | T(src2), MOVABLE_INS));
+			return push_inst(compiler, MFLO | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 6 */
+		}
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		FAIL_IF(push_inst(compiler, SELECT_OP(DMUL, MUL) | S(src1) | T(src2) | D(dst), DR(dst)));
+		FAIL_IF(push_inst(compiler, SELECT_OP(DMUH, MUH) | S(src1) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+#else /* SLJIT_MIPS_REV < 6 */
+		FAIL_IF(push_inst(compiler, SELECT_OP(DMULT, MULT) | S(src1) | T(src2), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, MFHI | DA(EQUAL_FLAG), EQUAL_FLAG));
+		FAIL_IF(push_inst(compiler, MFLO | D(dst), DR(dst)));
+#endif /* SLJIT_MIPS_REV >= 6 */
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSRA32, SRA) | T(dst) | DA(OTHER_FLAG) | SH_IMM(31), OTHER_FLAG));
+		return push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(EQUAL_FLAG) | TA(OTHER_FLAG) | DA(OTHER_FLAG), OTHER_FLAG);
+
+	case SLJIT_AND:
+		EMIT_LOGICAL(ANDI, AND);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_OR:
+		EMIT_LOGICAL(ORI, OR);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_XOR:
+		EMIT_LOGICAL(XORI, XOR);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_SHL:
+	case SLJIT_MSHL:
+		EMIT_SHIFT(DSLL, DSLL32, SLL, DSLLV, SLLV);
+		break;
+
+	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
+		EMIT_SHIFT(DSRL, DSRL32, SRL, DSRLV, SRLV);
+		break;
+
+	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+		EMIT_SHIFT(DSRA, DSRA32, SRA, DSRAV, SRAV);
+		break;
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	case SLJIT_ROTL:
+		if ((flags & SRC2_IMM) || src2 == 0) {
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+			src2 = -src2 & 0x1f;
+#else /* !SLJIT_CONFIG_MIPS_32 */
+			src2 = -src2 & ((op & SLJIT_32) ? 0x1f : 0x3f);
+#endif /* SLJIT_CONFIG_MIPS_32 */
+		} else {
+			FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(0) | T(src2) | D(TMP_REG2), DR(TMP_REG2)));
+			src2 = TMP_REG2;
+		}
+		/* fallthrough */
+
+	case SLJIT_ROTR:
+		EMIT_SHIFT(DROTR, DROTR32, ROTR, DROTRV, ROTRV);
+		break;
+#else /* SLJIT_MIPS_REV < 1 */
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & SRC2_IMM) {
+			SLJIT_ASSERT(src2 != 0);
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+			if (!(op & SLJIT_32)) {
+				if (GET_OPCODE(op) == SLJIT_ROTL)
+					op_imm = ((src2 < 32) ? DSLL : DSLL32);
+				else
+					op_imm = ((src2 < 32) ? DSRL : DSRL32);
+
+				FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(OTHER_FLAG) | (((sljit_ins)src2 & 0x1f) << 6), OTHER_FLAG));
+
+				src2 = 64 - src2;
+				if (GET_OPCODE(op) == SLJIT_ROTL)
+					op_imm = ((src2 < 32) ? DSRL : DSRL32);
+				else
+					op_imm = ((src2 < 32) ? DSLL : DSLL32);
+
+				FAIL_IF(push_inst(compiler, op_imm | T(src1) | D(dst) | (((sljit_ins)src2 & 0x1f) << 6), DR(dst)));
+				return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+			}
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SLL : SRL;
+			FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(OTHER_FLAG) | ((sljit_ins)src2 << 6), OTHER_FLAG));
+
+			src2 = 32 - src2;
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SRL : SLL;
+			FAIL_IF(push_inst(compiler, op_imm | T(src1) | D(dst) | (((sljit_ins)src2 & 0x1f) << 6), DR(dst)));
+			return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+		}
+
+		if (src2 == 0) {
+			if (dst != src1)
+				return push_inst(compiler, SELECT_OP(DADDU, ADDU) | S(src1) | TA(0) | D(dst), DR(dst));
+			return SLJIT_SUCCESS;
+		}
+
+		FAIL_IF(push_inst(compiler, SELECT_OP(DSUBU, SUBU) | SA(0) | T(src2) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+		if (!(op & SLJIT_32)) {
+			op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? DSLLV : DSRLV;
+			FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
+			op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? DSRLV : DSLLV;
+			FAIL_IF(push_inst(compiler, op_v | SA(EQUAL_FLAG) | T(src1) | D(dst), DR(dst)));
+			return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+		}
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+		op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? SLLV : SRLV;
+		FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(OTHER_FLAG), OTHER_FLAG));
+		op_v = (GET_OPCODE(op) == SLJIT_ROTL) ? SRLV : SLLV;
+		FAIL_IF(push_inst(compiler, op_v | SA(EQUAL_FLAG) | T(src1) | D(dst), DR(dst)));
+		return push_inst(compiler, OR | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 2 */
+
+	default:
+		SLJIT_UNREACHABLE();
+		return SLJIT_SUCCESS;
+	}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	if ((flags & SRC2_IMM) || src2 == 0) {
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, op_imm | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG));
+
+		if (flags & UNUSED_DEST)
+			return SLJIT_SUCCESS;
+		return push_inst(compiler, op_imm | T(src1) | D(dst) | SH_IMM(src2), DR(dst));
+	}
+
+	if (op & SLJIT_SET_Z)
+		FAIL_IF(push_inst(compiler, op_v | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+	if (flags & UNUSED_DEST)
+		return SLJIT_SUCCESS;
+	return push_inst(compiler, op_v | S(src2) | T(src1) | D(dst), DR(dst));
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	if ((flags & SRC2_IMM) || src2 == 0) {
+		if (src2 >= 32) {
+			SLJIT_ASSERT(!(op & SLJIT_32));
+			ins = op_dimm32;
+			src2 -= 32;
+		}
+		else
+			ins = (op & SLJIT_32) ? op_imm : op_dimm;
+
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, ins | T(src1) | DA(EQUAL_FLAG) | SH_IMM(src2), EQUAL_FLAG));
+
+		if (flags & UNUSED_DEST)
+			return SLJIT_SUCCESS;
+		return push_inst(compiler, ins | T(src1) | D(dst) | SH_IMM(src2), DR(dst));
+	}
+
+	ins = (op & SLJIT_32) ? op_v : op_dv;
+	if (op & SLJIT_SET_Z)
+		FAIL_IF(push_inst(compiler, ins | S(src2) | T(src1) | DA(EQUAL_FLAG), EQUAL_FLAG));
+
+	if (flags & UNUSED_DEST)
+		return SLJIT_SUCCESS;
+	return push_inst(compiler, ins | S(src2) | T(src1) | D(dst), DR(dst));
+#endif /* SLJIT_CONFIG_MIPS_32 */
+}
+
+#define CHECK_IMM(flags, srcw) \
+	((!((flags) & LOGICAL_OP) && ((srcw) <= SIMM_MAX && (srcw) >= SIMM_MIN)) \
+		|| (((flags) & LOGICAL_OP) && !((srcw) & ~UIMM_MAX)))
+
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src1, sljit_sw src1w,
@@ -1325,25 +2048,18 @@
 		flags |= SLOW_DEST;
 
 	if (flags & IMM_OP) {
-		if ((src2 & SLJIT_IMM) && src2w) {
-			if ((!(flags & LOGICAL_OP) && (src2w <= SIMM_MAX && src2w >= SIMM_MIN))
-				|| ((flags & LOGICAL_OP) && !(src2w & ~UIMM_MAX))) {
-				flags |= SRC2_IMM;
-				src2_r = src2w;
-			}
-		}
-		if (!(flags & SRC2_IMM) && (flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w) {
-			if ((!(flags & LOGICAL_OP) && (src1w <= SIMM_MAX && src1w >= SIMM_MIN))
-				|| ((flags & LOGICAL_OP) && !(src1w & ~UIMM_MAX))) {
-				flags |= SRC2_IMM;
-				src2_r = src1w;
+		if ((src2 & SLJIT_IMM) && src2w != 0 && CHECK_IMM(flags, src2w)) {
+			flags |= SRC2_IMM;
+			src2_r = src2w;
+		} else if ((flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w != 0 && CHECK_IMM(flags, src1w)) {
+			flags |= SRC2_IMM;
+			src2_r = src1w;
 
-				/* And swap arguments. */
-				src1 = src2;
-				src1w = src2w;
-				src2 = SLJIT_IMM;
-				/* src2w = src2_r unneeded. */
-			}
+			/* And swap arguments. */
+			src1 = src2;
+			src1w = src2w;
+			src2 = SLJIT_IMM;
+			/* src2w = src2_r unneeded. */
 		}
 	}
 
@@ -1429,6 +2145,8 @@
 	return SLJIT_SUCCESS;
 }
 
+#undef CHECK_IMM
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
 {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
@@ -1584,6 +2302,7 @@
 		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
 		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 	}
 
@@ -1635,8 +2354,13 @@
 		return emit_op(compiler, op, flags | CUMULATIVE_OP | LOGICAL_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 		if (src2 & SLJIT_IMM)
 			src2w &= 0x1f;
@@ -1662,13 +2386,106 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
 }
 
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+#define SELECT_OP3(op, src2w, D, D32, W) (((op & SLJIT_32) ? (W) : ((src2w) < 32) ? (D) : (D32)) | (((sljit_ins)src2w & 0x1f) << 6))
+#define SELECT_OP2(op, D, W) ((op & SLJIT_32) ? (W) : (D))
+#else /* !SLJIT_CONFIG_MIPS_64 */
+#define SELECT_OP3(op, src2w, D, D32, W) ((W) | ((sljit_ins)(src2w) << 6))
+#define SELECT_OP2(op, D, W) (W)
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 is_left;
+	sljit_ins ins1, ins2, ins3;
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_MIPS_64 */
+	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA;
+	sljit_sw bit_length = 32;
+#endif /* SLJIT_CONFIG_MIPS_64 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_left ? SLJIT_ROTL : SLJIT_ROTR) | (op & SLJIT_32), src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	if (src2 & SLJIT_IMM) {
+		src2w &= bit_length - 1;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, DR(TMP_REG2), src2, src2w));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, DR(TMP_REG1), src1, src1w));
+		src1 = TMP_REG1;
+	} else if (src1 & SLJIT_IMM) {
+		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), src1w));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_IMM) {
+		if (is_left) {
+			ins1 = SELECT_OP3(op, src2w, DSLL, DSLL32, SLL);
+			src2w = bit_length - src2w;
+			ins2 = SELECT_OP3(op, src2w, DSRL, DSRL32, SRL);
+		} else {
+			ins1 = SELECT_OP3(op, src2w, DSRL, DSRL32, SRL);
+			src2w = bit_length - src2w;
+			ins2 = SELECT_OP3(op, src2w, DSLL, DSLL32, SLL);
+		}
+
+		FAIL_IF(push_inst(compiler, ins1 | T(src_dst) | D(src_dst), DR(src_dst)));
+		FAIL_IF(push_inst(compiler, ins2 | T(src1) | D(TMP_REG1), DR(TMP_REG1)));
+		return push_inst(compiler, OR | S(src_dst) | T(TMP_REG1) | D(src_dst), DR(src_dst));
+	}
+
+	if (is_left) {
+		ins1 = SELECT_OP2(op, DSRL, SRL);
+		ins2 = SELECT_OP2(op, DSLLV, SLLV);
+		ins3 = SELECT_OP2(op, DSRLV, SRLV);
+	} else {
+		ins1 = SELECT_OP2(op, DSLL, SLL);
+		ins2 = SELECT_OP2(op, DSRLV, SRLV);
+		ins3 = SELECT_OP2(op, DSLLV, SLLV);
+	}
+
+	FAIL_IF(push_inst(compiler, ins2 | S(src2) | T(src_dst) | D(src_dst), DR(src_dst)));
+
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) {
+		FAIL_IF(push_inst(compiler, ins1 | T(src1) | D(TMP_REG1) | (1 << 6), DR(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, XORI | S(src2) | T(TMP_REG2) | ((sljit_ins)bit_length - 1), DR(TMP_REG2)));
+		src1 = TMP_REG1;
+	} else
+		FAIL_IF(push_inst(compiler, SELECT_OP2(op, DSUBU, SUBU) | SA(0) | T(src2) | D(TMP_REG2), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, ins3 | S(TMP_REG2) | T(src1) | D(TMP_REG1), DR(TMP_REG1)));
+	return push_inst(compiler, OR | S(src_dst) | T(TMP_REG1) | D(src_dst), DR(src_dst));
+}
+
+#undef SELECT_OP3
+#undef SELECT_OP2
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1746,14 +2563,19 @@
 
 	FAIL_IF(push_inst(compiler, (TRUNC_W_S ^ (flags >> 19)) | FMT(op) | FS(src) | FD(TMP_FREG1), MOVABLE_INS));
 
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, MFC1 | flags | T(dst) | FS(TMP_FREG1), MOVABLE_INS);
+	if (FAST_IS_REG(dst)) {
+		FAIL_IF(push_inst(compiler, MFC1 | flags | T(dst) | FS(TMP_FREG1), MOVABLE_INS));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+		return SLJIT_SUCCESS;
+	}
 
 	/* Store the integer value from a VFP register. */
 	return emit_op_mem2(compiler, flags ? DOUBLE_DATA : SINGLE_DATA, FR(TMP_FREG1), dst, dstw, 0, 0);
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#	undef is_long
+#	undef flags
 #endif
 }
 
@@ -1769,19 +2591,25 @@
 
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
-	if (FAST_IS_REG(src))
+	if (FAST_IS_REG(src)) {
 		FAIL_IF(push_inst(compiler, MTC1 | flags | T(src) | FS(TMP_FREG1), MOVABLE_INS));
-	else if (src & SLJIT_MEM) {
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+	} else if (src & SLJIT_MEM) {
 		/* Load the integer value into a VFP register. */
-		FAIL_IF(emit_op_mem2(compiler, ((flags) ? DOUBLE_DATA : SINGLE_DATA) | LOAD_DATA, FR(TMP_FREG1), src, srcw, dst, dstw));
+		FAIL_IF(emit_op_mem2(compiler, (flags ? DOUBLE_DATA : SINGLE_DATA) | LOAD_DATA, FR(TMP_FREG1), src, srcw, dst, dstw));
 	}
 	else {
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 			srcw = (sljit_s32)srcw;
 #endif
 		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), srcw));
 		FAIL_IF(push_inst(compiler, MTC1 | flags | T(TMP_REG1) | FS(TMP_FREG1), MOVABLE_INS));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
 	}
 
 	FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | ((((sljit_ins)op & SLJIT_32) ^ SLJIT_32) >> 8) | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));
@@ -1812,20 +2640,38 @@
 	}
 
 	switch (GET_FLAG_TYPE(op)) {
-	case SLJIT_EQUAL_F64:
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+		inst = C_EQ_S;
+		break;
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
 		inst = C_UEQ_S;
 		break;
-	case SLJIT_LESS_F64:
-	case SLJIT_GREATER_EQUAL_F64:
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		inst = C_OLT_S;
+		break;
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_ORDERED_GREATER_EQUAL:
 		inst = C_ULT_S;
 		break;
-	case SLJIT_GREATER_F64:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		inst = C_ULE_S;
 		break;
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_ORDERED_LESS_EQUAL:
+		inst = C_OLE_S;
+		break;
 	default:
-		SLJIT_ASSERT(GET_FLAG_TYPE(op) == SLJIT_UNORDERED_F64 || GET_FLAG_TYPE(op) == SLJIT_ORDERED_F64);
+		SLJIT_ASSERT(GET_FLAG_TYPE(op) == SLJIT_UNORDERED || GET_FLAG_TYPE(op) == SLJIT_ORDERED);
 		inst = C_UN_S;
 		break;
 	}
@@ -1871,6 +2717,7 @@
 		FAIL_IF(push_inst(compiler, ABS_S | FMT(op) | FS(src) | FD(dst_r), MOVABLE_INS));
 		break;
 	case SLJIT_CONV_F64_FROM_F32:
+		/* The SLJIT_32 bit is inverted because sljit_f32 needs to be loaded from the memory. */
 		FAIL_IF(push_inst(compiler, CVT_S_S | (sljit_ins)((op & SLJIT_32) ? 1 : (1 << 21)) | FS(src) | FD(dst_r), MOVABLE_INS));
 		op ^= SLJIT_32;
 		break;
@@ -1959,6 +2806,9 @@
 	return SLJIT_SUCCESS;
 }
 
+#undef FLOAT_DATA
+#undef FMT
+
 /* --------------------------------------------------------------------- */
 /*  Other instructions                                                   */
 /* --------------------------------------------------------------------- */
@@ -2000,18 +2850,18 @@
 }
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#define JUMP_LENGTH	4
+#define BRANCH_LENGTH	4
 #else
-#define JUMP_LENGTH	8
+#define BRANCH_LENGTH	8
 #endif
 
 #define BR_Z(src) \
-	inst = BEQ | SA(src) | TA(0) | JUMP_LENGTH; \
+	inst = BEQ | SA(src) | TA(0) | BRANCH_LENGTH; \
 	flags = IS_BIT26_COND; \
 	delay_check = src;
 
 #define BR_NZ(src) \
-	inst = BNE | SA(src) | TA(0) | JUMP_LENGTH; \
+	inst = BNE | SA(src) | TA(0) | BRANCH_LENGTH; \
 	flags = IS_BIT26_COND; \
 	delay_check = src;
 
@@ -2029,11 +2879,11 @@
 #else /* SLJIT_MIPS_REV < 6 */
 
 #define BR_T() \
-	inst = BC1T | JUMP_LENGTH; \
+	inst = BC1T | BRANCH_LENGTH; \
 	flags = IS_BIT16_COND; \
 	delay_check = FCSR_FCC;
 #define BR_F() \
-	inst = BC1F | JUMP_LENGTH; \
+	inst = BC1F | BRANCH_LENGTH; \
 	flags = IS_BIT16_COND; \
 	delay_check = FCSR_FCC;
 
@@ -2077,16 +2927,28 @@
 	case SLJIT_NOT_CARRY:
 		BR_NZ(OTHER_FLAG);
 		break;
-	case SLJIT_NOT_EQUAL_F64:
-	case SLJIT_GREATER_EQUAL_F64:
-	case SLJIT_GREATER_F64:
-	case SLJIT_ORDERED_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_F_GREATER:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_ORDERED:
 		BR_T();
 		break;
-	case SLJIT_EQUAL_F64:
-	case SLJIT_LESS_F64:
-	case SLJIT_LESS_EQUAL_F64:
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_F_LESS:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
+	case SLJIT_UNORDERED:
 		BR_F();
 		break;
 	default:
@@ -2102,8 +2964,6 @@
 	if (inst)
 		PTR_FAIL_IF(push_inst(compiler, inst, UNMOVABLE_INS));
 
-	PTR_FAIL_IF(emit_const(compiler, TMP_REG2, 0));
-
 	if (type <= SLJIT_JUMP)
 		PTR_FAIL_IF(push_inst(compiler, JR | S(TMP_REG2), UNMOVABLE_INS));
 	else {
@@ -2113,6 +2973,13 @@
 
 	jump->addr = compiler->size;
 	PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+
+	/* Maximum number of instructions required for generating a constant. */
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	compiler->size += 2;
+#else
+	compiler->size += 6;
+#endif
 	return jump;
 }
 
@@ -2151,11 +3018,17 @@
 
 	compiler->cache_arg = 0;
 	compiler->cache_argw = 0;
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	flags = WORD_DATA | LOAD_DATA;
+#else /* !SLJIT_CONFIG_MIPS_32 */
 	flags = ((type & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
 	if (src1 & SLJIT_MEM) {
 		PTR_FAIL_IF(emit_op_mem2(compiler, flags, DR(TMP_REG1), src1, src1w, src2, src2w));
 		src1 = TMP_REG1;
 	}
+
 	if (src2 & SLJIT_MEM) {
 		PTR_FAIL_IF(emit_op_mem2(compiler, flags, DR(TMP_REG2), src2, src2w, 0, 0));
 		src2 = TMP_REG2;
@@ -2172,7 +3045,7 @@
 		jump->flags |= IS_BIT26_COND;
 		if (compiler->delay_slot == MOVABLE_INS || (compiler->delay_slot != UNMOVABLE_INS && compiler->delay_slot != DR(src1) && compiler->delay_slot != DR(src2)))
 			jump->flags |= IS_MOVABLE;
-		PTR_FAIL_IF(push_inst(compiler, (type == SLJIT_EQUAL ? BNE : BEQ) | S(src1) | T(src2) | JUMP_LENGTH, UNMOVABLE_INS));
+		PTR_FAIL_IF(push_inst(compiler, (type == SLJIT_EQUAL ? BNE : BEQ) | S(src1) | T(src2) | BRANCH_LENGTH, UNMOVABLE_INS));
 	}
 	else if (type >= SLJIT_SIG_LESS && (((src1 & SLJIT_IMM) && (src1w == 0)) || ((src2 & SLJIT_IMM) && (src2w == 0)))) {
 		inst = NOP;
@@ -2219,7 +3092,7 @@
 				break;
 			}
 		}
-		PTR_FAIL_IF(push_inst(compiler, inst | S(src1) | JUMP_LENGTH, UNMOVABLE_INS));
+		PTR_FAIL_IF(push_inst(compiler, inst | S(src1) | BRANCH_LENGTH, UNMOVABLE_INS));
 	}
 	else {
 		if (type == SLJIT_LESS || type == SLJIT_GREATER_EQUAL || type == SLJIT_SIG_LESS || type == SLJIT_SIG_GREATER_EQUAL) {
@@ -2244,20 +3117,26 @@
 		}
 
 		jump->flags |= IS_BIT26_COND;
-		PTR_FAIL_IF(push_inst(compiler, (type == SLJIT_EQUAL ? BNE : BEQ) | S(TMP_REG1) | TA(0) | JUMP_LENGTH, UNMOVABLE_INS));
+		PTR_FAIL_IF(push_inst(compiler, (type == SLJIT_EQUAL ? BNE : BEQ) | S(TMP_REG1) | TA(0) | BRANCH_LENGTH, UNMOVABLE_INS));
 	}
 
-	PTR_FAIL_IF(emit_const(compiler, TMP_REG2, 0));
 	PTR_FAIL_IF(push_inst(compiler, JR | S(TMP_REG2), UNMOVABLE_INS));
 	jump->addr = compiler->size;
 	PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+
+	/* Maximum number of instructions required for generating a constant. */
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	compiler->size += 2;
+#else
+	compiler->size += 6;
+#endif
 	return jump;
 }
 
 #undef RESOLVE_IMM1
 #undef RESOLVE_IMM2
 
-#undef JUMP_LENGTH
+#undef BRANCH_LENGTH
 #undef BR_Z
 #undef BR_NZ
 #undef BR_T
@@ -2272,7 +3151,6 @@
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (src & SLJIT_IMM) {
 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
@@ -2283,17 +3161,29 @@
 		if (compiler->delay_slot != UNMOVABLE_INS)
 			jump->flags |= IS_MOVABLE;
 
-		FAIL_IF(emit_const(compiler, TMP_REG2, 0));
 		src = TMP_REG2;
-	}
-	else if (src & SLJIT_MEM) {
+	} else if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, DR(TMP_REG2), src, srcw));
 		src = TMP_REG2;
 	}
 
-	FAIL_IF(push_inst(compiler, JR | S(src), UNMOVABLE_INS));
-	if (jump)
+	if (type <= SLJIT_JUMP)
+		FAIL_IF(push_inst(compiler, JR | S(src), UNMOVABLE_INS));
+	else
+		FAIL_IF(push_inst(compiler, JALR | S(src) | DA(RETURN_ADDR_REG), UNMOVABLE_INS));
+
+	if (jump != NULL) {
 		jump->addr = compiler->size;
+
+		/* Maximum number of instructions required for generating a constant. */
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+		compiler->size += 2;
+#else
+		compiler->size += 6;
+#endif
+	}
+
 	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
 	return SLJIT_SUCCESS;
 }
@@ -2302,7 +3192,7 @@
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
 {
-	sljit_s32 src_ar, dst_ar;
+	sljit_s32 src_ar, dst_ar, invert;
 	sljit_s32 saved_op = op;
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 	sljit_s32 mem_type = WORD_DATA;
@@ -2323,32 +3213,45 @@
 	if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
 		FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, DR(TMP_REG1), dst, dstw, dst, dstw));
 
-	switch (type & 0xff) {
-	case SLJIT_EQUAL:
-	case SLJIT_NOT_EQUAL:
-		FAIL_IF(push_inst(compiler, SLTIU | SA(EQUAL_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
-		src_ar = dst_ar;
-		break;
-	case SLJIT_OVERFLOW:
-	case SLJIT_NOT_OVERFLOW:
-		if (compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)) {
-			src_ar = OTHER_FLAG;
+	if (type < SLJIT_F_EQUAL) {
+		src_ar = OTHER_FLAG;
+		invert = type & 0x1;
+
+		switch (type) {
+		case SLJIT_EQUAL:
+		case SLJIT_NOT_EQUAL:
+			FAIL_IF(push_inst(compiler, SLTIU | SA(EQUAL_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
+			src_ar = dst_ar;
+			break;
+		case SLJIT_OVERFLOW:
+		case SLJIT_NOT_OVERFLOW:
+			if (compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)) {
+				src_ar = OTHER_FLAG;
+				break;
+			}
+			FAIL_IF(push_inst(compiler, SLTIU | SA(OTHER_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
+			src_ar = dst_ar;
+			invert ^= 0x1;
 			break;
 		}
-		FAIL_IF(push_inst(compiler, SLTIU | SA(OTHER_FLAG) | TA(dst_ar) | IMM(1), dst_ar));
-		src_ar = dst_ar;
-		type ^= 0x1; /* Flip type bit for the XORI below. */
-		break;
-	case SLJIT_GREATER_F64:
-	case SLJIT_LESS_EQUAL_F64:
-		type ^= 0x1; /* Flip type bit for the XORI below. */
-		/* fallthrough */
-	case SLJIT_EQUAL_F64:
-	case SLJIT_NOT_EQUAL_F64:
-	case SLJIT_LESS_F64:
-	case SLJIT_GREATER_EQUAL_F64:
-	case SLJIT_UNORDERED_F64:
-	case SLJIT_ORDERED_F64:
+	} else {
+		invert = 0;
+
+		switch (type) {
+		case SLJIT_F_NOT_EQUAL:
+		case SLJIT_F_GREATER_EQUAL:
+		case SLJIT_F_GREATER:
+		case SLJIT_UNORDERED_OR_NOT_EQUAL:
+		case SLJIT_ORDERED_NOT_EQUAL:
+		case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		case SLJIT_ORDERED_GREATER_EQUAL:
+		case SLJIT_ORDERED_GREATER:
+		case SLJIT_UNORDERED_OR_GREATER:
+		case SLJIT_ORDERED:
+			invert = 1;
+			break;
+		}
+
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
 		FAIL_IF(push_inst(compiler, MFC1 | TA(dst_ar) | FS(TMP_FREG3), dst_ar));
 #else /* SLJIT_MIPS_REV < 6 */
@@ -2357,14 +3260,9 @@
 		FAIL_IF(push_inst(compiler, SRL | TA(dst_ar) | DA(dst_ar) | SH_IMM(23), dst_ar));
 		FAIL_IF(push_inst(compiler, ANDI | SA(dst_ar) | TA(dst_ar) | IMM(1), dst_ar));
 		src_ar = dst_ar;
-		break;
-
-	default:
-		src_ar = OTHER_FLAG;
-		break;
 	}
 
-	if (type & 0x1) {
+	if (invert) {
 		FAIL_IF(push_inst(compiler, XORI | SA(src_ar) | TA(dst_ar) | IMM(1), dst_ar));
 		src_ar = dst_ar;
 	}
@@ -2404,7 +3302,7 @@
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-		if (dst_reg & SLJIT_32)
+		if (type & SLJIT_32)
 			srcw = (sljit_s32)srcw;
 #endif
 		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), srcw));
@@ -2412,9 +3310,7 @@
 		srcw = 0;
 	}
 
-	dst_reg &= ~SLJIT_32;
-
-	switch (type & 0xff) {
+	switch (type & ~SLJIT_32) {
 	case SLJIT_EQUAL:
 		ins = MOVZ | TA(EQUAL_FLAG);
 		break;
@@ -2435,16 +3331,28 @@
 	case SLJIT_NOT_OVERFLOW:
 		ins = MOVZ | TA(OTHER_FLAG);
 		break;
-	case SLJIT_EQUAL_F64:
-	case SLJIT_LESS_F64:
-	case SLJIT_LESS_EQUAL_F64:
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_F_LESS:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
+	case SLJIT_UNORDERED:
 		ins = MOVT;
 		break;
-	case SLJIT_NOT_EQUAL_F64:
-	case SLJIT_GREATER_EQUAL_F64:
-	case SLJIT_GREATER_F64:
-	case SLJIT_ORDERED_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_F_GREATER:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_ORDERED:
 		ins = MOVF;
 		break;
 	default:
@@ -2460,6 +3368,308 @@
 #endif /* SLJIT_MIPS_REV >= 1 */
 }
 
+static sljit_s32 update_mem_addr(struct sljit_compiler *compiler, sljit_s32 *mem, sljit_sw *memw, sljit_s16 max_offset)
+{
+	sljit_s32 arg = *mem;
+	sljit_sw argw = *memw;
+
+	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+		argw &= 0x3;
+
+		if (SLJIT_UNLIKELY(argw)) {
+			FAIL_IF(push_inst(compiler, SLL_W | T(OFFS_REG(arg)) | D(TMP_REG1) | SH_IMM(argw), DR(TMP_REG1)));
+			FAIL_IF(push_inst(compiler, ADDU_W | S(TMP_REG1) | T(arg & REG_MASK) | D(TMP_REG1), DR(TMP_REG1)));
+		} else
+			FAIL_IF(push_inst(compiler, ADDU_W | S(arg & REG_MASK) | T(OFFS_REG(arg)) | D(TMP_REG1), DR(TMP_REG1)));
+
+		*mem = TMP_REG1;
+		*memw = 0;
+
+		return SLJIT_SUCCESS;
+	}
+
+	if (argw <= max_offset && argw >= SIMM_MIN) {
+		*mem = arg & REG_MASK;
+		return SLJIT_SUCCESS;
+	}
+
+	*mem = TMP_REG1;
+
+	if ((sljit_s16)argw > max_offset) {
+		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), argw));
+		*memw = 0;
+	} else {
+		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), TO_ARGW_HI(argw)));
+		*memw = (sljit_s16)argw;
+	}
+
+	if ((arg & REG_MASK) == 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ADDU_W | S(TMP_REG1) | T(arg & REG_MASK) | D(TMP_REG1), DR(TMP_REG1));
+}
+
+#if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
+#define MEM16_IMM_FIRST(memw) IMM((memw) + 1)
+#define MEM16_IMM_SECOND(memw) IMM(memw)
+#define MEMF64_FS_FIRST(freg) FS(freg)
+#define MEMF64_FS_SECOND(freg) (FS(freg) | ((sljit_ins)1 << 11))
+#else /* !SLJIT_LITTLE_ENDIAN */
+#define MEM16_IMM_FIRST(memw) IMM(memw)
+#define MEM16_IMM_SECOND(memw) IMM((memw) + 1)
+#define MEMF64_FS_FIRST(freg) (FS(freg) | ((sljit_ins)1 << 11))
+#define MEMF64_FS_SECOND(freg) FS(freg)
+#endif /* SLJIT_LITTLE_ENDIAN */
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+#define MEM_CHECK_UNALIGNED(type) ((type) & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16))
+#else /* !SLJIT_CONFIG_MIPS_32 */
+#define MEM_CHECK_UNALIGNED(type) ((type) & (SLJIT_MEM_UNALIGNED | SLJIT_MEM_UNALIGNED_16 | SLJIT_MEM_UNALIGNED_32))
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 op = type & 0xff;
+	sljit_s32 flags = 0;
+	sljit_ins ins;
+#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+	sljit_ins ins_right;
+#endif /* !(SLJIT_MIPS_REV >= 6) */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (reg & REG_PAIR_MASK) {
+		ADJUST_LOCAL_OFFSET(mem, memw);
+
+#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+		if (MEM_CHECK_UNALIGNED(type)) {
+			FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - (2 * SSIZE_OF(sw) - 1)));
+
+			if (!(type & SLJIT_MEM_STORE) && (mem == REG_PAIR_FIRST(reg) || mem == REG_PAIR_SECOND(reg))) {
+				FAIL_IF(push_inst(compiler, ADDU_W | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
+				mem = TMP_REG1;
+			}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+			ins = ((type & SLJIT_MEM_STORE) ? SWL : LWL) | S(mem);
+			ins_right = ((type & SLJIT_MEM_STORE) ? SWR : LWR) | S(mem);
+#else /* !SLJIT_CONFIG_MIPS_32 */
+			ins = ((type & SLJIT_MEM_STORE) ? SDL : LDL) | S(mem);
+			ins_right = ((type & SLJIT_MEM_STORE) ? SDR : LDR) | S(mem);
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg))));
+			FAIL_IF(push_inst(compiler, ins_right | T(REG_PAIR_FIRST(reg)) | IMM(memw + (SSIZE_OF(sw) - 1)), DR(REG_PAIR_FIRST(reg))));
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg))));
+			return push_inst(compiler, ins_right | T(REG_PAIR_SECOND(reg)) | IMM((memw + 2 * SSIZE_OF(sw) - 1)), DR(REG_PAIR_SECOND(reg)));
+		}
+#endif /* !(SLJIT_MIPS_REV >= 6) */
+
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - SSIZE_OF(sw)));
+
+		ins = ((type & SLJIT_MEM_STORE) ? STORE_W : LOAD_W) | S(mem);
+
+		if (!(type & SLJIT_MEM_STORE) && mem == REG_PAIR_FIRST(reg)) {
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg))));
+			return push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg)));
+		}
+
+		FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg))));
+		return push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg)));
+	}
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+	return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+#else /* !(SLJIT_MIPS_REV >= 6) */
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	switch (op) {
+	case SLJIT_MOV_U8:
+	case SLJIT_MOV_S8:
+		flags = BYTE_DATA;
+		if (!(type & SLJIT_MEM_STORE))
+			flags |= LOAD_DATA;
+
+		if (op == SLJIT_MOV_S8)
+			flags |= SIGNED_DATA;
+
+		return emit_op_mem(compiler, flags, DR(reg), mem, memw);
+
+	case SLJIT_MOV_U16:
+	case SLJIT_MOV_S16:
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - 1));
+		SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
+
+		if (type & SLJIT_MEM_STORE) {
+			FAIL_IF(push_inst(compiler, SRA_W | T(reg) | D(TMP_REG2) | SH_IMM(8), DR(TMP_REG2)));
+			FAIL_IF(push_inst(compiler, data_transfer_insts[BYTE_DATA] | S(mem) | T(TMP_REG2) | MEM16_IMM_FIRST(memw), MOVABLE_INS));
+			return push_inst(compiler, data_transfer_insts[BYTE_DATA] | S(mem) | T(reg) | MEM16_IMM_SECOND(memw), MOVABLE_INS);
+		}
+
+		flags = BYTE_DATA | LOAD_DATA;
+
+		if (op == SLJIT_MOV_S16)
+			flags |= SIGNED_DATA;
+
+		FAIL_IF(push_inst(compiler, data_transfer_insts[flags] | S(mem) | T(TMP_REG2) | MEM16_IMM_FIRST(memw), DR(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, data_transfer_insts[BYTE_DATA | LOAD_DATA] | S(mem) | T(reg) | MEM16_IMM_SECOND(memw), DR(reg)));
+		FAIL_IF(push_inst(compiler, SLL_W | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(8), DR(TMP_REG2)));
+		return push_inst(compiler, OR | S(reg) | T(TMP_REG2) | D(reg), DR(reg));
+
+	case SLJIT_MOV:
+	case SLJIT_MOV_P:
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+		if (type & SLJIT_MEM_UNALIGNED_32) {
+			flags = WORD_DATA;
+			if (!(type & SLJIT_MEM_STORE))
+				flags |= LOAD_DATA;
+
+			return emit_op_mem(compiler, flags, DR(reg), mem, memw);
+		}
+#else /* !SLJIT_CONFIG_MIPS_32 */
+		FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - 7));
+		SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
+
+		if (type & SLJIT_MEM_STORE) {
+			FAIL_IF(push_inst(compiler, SDL | S(mem) | T(reg) | IMM(memw), MOVABLE_INS));
+			return push_inst(compiler, SDR | S(mem) | T(reg) | IMM(memw + 7), MOVABLE_INS);
+		}
+
+		if (mem == reg) {
+			FAIL_IF(push_inst(compiler, ADDU_W | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
+			mem = TMP_REG1;
+		}
+
+		FAIL_IF(push_inst(compiler, LDL | S(mem) | T(reg) | IMM(memw), DR(reg)));
+		return push_inst(compiler, LDR | S(mem) | T(reg) | IMM(memw + 7), DR(reg));
+#endif /* SLJIT_CONFIG_MIPS_32 */
+	}
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - 3));
+	SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
+
+	if (type & SLJIT_MEM_STORE) {
+		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(reg) | IMM(memw), MOVABLE_INS));
+		return push_inst(compiler, SWR | S(mem) | T(reg) | IMM(memw + 3), MOVABLE_INS);
+	}
+
+	if (mem == reg) {
+		FAIL_IF(push_inst(compiler, ADDU_W | S(mem) | TA(0) | D(TMP_REG1), DR(TMP_REG1)));
+		mem = TMP_REG1;
+	}
+
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(reg) | IMM(memw), DR(reg)));
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	return push_inst(compiler, LWR | S(mem) | T(reg) | IMM(memw + 3), DR(reg));
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(reg) | IMM(memw + 3), DR(reg)));
+
+	if (op != SLJIT_MOV_U32)
+		return SLJIT_SUCCESS;
+
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	return push_inst(compiler, DINSU | T(reg) | SA(0) | (31 << 11) | (0 << 11), DR(reg));
+#else  /* SLJIT_MIPS_REV < 1 */
+	FAIL_IF(push_inst(compiler, DSLL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg)));
+	return push_inst(compiler, DSRL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg));
+#endif /* SLJIT_MIPS_REV >= 2 */
+#endif /* SLJIT_CONFIG_MIPS_32 */
+#endif /* SLJIT_MIPS_REV >= 6 */
+}
+
+#if !(defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+
+	FAIL_IF(update_mem_addr(compiler, &mem, &memw, SIMM_MAX - (type & SLJIT_32) ? 3 : 7));
+	SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
+
+	if (type & SLJIT_MEM_STORE) {
+		if (type & SLJIT_32) {
+			FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+			FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+			FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM(memw), MOVABLE_INS));
+			return push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), MOVABLE_INS);
+		}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+		FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | MEMF64_FS_FIRST(freg), DR(TMP_REG2)));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM(memw), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), MOVABLE_INS));
+
+		FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | MEMF64_FS_SECOND(freg), DR(TMP_REG2)));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM(memw + 4), MOVABLE_INS));
+		return push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM(memw + 7), MOVABLE_INS);
+#else /* !SLJIT_CONFIG_MIPS_32 */
+		FAIL_IF(push_inst(compiler, MFC1 | (1 << 21) | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+		FAIL_IF(push_inst(compiler, SDL | S(mem) | T(TMP_REG2) | IMM(memw), MOVABLE_INS));
+		return push_inst(compiler, SDR | S(mem) | T(TMP_REG2) | IMM(memw + 7), MOVABLE_INS);
+#endif /* SLJIT_CONFIG_MIPS_32 */
+	}
+
+	if (type & SLJIT_32) {
+		FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM(memw), DR(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), DR(TMP_REG2)));
+
+		FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | FS(freg), MOVABLE_INS));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+		return SLJIT_SUCCESS;
+	}
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM(memw), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | MEMF64_FS_FIRST(freg), MOVABLE_INS));
+
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM(memw + 4), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM(memw + 7), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | MEMF64_FS_SECOND(freg), MOVABLE_INS));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	FAIL_IF(push_inst(compiler, LDL | S(mem) | T(TMP_REG2) | IMM(memw), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LDR | S(mem) | T(TMP_REG2) | IMM(memw + 7), DR(TMP_REG2)));
+
+	FAIL_IF(push_inst(compiler, MTC1 | (1 << 21) | T(TMP_REG2) | FS(freg), MOVABLE_INS));
+#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif
+#endif /* SLJIT_CONFIG_MIPS_32 */
+	return SLJIT_SUCCESS;
+}
+
+#endif /* !SLJIT_MIPS_REV || SLJIT_MIPS_REV < 6 */
+
+#undef MEM16_IMM_FIRST
+#undef MEM16_IMM_SECOND
+#undef MEMF64_FS_FIRST
+#undef MEMF64_FS_SECOND
+#undef MEM_CHECK_UNALIGNED
+
+#undef TO_ARGW_HI
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
 {
 	struct sljit_const *const_;
@@ -2477,7 +3687,7 @@
 	PTR_FAIL_IF(emit_const(compiler, dst_r, init_value));
 
 	if (dst & SLJIT_MEM)
-		PTR_FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, dst, dstw, TMP_REG1, 0, TMP_REG2, 0));
+		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, DR(TMP_REG2), dst, dstw));
 
 	return const_;
 }
@@ -2496,15 +3706,15 @@
 	set_put_label(put_label, compiler, 0);
 
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-	PTR_FAIL_IF(emit_const(compiler, dst_r, 0));
-#else
 	PTR_FAIL_IF(push_inst(compiler, (sljit_ins)dst_r, UNMOVABLE_INS));
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	compiler->size += 1;
+#else
 	compiler->size += 5;
 #endif
 
 	if (dst & SLJIT_MEM)
-		PTR_FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, dst, dstw, TMP_REG1, 0, TMP_REG2, 0));
+		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, DR(TMP_REG2), dst, dstw));
 
 	return put_label;
 }
diff --git a/src/sljit/sljitNativePPC_32.c b/src/sljit/sljitNativePPC_32.c
index 95fe6bb..9449e4b 100644
--- a/src/sljit/sljitNativePPC_32.c
+++ b/src/sljit/sljitNativePPC_32.c
@@ -38,12 +38,15 @@
 	return (imm & 0xffff) ? push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm)) : SLJIT_SUCCESS;
 }
 
+/* Simplified mnemonics: clrlwi. */
 #define INS_CLEAR_LEFT(dst, src, from) \
-	(RLWINM | S(src) | A(dst) | ((from) << 6) | (31 << 1))
+	(RLWINM | S(src) | A(dst) | RLWI_MBE(from, 31))
 
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_s32 src1, sljit_s32 src2)
 {
+	sljit_u32 imm;
+
 	switch (op) {
 	case SLJIT_MOV:
 	case SLJIT_MOV_U32:
@@ -90,6 +93,16 @@
 		SLJIT_ASSERT(src1 == TMP_REG1);
 		return push_inst(compiler, CNTLZW | S(src2) | A(dst));
 
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1);
+		FAIL_IF(push_inst(compiler, NEG | D(TMP_REG1) | A(src2)));
+		FAIL_IF(push_inst(compiler, AND | S(src2) | A(dst) | B(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, CNTLZW | S(dst) | A(dst)));
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG1) | A(dst) | IMM(-32)));
+		/* The highest bits are set, if dst < 32, zero otherwise. */
+		FAIL_IF(push_inst(compiler, SRWI(27) | S(TMP_REG1) | A(TMP_REG1)));
+		return push_inst(compiler, XOR | S(dst) | A(dst) | B(TMP_REG1));
+
 	case SLJIT_ADD:
 		if (flags & ALT_FORM1) {
 			/* Setting XER SO is not enough, CR SO is also needed. */
@@ -103,12 +116,14 @@
 			if (flags & ALT_FORM3)
 				return push_inst(compiler, ADDIS | D(dst) | A(src1) | compiler->imm);
 
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM4) {
-				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((compiler->imm >> 16) & 0xffff) + ((compiler->imm >> 15) & 0x1))));
+				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((imm >> 16) & 0xffff) + ((imm >> 15) & 0x1))));
 				src1 = dst;
 			}
 
-			return push_inst(compiler, ADDI | D(dst) | A(src1) | (compiler->imm & 0xffff));
+			return push_inst(compiler, ADDI | D(dst) | A(src1) | (imm & 0xffff));
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
@@ -208,8 +223,10 @@
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(imm >> 16));
 		}
 		return push_inst(compiler, OR | RC(flags) | S(src1) | A(dst) | B(src2));
 
@@ -224,34 +241,78 @@
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(imm >> 16));
 		}
 		return push_inst(compiler, XOR | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			compiler->imm &= 0x1f;
-			return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11) | ((31 - compiler->imm) << 1));
+			imm = compiler->imm & 0x1f;
+			return push_inst(compiler, SLWI(imm) | RC(flags) | S(src1) | A(dst));
 		}
+
+		if (op == SLJIT_MSHL) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, SLW | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			compiler->imm &= 0x1f;
-			return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (((32 - compiler->imm) & 0x1f) << 11) | (compiler->imm << 6) | (31 << 1));
+			imm = compiler->imm & 0x1f;
+			/* Since imm can be 0, SRWI() cannot be used. */
+			return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | RLWI_SH((32 - imm) & 0x1f) | RLWI_MBE(imm, 31));
 		}
+
+		if (op == SLJIT_MLSHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, SRW | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			compiler->imm &= 0x1f;
-			return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11));
+			imm = compiler->imm & 0x1f;
+			return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (imm << 11));
 		}
+
+		if (op == SLJIT_MASHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, SRAW | RC(flags) | S(src1) | A(dst) | B(src2));
+
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & ALT_FORM1) {
+			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
+			if (op == SLJIT_ROTR)
+				imm = (sljit_u32)(-(sljit_s32)imm);
+
+			imm &= 0x1f;
+			return push_inst(compiler, RLWINM | S(src1) | A(dst) | RLWI_SH(imm) | RLWI_MBE(0, 31));
+		}
+
+		if (op == SLJIT_ROTR) {
+			FAIL_IF(push_inst(compiler, SUBFIC | D(TMP_REG2) | A(src2) | 0));
+			src2 = TMP_REG2;
+		}
+
+		return push_inst(compiler, RLWNM | S(src1) | A(dst) | B(src2) | RLWI_MBE(0, 31));
 	}
 
 	SLJIT_UNREACHABLE();
@@ -277,8 +338,3 @@
 	inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
 	SLJIT_CACHE_FLUSH(inst, inst + 2);
 }
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
-{
-	sljit_set_jump_addr(addr, (sljit_uw)new_constant, executable_offset);
-}
diff --git a/src/sljit/sljitNativePPC_64.c b/src/sljit/sljitNativePPC_64.c
index d104f6d..8054910 100644
--- a/src/sljit/sljitNativePPC_64.c
+++ b/src/sljit/sljitNativePPC_64.c
@@ -35,8 +35,9 @@
 #error "Must implement count leading zeroes"
 #endif
 
-#define PUSH_RLDICR(reg, shift) \
-	push_inst(compiler, RLDI(reg, reg, 63 - shift, shift, 1))
+/* Computes SLDI(63 - shift). */
+#define PUSH_SLDI_NEG(reg, shift) \
+	push_inst(compiler, RLDICR | S(reg) | A(reg) | RLDI_SH(63 - shift) | RLDI_ME(shift))
 
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 reg, sljit_sw imm)
 {
@@ -66,14 +67,14 @@
 	if ((tmp & ~0xffff000000000000ul) == 0) {
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
 		shift += 15;
-		return PUSH_RLDICR(reg, shift);
+		return PUSH_SLDI_NEG(reg, shift);
 	}
 
 	if ((tmp & ~0xffffffff00000000ul) == 0) {
 		FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
 		FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | IMM(tmp >> 32)));
 		shift += 31;
-		return PUSH_RLDICR(reg, shift);
+		return PUSH_SLDI_NEG(reg, shift);
 	}
 
 	/* Cut out the 16 bit from immediate. */
@@ -82,13 +83,13 @@
 
 	if (tmp2 <= 0xffff) {
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
-		FAIL_IF(PUSH_RLDICR(reg, shift));
+		FAIL_IF(PUSH_SLDI_NEG(reg, shift));
 		return push_inst(compiler, ORI | S(reg) | A(reg) | (sljit_ins)tmp2);
 	}
 
 	if (tmp2 <= 0xffffffff) {
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | IMM(tmp >> 48)));
-		FAIL_IF(PUSH_RLDICR(reg, shift));
+		FAIL_IF(PUSH_SLDI_NEG(reg, shift));
 		FAIL_IF(push_inst(compiler, ORIS | S(reg) | A(reg) | (sljit_ins)(tmp2 >> 16)));
 		return (imm & 0xffff) ? push_inst(compiler, ORI | S(reg) | A(reg) | IMM(tmp2)) : SLJIT_SUCCESS;
 	}
@@ -100,22 +101,23 @@
 		FAIL_IF(push_inst(compiler, ADDI | D(reg) | A(0) | (sljit_ins)(tmp >> 48)));
 		shift2 += 15;
 		shift += (63 - shift2);
-		FAIL_IF(PUSH_RLDICR(reg, shift));
+		FAIL_IF(PUSH_SLDI_NEG(reg, shift));
 		FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | (sljit_ins)(tmp2 >> 48)));
-		return PUSH_RLDICR(reg, shift2);
+		return PUSH_SLDI_NEG(reg, shift2);
 	}
 
 	/* The general version. */
 	FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | (sljit_ins)((sljit_uw)imm >> 48)));
 	FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm >> 32)));
-	FAIL_IF(PUSH_RLDICR(reg, 31));
+	FAIL_IF(PUSH_SLDI_NEG(reg, 31));
 	FAIL_IF(push_inst(compiler, ORIS | S(reg) | A(reg) | IMM(imm >> 16)));
 	return push_inst(compiler, ORI | S(reg) | A(reg) | IMM(imm));
 }
 
-/* Simplified mnemonics: clrldi. */
-#define INS_CLEAR_LEFT(dst, src, from) \
-	(RLDICL | S(src) | A(dst) | ((from) << 6) | (1 << 5))
+#undef PUSH_SLDI_NEG
+
+#define CLRLDI(dst, src, n) \
+	(RLDICL | S(src) | A(dst) | RLDI_SH(0) | RLDI_MB(n))
 
 /* Sign extension for integer operations. */
 #define UN_EXTS() \
@@ -145,6 +147,8 @@
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
 	sljit_s32 dst, sljit_s32 src1, sljit_s32 src2)
 {
+	sljit_u32 imm;
+
 	switch (op) {
 	case SLJIT_MOV:
 	case SLJIT_MOV_P:
@@ -159,7 +163,7 @@
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 			if (op == SLJIT_MOV_S32)
 				return push_inst(compiler, EXTSW | S(src2) | A(dst));
-			return push_inst(compiler, INS_CLEAR_LEFT(dst, src2, 0));
+			return push_inst(compiler, CLRLDI(dst, src2, 32));
 		}
 		else {
 			SLJIT_ASSERT(dst == src2);
@@ -172,7 +176,7 @@
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 			if (op == SLJIT_MOV_S8)
 				return push_inst(compiler, EXTSB | S(src2) | A(dst));
-			return push_inst(compiler, INS_CLEAR_LEFT(dst, src2, 24));
+			return push_inst(compiler, CLRLDI(dst, src2, 56));
 		}
 		else if ((flags & REG_DEST) && op == SLJIT_MOV_S8)
 			return push_inst(compiler, EXTSB | S(src2) | A(dst));
@@ -187,7 +191,7 @@
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 			if (op == SLJIT_MOV_S16)
 				return push_inst(compiler, EXTSH | S(src2) | A(dst));
-			return push_inst(compiler, INS_CLEAR_LEFT(dst, src2, 16));
+			return push_inst(compiler, CLRLDI(dst, src2, 48));
 		}
 		else {
 			SLJIT_ASSERT(dst == src2);
@@ -201,22 +205,30 @@
 
 	case SLJIT_CLZ:
 		SLJIT_ASSERT(src1 == TMP_REG1);
-		if (flags & ALT_FORM1)
-			return push_inst(compiler, CNTLZW | S(src2) | A(dst));
-		return push_inst(compiler, CNTLZD | S(src2) | A(dst));
+		return push_inst(compiler, ((flags & ALT_FORM1) ? CNTLZW : CNTLZD) | S(src2) | A(dst));
+
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1);
+		FAIL_IF(push_inst(compiler, NEG | D(TMP_REG1) | A(src2)));
+		FAIL_IF(push_inst(compiler, AND | S(src2) | A(dst) | B(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, ((flags & ALT_FORM1) ? CNTLZW : CNTLZD) | S(dst) | A(dst)));
+		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG1) | A(dst) | IMM((flags & ALT_FORM1) ? -32 : -64)));
+		/* The highest bits are set, if dst < bit width, zero otherwise. */
+		FAIL_IF(push_inst(compiler, ((flags & ALT_FORM1) ? SRWI(27) : SRDI(58)) | S(TMP_REG1) | A(TMP_REG1)));
+		return push_inst(compiler, XOR | S(dst) | A(dst) | B(TMP_REG1));
 
 	case SLJIT_ADD:
 		if (flags & ALT_FORM1) {
 			if (flags & ALT_SIGN_EXT) {
-				FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, src1, 32, 31, 1)));
+				FAIL_IF(push_inst(compiler, SLDI(32) | S(src1) | A(TMP_REG1)));
 				src1 = TMP_REG1;
-				FAIL_IF(push_inst(compiler, RLDI(TMP_REG2, src2, 32, 31, 1)));
+				FAIL_IF(push_inst(compiler, SLDI(32) | S(src2) | A(TMP_REG2)));
 				src2 = TMP_REG2;
 			}
 			/* Setting XER SO is not enough, CR SO is also needed. */
 			FAIL_IF(push_inst(compiler, ADD | OE(ALT_SET_FLAGS) | RC(ALT_SET_FLAGS) | D(dst) | A(src1) | B(src2)));
 			if (flags & ALT_SIGN_EXT)
-				return push_inst(compiler, RLDI(dst, dst, 32, 32, 0));
+				return push_inst(compiler, SRDI(32) | S(dst) | A(dst));
 			return SLJIT_SUCCESS;
 		}
 
@@ -227,12 +239,14 @@
 			if (flags & ALT_FORM3)
 				return push_inst(compiler, ADDIS | D(dst) | A(src1) | compiler->imm);
 
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM4) {
-				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((compiler->imm >> 16) & 0xffff) + ((compiler->imm >> 15) & 0x1))));
+				FAIL_IF(push_inst(compiler, ADDIS | D(dst) | A(src1) | (((imm >> 16) & 0xffff) + ((imm >> 15) & 0x1))));
 				src1 = dst;
 			}
 
-			return push_inst(compiler, ADDI | D(dst) | A(src1) | (compiler->imm & 0xffff));
+			return push_inst(compiler, ADDI | D(dst) | A(src1) | (imm & 0xffff));
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
@@ -287,11 +301,11 @@
 		if (flags & ALT_FORM3) {
 			if (flags & ALT_SIGN_EXT) {
 				if (src1 != TMP_ZERO) {
-					FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, src1, 32, 31, 1)));
+					FAIL_IF(push_inst(compiler, SLDI(32) | S(src1) | A(TMP_REG1)));
 					src1 = TMP_REG1;
 				}
 				if (src2 != TMP_ZERO) {
-					FAIL_IF(push_inst(compiler, RLDI(TMP_REG2, src2, 32, 31, 1)));
+					FAIL_IF(push_inst(compiler, SLDI(32) | S(src2) | A(TMP_REG2)));
 					src2 = TMP_REG2;
 				}
 			}
@@ -303,7 +317,7 @@
 				FAIL_IF(push_inst(compiler, NEG | OE(ALT_SET_FLAGS) | RC(ALT_SET_FLAGS) | D(dst) | A(src2)));
 
 			if (flags & ALT_SIGN_EXT)
-				return push_inst(compiler, RLDI(dst, dst, 32, 32, 0));
+				return push_inst(compiler, SRDI(32) | S(dst) | A(dst));
 			return SLJIT_SUCCESS;
 		}
 
@@ -362,8 +376,10 @@
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, ORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, ORIS | S(dst) | A(dst) | IMM(imm >> 16));
 		}
 		return push_inst(compiler, OR | RC(flags) | S(src1) | A(dst) | B(src2));
 
@@ -378,46 +394,105 @@
 		}
 		if (flags & ALT_FORM3) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
-			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(compiler->imm)));
-			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(compiler->imm >> 16));
+			imm = compiler->imm;
+
+			FAIL_IF(push_inst(compiler, XORI | S(src1) | A(dst) | IMM(imm)));
+			return push_inst(compiler, XORIS | S(dst) | A(dst) | IMM(imm >> 16));
 		}
 		return push_inst(compiler, XOR | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM2) {
-				compiler->imm &= 0x1f;
-				return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11) | ((31 - compiler->imm) << 1));
+				imm &= 0x1f;
+				return push_inst(compiler, SLWI(imm) | RC(flags) | S(src1) | A(dst));
 			}
-			compiler->imm &= 0x3f;
-			return push_inst(compiler, RLDI(dst, src1, compiler->imm, 63 - compiler->imm, 1) | RC(flags));
+
+			imm &= 0x3f;
+			return push_inst(compiler, SLDI(imm) | RC(flags) | S(src1) | A(dst));
 		}
+
+		if (op == SLJIT_MSHL) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | ((flags & ALT_FORM2) ? 0x1f : 0x3f)));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, ((flags & ALT_FORM2) ? SLW : SLD) | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM2) {
-				compiler->imm &= 0x1f;
-				return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | (((32 - compiler->imm) & 0x1f) << 11) | (compiler->imm << 6) | (31 << 1));
+				imm &= 0x1f;
+				/* Since imm can be 0, SRWI() cannot be used. */
+				return push_inst(compiler, RLWINM | RC(flags) | S(src1) | A(dst) | RLWI_SH((32 - imm) & 0x1f) | RLWI_MBE(imm, 31));
 			}
-			compiler->imm &= 0x3f;
-			return push_inst(compiler, RLDI(dst, src1, 64 - compiler->imm, compiler->imm, 0) | RC(flags));
+
+			imm &= 0x3f;
+			/* Since imm can be 0, SRDI() cannot be used. */
+			return push_inst(compiler, RLDICL | RC(flags) | S(src1) | A(dst) | RLDI_SH((64 - imm) & 0x3f) | RLDI_MB(imm));
 		}
+
+		if (op == SLJIT_MLSHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | ((flags & ALT_FORM2) ? 0x1f : 0x3f)));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, ((flags & ALT_FORM2) ? SRW : SRD) | RC(flags) | S(src1) | A(dst) | B(src2));
 
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		if (flags & ALT_FORM1) {
 			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
 			if (flags & ALT_FORM2) {
-				compiler->imm &= 0x1f;
-				return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (compiler->imm << 11));
+				imm &= 0x1f;
+				return push_inst(compiler, SRAWI | RC(flags) | S(src1) | A(dst) | (imm << 11));
 			}
-			compiler->imm &= 0x3f;
-			return push_inst(compiler, SRADI | RC(flags) | S(src1) | A(dst) | ((compiler->imm & 0x1f) << 11) | ((compiler->imm & 0x20) >> 4));
+
+			imm &= 0x3f;
+			return push_inst(compiler, SRADI | RC(flags) | S(src1) | A(dst) | RLDI_SH(imm));
 		}
+
+		if (op == SLJIT_MASHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | ((flags & ALT_FORM2) ? 0x1f : 0x3f)));
+			src2 = TMP_REG2;
+		}
+
 		return push_inst(compiler, ((flags & ALT_FORM2) ? SRAW : SRAD) | RC(flags) | S(src1) | A(dst) | B(src2));
+
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & ALT_FORM1) {
+			SLJIT_ASSERT(src2 == TMP_REG2);
+			imm = compiler->imm;
+
+			if (op == SLJIT_ROTR)
+				imm = (sljit_u32)(-(sljit_s32)imm);
+
+			if (flags & ALT_FORM2) {
+				imm &= 0x1f;
+				return push_inst(compiler, RLWINM | S(src1) | A(dst) | RLWI_SH(imm) | RLWI_MBE(0, 31));
+			}
+
+			imm &= 0x3f;
+			return push_inst(compiler, RLDICL | S(src1) | A(dst) | RLDI_SH(imm));
+		}
+
+		if (op == SLJIT_ROTR) {
+			FAIL_IF(push_inst(compiler, SUBFIC | D(TMP_REG2) | A(src2) | 0));
+			src2 = TMP_REG2;
+		}
+
+		return push_inst(compiler, ((flags & ALT_FORM2) ? (RLWNM | RLWI_MBE(0, 31)) : (RLDCL | RLDI_MB(0))) | S(src1) | A(dst) | B(src2));
 	}
 
 	SLJIT_UNREACHABLE();
@@ -483,7 +558,7 @@
 {
 	FAIL_IF(push_inst(compiler, ADDIS | D(reg) | A(0) | IMM(init_value >> 48)));
 	FAIL_IF(push_inst(compiler, ORI | S(reg) | A(reg) | IMM(init_value >> 32)));
-	FAIL_IF(PUSH_RLDICR(reg, 31));
+	FAIL_IF(push_inst(compiler, SLDI(32) | S(reg) | A(reg)));
 	FAIL_IF(push_inst(compiler, ORIS | S(reg) | A(reg) | IMM(init_value >> 16)));
 	return push_inst(compiler, ORI | S(reg) | A(reg) | IMM(init_value));
 }
@@ -502,8 +577,3 @@
 	inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
 	SLJIT_CACHE_FLUSH(inst, inst + 5);
 }
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
-{
-	sljit_set_jump_addr(addr, (sljit_uw)new_constant, executable_offset);
-}
diff --git a/src/sljit/sljitNativePPC_common.c b/src/sljit/sljitNativePPC_common.c
index 8bfdc69..f387114 100644
--- a/src/sljit/sljitNativePPC_common.c
+++ b/src/sljit/sljitNativePPC_common.c
@@ -203,8 +203,13 @@
 #define OR		(HI(31) | LO(444))
 #define ORI		(HI(24))
 #define ORIS		(HI(25))
-#define RLDICL		(HI(30))
+#define RLDCL		(HI(30) | LO(8))
+#define RLDICL		(HI(30) | LO(0 << 1))
+#define RLDICR		(HI(30) | LO(1 << 1))
+#define RLDIMI		(HI(30) | LO(3 << 1))
+#define RLWIMI		(HI(20))
 #define RLWINM		(HI(21))
+#define RLWNM		(HI(23))
 #define SLD		(HI(31) | LO(27))
 #define SLW		(HI(31) | LO(24))
 #define SRAD		(HI(31) | LO(794))
@@ -233,9 +238,24 @@
 #define SIMM_MIN	(-0x8000)
 #define UIMM_MAX	(0xffff)
 
-#define RLDI(dst, src, sh, mb, type) \
-	(HI(30) | S(src) | A(dst) | ((sljit_ins)(type) << 2) | (((sljit_ins)(sh) & 0x1f) << 11) \
-	| (((sljit_ins)(sh) & 0x20) >> 4) | (((sljit_ins)(mb) & 0x1f) << 6) | ((sljit_ins)(mb) & 0x20))
+/* Shift helpers. */
+#define RLWI_SH(sh) ((sljit_ins)(sh) << 11)
+#define RLWI_MBE(mb, me) (((sljit_ins)(mb) << 6) | ((sljit_ins)(me) << 1))
+#define RLDI_SH(sh) ((((sljit_ins)(sh) & 0x1f) << 11) | (((sljit_ins)(sh) & 0x20) >> 4))
+#define RLDI_MB(mb) ((((sljit_ins)(mb) & 0x1f) << 6) | ((sljit_ins)(mb) & 0x20))
+#define RLDI_ME(me) RLDI_MB(me)
+
+#define SLWI(shift) (RLWINM | RLWI_SH(shift) | RLWI_MBE(0, 31 - (shift)))
+#define SLDI(shift) (RLDICR | RLDI_SH(shift) | RLDI_ME(63 - (shift)))
+/* shift > 0 */
+#define SRWI(shift) (RLWINM | RLWI_SH(32 - (shift)) | RLWI_MBE((shift), 31))
+#define SRDI(shift) (RLDICL | RLDI_SH(64 - (shift)) | RLDI_MB(shift))
+
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+#define SLWI_W(shift) SLWI(shift)
+#else /* !SLJIT_CONFIG_PPC_32 */
+#define SLWI_W(shift) SLDI(shift)
+#endif /* SLJIT_CONFIG_PPC_32 */
 
 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_function_context(void** func_ptr, struct sljit_function_context* context, sljit_uw addr, void* func)
@@ -368,10 +388,10 @@
 		else {
 			inst[0] = ORIS | S(TMP_ZERO) | A(reg) | IMM(addr >> 48);
 			inst[1] = ORI | S(reg) | A(reg) | IMM((addr >> 32) & 0xffff);
-			inst ++;
+			inst++;
 		}
 
-		inst[1] = RLDI(reg, reg, 32, 31, 1);
+		inst[1] = SLDI(32) | S(reg) | A(reg);
 		inst[2] = ORIS | S(reg) | A(reg) | IMM((addr >> 16) & 0xffff);
 		inst += 2;
 	}
@@ -379,7 +399,7 @@
 	inst[1] = ORI | S(reg) | A(reg) | IMM(addr & 0xffff);
 }
 
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
 {
@@ -497,8 +517,8 @@
 				}
 				next_addr = compute_next_addr(label, jump, const_, put_label);
 			}
-			code_ptr ++;
-			word_count ++;
+			code_ptr++;
+			word_count++;
 		} while (buf_ptr < buf_end);
 
 		buf = buf->next;
@@ -641,14 +661,23 @@
 	/* A saved register is set to a zero value. */
 	case SLJIT_HAS_ZERO_REGISTER:
 	case SLJIT_HAS_CLZ:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
 
+	case SLJIT_HAS_CTZ:
+		return 2;
+
 	default:
 		return 0;
 	}
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	return (type >= SLJIT_UNORDERED && type <= SLJIT_ORDERED_LESS_EQUAL);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Entry, exit                                                          */
 /* --------------------------------------------------------------------- */
@@ -715,13 +744,16 @@
 
 #define STACK_MAX_DISTANCE	(0x8000 - SSIZE_OF(sw) - LR_SAVE_OFFSET)
 
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 inp_flags, sljit_s32 reg,
+	sljit_s32 arg, sljit_sw argw, sljit_s32 tmp_reg);
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
 	sljit_s32 i, tmp, base, offset;
 	sljit_s32 word_arg_count = 0;
-	sljit_s32 saved_arg_count = 0;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	sljit_s32 arg_count = 0;
 #endif
@@ -730,8 +762,12 @@
 	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1)
+	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 0)
 		+ GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+
+	if (!(options & SLJIT_ENTER_REG_ARG))
+		local_size += SSIZE_OF(sw);
+
 	local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
 	compiler->local_size = local_size;
 
@@ -770,11 +806,13 @@
 		FAIL_IF(push_inst(compiler, STFD | FS(i) | A(base) | IMM(offset)));
 	}
 
-	offset -= SSIZE_OF(sw);
-	FAIL_IF(push_inst(compiler, STACK_STORE | S(TMP_ZERO) | A(base) | IMM(offset)));
+	if (!(options & SLJIT_ENTER_REG_ARG)) {
+		offset -= SSIZE_OF(sw);
+		FAIL_IF(push_inst(compiler, STACK_STORE | S(TMP_ZERO) | A(base) | IMM(offset)));
+	}
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) {
 		offset -= SSIZE_OF(sw);
 		FAIL_IF(push_inst(compiler, STACK_STORE | S(i) | A(base) | IMM(offset)));
 	}
@@ -785,9 +823,14 @@
 	}
 
 	FAIL_IF(push_inst(compiler, STACK_STORE | S(0) | A(base) | IMM(local_size + LR_SAVE_OFFSET)));
+
+	if (options & SLJIT_ENTER_REG_ARG)
+		return SLJIT_SUCCESS;
+
 	FAIL_IF(push_inst(compiler, ADDI | D(TMP_ZERO) | A(0) | 0));
 
 	arg_types >>= SLJIT_ARG_SHIFT;
+	saved_arg_count = 0;
 
 	while (arg_types > 0) {
 		if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64) {
@@ -829,14 +872,17 @@
 	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
-	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1)
+	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 0)
 		+ GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+
+	if (!(options & SLJIT_ENTER_REG_ARG))
+		local_size += SSIZE_OF(sw);
+
 	compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
 	return SLJIT_SUCCESS;
 }
 
-
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_s32 i, tmp, base, offset;
 	sljit_s32 local_size = compiler->local_size;
@@ -854,7 +900,8 @@
 	}
 
 	offset = local_size;
-	FAIL_IF(push_inst(compiler, STACK_LOAD | S(0) | A(base) | IMM(offset + LR_SAVE_OFFSET)));
+	if (!is_return_to)
+		FAIL_IF(push_inst(compiler, STACK_LOAD | S(0) | A(base) | IMM(offset + LR_SAVE_OFFSET)));
 
 	tmp = SLJIT_FS0 - compiler->fsaveds;
 	for (i = SLJIT_FS0; i > tmp; i--) {
@@ -867,11 +914,13 @@
 		FAIL_IF(push_inst(compiler, LFD | FS(i) | A(base) | IMM(offset)));
 	}
 
-	offset -= SSIZE_OF(sw);
-	FAIL_IF(push_inst(compiler, STACK_LOAD | S(TMP_ZERO) | A(base) | IMM(offset)));
+	if (!(compiler->options & SLJIT_ENTER_REG_ARG)) {
+		offset -= SSIZE_OF(sw);
+		FAIL_IF(push_inst(compiler, STACK_LOAD | S(TMP_ZERO) | A(base) | IMM(offset)));
+	}
 
 	tmp = SLJIT_S0 - compiler->saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options); i > tmp; i--) {
 		offset -= SSIZE_OF(sw);
 		FAIL_IF(push_inst(compiler, STACK_LOAD | S(i) | A(base) | IMM(offset)));
 	}
@@ -881,7 +930,8 @@
 		FAIL_IF(push_inst(compiler, STACK_LOAD | S(i) | A(base) | IMM(offset)));
 	}
 
-	push_inst(compiler, MTLR | S(0));
+	if (!is_return_to)
+		push_inst(compiler, MTLR | S(0));
 
 	if (local_size > 0)
 		return push_inst(compiler, ADDI | D(SLJIT_SP) | A(base) | IMM(local_size));
@@ -890,17 +940,40 @@
 	return push_inst(compiler, OR | S(base) | A(SLJIT_SP) | B(base));
 }
 
+#undef STACK_STORE
+#undef STACK_LOAD
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
 {
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 	return push_inst(compiler, BLR);
 }
 
-#undef STACK_STORE
-#undef STACK_LOAD
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_CALL_REG, src, srcw, TMP_CALL_REG));
+		src = TMP_CALL_REG;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, OR | S(src) | A(TMP_CALL_REG) | B(src)));
+		src = TMP_CALL_REG;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
 
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
@@ -1066,7 +1139,6 @@
 {
 	sljit_ins inst;
 	sljit_s32 offs_reg;
-	sljit_sw high_short;
 
 	/* Should work when (arg & REG_MASK) == 0. */
 	SLJIT_ASSERT(A(0) == 0);
@@ -1077,11 +1149,7 @@
 		offs_reg = OFFS_REG(arg);
 
 		if (argw != 0) {
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-			FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(arg)) | A(tmp_reg) | ((sljit_ins)argw << 11) | ((31 - (sljit_ins)argw) << 1)));
-#else
-			FAIL_IF(push_inst(compiler, RLDI(tmp_reg, OFFS_REG(arg), argw, 63 - argw, 1)));
-#endif
+			FAIL_IF(push_inst(compiler, SLWI_W(argw) | S(OFFS_REG(arg)) | A(tmp_reg)));
 			offs_reg = tmp_reg;
 		}
 
@@ -1089,7 +1157,7 @@
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 		SLJIT_ASSERT(!(inst & INT_ALIGNED));
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg & REG_MASK) | B(offs_reg));
 	}
@@ -1104,36 +1172,24 @@
 		inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | B(tmp_reg));
 	}
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 	if (argw <= SIMM_MAX && argw >= SIMM_MIN)
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | IMM(argw));
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	if (argw <= 0x7fff7fffl && argw >= -0x80000000l) {
-#endif
-
-		high_short = (sljit_s32)(argw + ((argw & 0x8000) << 1)) & ~0xffff;
-
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-		SLJIT_ASSERT(high_short && high_short <= 0x7fffffffl && high_short >= -0x80000000l);
-#else
-		SLJIT_ASSERT(high_short);
-#endif
-
-		FAIL_IF(push_inst(compiler, ADDIS | D(tmp_reg) | A(arg) | IMM(high_short >> 16)));
+#endif /* SLJIT_CONFIG_PPC_64 */
+		FAIL_IF(push_inst(compiler, ADDIS | D(tmp_reg) | A(arg) | IMM((argw + 0x8000) >> 16)));
 		return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(tmp_reg) | IMM(argw));
-
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	}
 
-	/* The rest is PPC-64 only. */
-
 	FAIL_IF(load_immediate(compiler, tmp_reg, argw));
 
 	inst = data_transfer_insts[(inp_flags | INDEXED) & MEM_MASK];
 	return push_inst(compiler, INST_CODE_AND_DST(inst, inp_flags, reg) | A(arg) | B(tmp_reg));
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 }
 
 static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 input_flags,
@@ -1273,11 +1329,7 @@
 	if (srcw == 0)
 		return push_inst(compiler, DCBT | A(src & REG_MASK) | B(OFFS_REG(src)));
 
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-	FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(src)) | A(TMP_REG1) | ((sljit_ins)srcw << 11) | ((31 - (sljit_ins)srcw) << 1)));
-#else
-	FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, OFFS_REG(src), srcw, 63 - srcw, 1)));
-#endif
+	FAIL_IF(push_inst(compiler, SLWI_W(srcw) | S(OFFS_REG(src)) | A(TMP_REG1)));
 	return push_inst(compiler, DCBT | A(src & REG_MASK) | B(TMP_REG1));
 }
 
@@ -1362,10 +1414,11 @@
 		return emit_op(compiler, SLJIT_NOT, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 
 	case SLJIT_CLZ:
+	case SLJIT_CTZ:
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-		return emit_op(compiler, SLJIT_CLZ, flags | (!(op_flags & SLJIT_32) ? 0 : ALT_FORM1), dst, dstw, TMP_REG1, 0, src, srcw);
+		return emit_op(compiler, op, flags | (!(op_flags & SLJIT_32) ? 0 : ALT_FORM1), dst, dstw, TMP_REG1, 0, src, srcw);
 #else
-		return emit_op(compiler, SLJIT_CLZ, flags, dst, dstw, TMP_REG1, 0, src, srcw);
+		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
 #endif
 	}
 
@@ -1626,7 +1679,7 @@
 				return emit_op(compiler, GET_OPCODE(op), flags | ALT_FORM2, dst, dstw, src2, src2w, TMP_REG2, 0);
 			}
 		}
-		if (GET_OPCODE(op) != SLJIT_AND) {
+		if (!HAS_FLAGS(op) && GET_OPCODE(op) != SLJIT_AND) {
 			/* Unlike or and xor, the and resets unwanted bits as well. */
 			if (TEST_UI_IMM(src2, src2w)) {
 				compiler->imm = (sljit_ins)src2w;
@@ -1640,8 +1693,13 @@
 		return emit_op(compiler, GET_OPCODE(op), flags, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 		if (op & SLJIT_32)
 			flags |= ALT_FORM2;
@@ -1663,10 +1721,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
 }
 
@@ -1674,6 +1729,102 @@
 #undef TEST_SUB_FORM2
 #undef TEST_SUB_FORM3
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 is_right;
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_PPC_64 */
+	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA;
+	sljit_sw bit_length = 32;
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	is_right = (GET_OPCODE(op) == SLJIT_LSHR || GET_OPCODE(op) == SLJIT_MLSHR);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_right ? SLJIT_ROTR : SLJIT_ROTL) | (op & SLJIT_32), src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	if (src2 & SLJIT_IMM) {
+		src2w &= bit_length - 1;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG2, src2, src2w, TMP_REG2));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG1, src1, src1w, TMP_REG1));
+		src1 = TMP_REG1;
+	} else if (src1 & SLJIT_IMM) {
+		FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_IMM) {
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+		if (!(op & SLJIT_32)) {
+			if (is_right) {
+				FAIL_IF(push_inst(compiler, SRDI(src2w) | S(src_dst) | A(src_dst)));
+				return push_inst(compiler, RLDIMI | S(src1) | A(src_dst) | RLDI_SH(64 - src2w) | RLDI_MB(0));
+			}
+
+			FAIL_IF(push_inst(compiler, SLDI(src2w) | S(src_dst) | A(src_dst)));
+			/* Computes SRDI(64 - src2w). */
+			FAIL_IF(push_inst(compiler, RLDICL | S(src1) | A(TMP_REG1) | RLDI_SH(src2w) | RLDI_MB(64 - src2w)));
+			return push_inst(compiler, OR | S(src_dst) | A(src_dst) | B(TMP_REG1));
+		}
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+		if (is_right) {
+			FAIL_IF(push_inst(compiler, SRWI(src2w) | S(src_dst) | A(src_dst)));
+			return push_inst(compiler, RLWIMI | S(src1) | A(src_dst) | RLWI_SH(32 - src2w) | RLWI_MBE(0, src2w - 1));
+		}
+
+		FAIL_IF(push_inst(compiler, SLWI(src2w) | S(src_dst) | A(src_dst)));
+		return push_inst(compiler, RLWIMI | S(src1) | A(src_dst) | RLWI_SH(src2w) | RLWI_MBE(32 - src2w, 31));
+	}
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	if (!(op & SLJIT_32)) {
+		if (GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR) {
+			FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x3f));
+			src2 = TMP_REG2;
+		}
+
+		FAIL_IF(push_inst(compiler, (is_right ? SRD : SLD) | S(src_dst) | A(src_dst) | B(src2)));
+		FAIL_IF(push_inst(compiler, (is_right ? SLDI(1) : SRDI(1)) | S(src1) | A(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, XORI | S(src2) | A(TMP_REG2) | 0x3f));
+		FAIL_IF(push_inst(compiler, (is_right ? SLD : SRD) | S(TMP_REG1) | A(TMP_REG1) | B(TMP_REG2)));
+		return push_inst(compiler, OR | S(src_dst) | A(src_dst) | B(TMP_REG1));
+	}
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	if (GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR) {
+		FAIL_IF(push_inst(compiler, ANDI | S(src2) | A(TMP_REG2) | 0x1f));
+		src2 = TMP_REG2;
+	}
+
+	FAIL_IF(push_inst(compiler, (is_right ? SRW : SLW) | S(src_dst) | A(src_dst) | B(src2)));
+	FAIL_IF(push_inst(compiler, (is_right ? SLWI(1) : SRWI(1)) | S(src1) | A(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, XORI | S(src2) | A(TMP_REG2) | 0x1f));
+	FAIL_IF(push_inst(compiler, (is_right ? SLW : SRW) | S(TMP_REG1) | A(TMP_REG1) | B(TMP_REG2)));
+	return push_inst(compiler, OR | S(src_dst) | A(src_dst) | B(TMP_REG1));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -1686,7 +1837,7 @@
 		if (FAST_IS_REG(src))
 			FAIL_IF(push_inst(compiler, MTLR | S(src)));
 		else {
-			FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_REG2, 0, TMP_REG1, 0, src, srcw));
+			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG2, src, srcw, TMP_REG2));
 			FAIL_IF(push_inst(compiler, MTLR | S(TMP_REG2)));
 		}
 
@@ -1782,11 +1933,7 @@
 	if (dst & OFFS_REG_MASK) {
 		dstw &= 0x3;
 		if (dstw) {
-#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
-			FAIL_IF(push_inst(compiler, RLWINM | S(OFFS_REG(dst)) | A(TMP_REG1) | ((sljit_ins)dstw << 11) | ((31 - (sljit_ins)dstw) << 1)));
-#else
-			FAIL_IF(push_inst(compiler, RLDI(TMP_REG1, OFFS_REG(dst), dstw, 63 - dstw, 1)));
-#endif
+			FAIL_IF(push_inst(compiler, SLWI_W(dstw) | S(OFFS_REG(dst)) | A(TMP_REG1)));
 			dstw = TMP_REG1;
 		}
 		else
@@ -1818,6 +1965,7 @@
 	if (src & SLJIT_IMM) {
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 			srcw = (sljit_s32)srcw;
+
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 	}
@@ -1863,7 +2011,7 @@
 	   The double precision format has exactly 53 bit precision, so the lower 32 bit represents
 	   the lower 32 bit of such value. The result of xor 2^31 is the same as adding 0x80000000
 	   to the input, which shifts it into the 0 - 0xffffffff range. To get the converted floating
-	   point value, we need to substract 2^53 + 2^31 from the constructed value. */
+	   point value, we need to subtract 2^53 + 2^31 from the constructed value. */
 	FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG2) | A(0) | 0x4330));
 	if (invert_sign)
 		FAIL_IF(push_inst(compiler, XORIS | S(src) | A(TMP_REG1) | 0x8000));
@@ -1899,7 +2047,21 @@
 		src2 = TMP_FREG2;
 	}
 
-	return push_inst(compiler, FCMPU | CRD(4) | FA(src1) | FB(src2));
+	FAIL_IF(push_inst(compiler, FCMPU | CRD(4) | FA(src1) | FB(src2)));
+
+	switch (GET_FLAG_TYPE(op)) {
+	case SLJIT_UNORDERED_OR_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+		return push_inst(compiler, CROR | ((4 + 2) << 21) | ((4 + 2) << 16) | ((4 + 3) << 11));
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+		return push_inst(compiler, CROR | ((4 + 0) << 21) | ((4 + 0) << 16) | ((4 + 3) << 11));
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_ORDERED_LESS_EQUAL:
+		return push_inst(compiler, CROR | ((4 + 1) << 21) | ((4 + 1) << 16) | ((4 + 3) << 11));
+	}
+
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
@@ -2076,38 +2238,50 @@
 	case SLJIT_SIG_LESS_EQUAL:
 		return (4 << 21) | (1 << 16);
 
-	case SLJIT_LESS_F64:
-		return (12 << 21) | ((4 + 0) << 16);
-
-	case SLJIT_GREATER_EQUAL_F64:
-		return (4 << 21) | ((4 + 0) << 16);
-
-	case SLJIT_GREATER_F64:
-		return (12 << 21) | ((4 + 1) << 16);
-
-	case SLJIT_LESS_EQUAL_F64:
-		return (4 << 21) | ((4 + 1) << 16);
-
 	case SLJIT_OVERFLOW:
 		return (12 << 21) | (3 << 16);
 
 	case SLJIT_NOT_OVERFLOW:
 		return (4 << 21) | (3 << 16);
 
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
+		return (12 << 21) | ((4 + 0) << 16);
+
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		return (4 << 21) | ((4 + 0) << 16);
+
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_UNORDERED_OR_GREATER:
+		return (12 << 21) | ((4 + 1) << 16);
+
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
+		return (4 << 21) | ((4 + 1) << 16);
+
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
 		return (12 << 21) | ((4 + 2) << 16);
 
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
 		return (4 << 21) | ((4 + 2) << 16);
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		return (12 << 21) | ((4 + 3) << 16);
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		return (4 << 21) | ((4 + 3) << 16);
 
 	default:
-		SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL_CDECL);
+		SLJIT_ASSERT(type >= SLJIT_JUMP && type <= SLJIT_CALL_REG_ARG);
 		return (20 << 21);
 	}
 }
@@ -2154,19 +2328,16 @@
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-	PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
 #endif
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
 }
 
@@ -2177,7 +2348,6 @@
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
 
 	if (FAST_IS_REG(src)) {
 #if (defined SLJIT_PASS_ENTRY_ADDR_TO_CALL && SLJIT_PASS_ENTRY_ADDR_TO_CALL)
@@ -2204,9 +2374,9 @@
 
 		FAIL_IF(emit_const(compiler, TMP_CALL_REG, 0));
 		src_r = TMP_CALL_REG;
-	}
-	else {
-		FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_CALL_REG, 0, TMP_REG1, 0, src, srcw));
+	} else {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_CALL_REG, src, srcw, TMP_CALL_REG));
 		src_r = TMP_CALL_REG;
 	}
 
@@ -2225,29 +2395,26 @@
 
 	if (src & SLJIT_MEM) {
 		ADJUST_LOCAL_OFFSET(src, srcw);
-		FAIL_IF(emit_op(compiler, SLJIT_MOV, WORD_DATA, TMP_CALL_REG, 0, TMP_REG1, 0, src, srcw));
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_CALL_REG, src, srcw, TMP_CALL_REG));
 		src = TMP_CALL_REG;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, OR | S(src) | A(TMP_CALL_REG) | B(src)));
 			src = TMP_CALL_REG;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP;
 	}
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-	FAIL_IF(call_with_args(compiler, arg_types, &src));
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		FAIL_IF(call_with_args(compiler, arg_types, &src));
 #endif
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
@@ -2279,7 +2446,7 @@
 	bit = 0;
 	from_xer = 0;
 
-	switch (type & 0xff) {
+	switch (type) {
 	case SLJIT_LESS:
 	case SLJIT_SIG_LESS:
 		break;
@@ -2332,38 +2499,50 @@
 		invert = (compiler->status_flags_state & SLJIT_CURRENT_FLAGS_ADD) != 0;
 		break;
 
-	case SLJIT_LESS_F64:
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
 		bit = 4 + 0;
 		break;
 
-	case SLJIT_GREATER_EQUAL_F64:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
 		bit = 4 + 0;
 		invert = 1;
 		break;
 
-	case SLJIT_GREATER_F64:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_UNORDERED_OR_GREATER:
 		bit = 4 + 1;
 		break;
 
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		bit = 4 + 1;
 		invert = 1;
 		break;
 
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
 		bit = 4 + 2;
 		break;
 
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
 		bit = 4 + 2;
 		invert = 1;
 		break;
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		bit = 4 + 3;
 		break;
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		bit = 4 + 3;
 		invert = 1;
 		break;
@@ -2374,7 +2553,8 @@
 	}
 
 	FAIL_IF(push_inst(compiler, (from_xer ? MFXER : MFCR) | D(reg)));
-	FAIL_IF(push_inst(compiler, RLWINM | S(reg) | A(reg) | ((1 + bit) << 11) | (31 << 6) | (31 << 1)));
+	/* Simplified mnemonics: extrwi. */
+	FAIL_IF(push_inst(compiler, RLWINM | S(reg) | A(reg) | RLWI_SH(1 + bit) | RLWI_MBE(31, 31)));
 
 	if (invert)
 		FAIL_IF(push_inst(compiler, XORI | S(reg) | A(reg) | 0x1));
@@ -2385,10 +2565,8 @@
 		return emit_op_mem(compiler, input_flags, reg, dst, dstw, TMP_REG1);
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
+
 	if (dst & SLJIT_MEM)
 		return sljit_emit_op2(compiler, saved_op, dst, saved_dstw, TMP_REG1, 0, TMP_REG2, 0);
 	return sljit_emit_op2(compiler, saved_op, dst, 0, dst, 0, TMP_REG2, 0);
@@ -2404,15 +2582,94 @@
 	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);;
 }
 
+#if (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
+
+#define EMIT_MEM_LOAD_IMM(inst, mem, memw) \
+	((sljit_s16)(memw) > SIMM_MAX - SSIZE_OF(sw))
+
+#else /* !SLJIT_CONFIG_PPC_32 */
+
+#define EMIT_MEM_LOAD_IMM(inst, mem, memw) \
+	((((inst) & INT_ALIGNED) && ((memw) & 0x3) != 0) \
+		|| ((sljit_s16)(memw) > SIMM_MAX - SSIZE_OF(sw)) \
+		|| ((memw) > 0x7fff7fffl || (memw) < -0x80000000l)) \
+
+#endif /* SLJIT_CONFIG_PPC_32 */
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
+	sljit_ins inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	inst = data_transfer_insts[WORD_DATA | ((type & SLJIT_MEM_STORE) ? 0 : LOAD_DATA)];
+
+	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+		memw &= 0x3;
+
+		if (memw != 0) {
+			FAIL_IF(push_inst(compiler, SLWI_W(memw) | S(OFFS_REG(mem)) | A(TMP_REG1)));
+			FAIL_IF(push_inst(compiler, ADD | D(TMP_REG1) | A(TMP_REG1) | B(mem & REG_MASK)));
+		} else
+			FAIL_IF(push_inst(compiler, ADD | D(TMP_REG1) | A(mem & REG_MASK) | B(OFFS_REG(mem))));
+
+		mem = TMP_REG1;
+		memw = 0;
+	} else {
+		if (EMIT_MEM_LOAD_IMM(inst, mem, memw)) {
+			if ((mem & REG_MASK) != 0) {
+				SLJIT_SKIP_CHECKS(compiler);
+				FAIL_IF(sljit_emit_op2(compiler, SLJIT_ADD, TMP_REG1, 0, mem & REG_MASK, 0, SLJIT_IMM, memw));
+			} else
+				FAIL_IF(load_immediate(compiler, TMP_REG1, memw));
+
+			memw = 0;
+			mem = TMP_REG1;
+		} else if (memw > SIMM_MAX || memw < SIMM_MIN) {
+			FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG1) | A(mem & REG_MASK) | IMM((memw + 0x8000) >> 16)));
+
+			memw &= 0xffff;
+			mem = TMP_REG1;
+		} else {
+			memw &= 0xffff;
+			mem &= REG_MASK;
+		}
+	}
+
+	SLJIT_ASSERT((memw >= 0 && memw <= SIMM_MAX - SSIZE_OF(sw)) || (memw >= 0x8000 && memw <= 0xffff));
+
+#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	inst &= (sljit_ins)~INT_ALIGNED;
+#endif /* SLJIT_CONFIG_PPC_64 */
+
+	if (!(type & SLJIT_MEM_STORE) && mem == REG_PAIR_FIRST(reg)) {
+		FAIL_IF(push_inst(compiler, inst | D(REG_PAIR_SECOND(reg)) | A(mem) | IMM(memw + SSIZE_OF(sw))));
+		return push_inst(compiler, inst | D(REG_PAIR_FIRST(reg)) | A(mem) | IMM(memw));
+	}
+
+	FAIL_IF(push_inst(compiler, inst | D(REG_PAIR_FIRST(reg)) | A(mem) | IMM(memw)));
+	return push_inst(compiler, inst | D(REG_PAIR_SECOND(reg)) | A(mem) | IMM(memw + SSIZE_OF(sw)));
+}
+
+#undef EMIT_MEM_LOAD_IMM
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem_update(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
 	sljit_s32 mem_flags;
 	sljit_ins inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+	CHECK(check_sljit_emit_mem_update(compiler, type, reg, mem, memw));
 
 	if (type & SLJIT_MEM_POST)
 		return SLJIT_ERR_UNSUPPORTED;
@@ -2500,7 +2757,7 @@
 	return SLJIT_SUCCESS;
 }
 
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw)
 {
@@ -2508,7 +2765,7 @@
 	sljit_ins inst;
 
 	CHECK_ERROR();
-	CHECK(check_sljit_emit_fmem(compiler, type, freg, mem, memw));
+	CHECK(check_sljit_emit_fmem_update(compiler, type, freg, mem, memw));
 
 	if (type & SLJIT_MEM_POST)
 		return SLJIT_ERR_UNSUPPORTED;
@@ -2587,3 +2844,8 @@
 
 	return put_label;
 }
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
+{
+	sljit_set_jump_addr(addr, (sljit_uw)new_constant, executable_offset);
+}
diff --git a/src/sljit/sljitNativeRISCV_32.c b/src/sljit/sljitNativeRISCV_32.c
new file mode 100644
index 0000000..b38e692
--- /dev/null
+++ b/src/sljit/sljitNativeRISCV_32.c
@@ -0,0 +1,73 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r, sljit_sw imm, sljit_s32 tmp_r)
+{
+	SLJIT_UNUSED_ARG(tmp_r);
+	SLJIT_ASSERT(dst_r != tmp_r);
+
+	if (imm <= SIMM_MAX && imm >= SIMM_MIN)
+		return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm));
+
+	if (imm & 0x800)
+		imm += 0x1000;
+
+	FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)(imm & ~0xfff)));
+
+	if ((imm & 0xfff) == 0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ADDI | RD(dst_r) | RS1(dst_r) | IMM_I(imm));
+}
+
+static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins)
+{
+	if ((init_value & 0x800) != 0)
+		init_value += 0x1000;
+
+	FAIL_IF(push_inst(compiler, LUI | RD(dst) | (sljit_ins)(init_value & ~0xfff)));
+	return push_inst(compiler, last_ins | RS1(dst) | IMM_I(init_value));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
+{
+	sljit_ins *inst = (sljit_ins*)addr;
+	SLJIT_UNUSED_ARG(executable_offset);
+
+	if ((new_target & 0x800) != 0)
+		new_target += 0x1000;
+
+	SLJIT_UPDATE_WX_FLAGS(inst, inst + 5, 0);
+
+	SLJIT_ASSERT((inst[0] & 0x7f) == LUI);
+	inst[0] = (inst[0] & 0xfff) | (sljit_ins)((sljit_sw)new_target & ~0xfff);
+	SLJIT_ASSERT((inst[1] & 0x707f) == ADDI || (inst[1] & 0x707f) == JALR);
+	inst[1] = (inst[1] & 0xfffff) | IMM_I(new_target);
+
+	SLJIT_UPDATE_WX_FLAGS(inst, inst + 5, 1);
+	inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
+	SLJIT_CACHE_FLUSH(inst, inst + 5);
+}
diff --git a/src/sljit/sljitNativeRISCV_64.c b/src/sljit/sljitNativeRISCV_64.c
new file mode 100644
index 0000000..32cec78
--- /dev/null
+++ b/src/sljit/sljitNativeRISCV_64.c
@@ -0,0 +1,183 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r, sljit_sw imm, sljit_s32 tmp_r)
+{
+	sljit_sw high;
+
+	SLJIT_ASSERT(dst_r != tmp_r);
+
+	if (imm <= SIMM_MAX && imm >= SIMM_MIN)
+		return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm));
+
+	if (imm <= 0x7fffffffl && imm >= S32_MIN) {
+		if (imm > S32_MAX) {
+			SLJIT_ASSERT((imm & 0x800) != 0);
+			FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)0x80000000u));
+			return push_inst(compiler, XORI | RD(dst_r) | RS1(dst_r) | IMM_I(imm));
+		}
+
+		if ((imm & 0x800) != 0)
+			imm += 0x1000;
+
+		FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)(imm & ~0xfff)));
+
+		if ((imm & 0xfff) == 0)
+			return SLJIT_SUCCESS;
+
+		return push_inst(compiler, ADDI | RD(dst_r) | RS1(dst_r) | IMM_I(imm));
+	}
+
+	/* Trailing zeroes could be used to produce shifted immediates. */
+
+	if (imm <= 0x7ffffffffffl && imm >= -0x80000000000l) {
+		high = imm >> 12;
+
+		if (imm & 0x800)
+			high = ~high;
+
+		if (high > S32_MAX) {
+			SLJIT_ASSERT((high & 0x800) != 0);
+			FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)0x80000000u));
+			FAIL_IF(push_inst(compiler, XORI | RD(dst_r) | RS1(dst_r) | IMM_I(high)));
+		} else {
+			if ((high & 0x800) != 0)
+				high += 0x1000;
+
+			FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)(high & ~0xfff)));
+
+			if ((high & 0xfff) != 0)
+				FAIL_IF(push_inst(compiler, ADDI | RD(dst_r) | RS1(dst_r) | IMM_I(high)));
+		}
+
+		FAIL_IF(push_inst(compiler, SLLI | RD(dst_r) | RS1(dst_r) | IMM_I(12)));
+
+		if ((imm & 0xfff) != 0)
+			return push_inst(compiler, XORI | RD(dst_r) | RS1(dst_r) | IMM_I(imm));
+
+		return SLJIT_SUCCESS;
+	}
+
+	high = imm >> 32;
+	imm = (sljit_s32)imm;
+
+	if ((imm & 0x80000000l) != 0)
+		high = ~high;
+
+	if (high <= 0x7ffff && high >= -0x80000) {
+		FAIL_IF(push_inst(compiler, LUI | RD(tmp_r) | (sljit_ins)(high << 12)));
+		high = 0x1000;
+	} else {
+		if ((high & 0x800) != 0)
+			high += 0x1000;
+
+		FAIL_IF(push_inst(compiler, LUI | RD(tmp_r) | (sljit_ins)(high & ~0xfff)));
+		high &= 0xfff;
+	}
+
+	if (imm <= SIMM_MAX && imm >= SIMM_MIN) {
+		FAIL_IF(push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm)));
+		imm = 0;
+	} else if (imm > S32_MAX) {
+		SLJIT_ASSERT((imm & 0x800) != 0);
+
+		FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)0x80000000u));
+		imm = 0x1000 | (imm & 0xfff);
+	} else {
+		if ((imm & 0x800) != 0)
+			imm += 0x1000;
+
+		FAIL_IF(push_inst(compiler, LUI | RD(dst_r) | (sljit_ins)(imm & ~0xfff)));
+		imm &= 0xfff;
+	}
+
+	if ((high & 0xfff) != 0)
+		FAIL_IF(push_inst(compiler, ADDI | RD(tmp_r) | RS1(tmp_r) | IMM_I(high)));
+
+	if (imm & 0x1000)
+		FAIL_IF(push_inst(compiler, XORI | RD(dst_r) | RS1(dst_r) | IMM_I(imm)));
+	else if (imm != 0)
+		FAIL_IF(push_inst(compiler, ADDI | RD(dst_r) | RS1(dst_r) | IMM_I(imm)));
+
+	FAIL_IF(push_inst(compiler, SLLI | RD(tmp_r) | RS1(tmp_r) | IMM_I((high & 0x1000) ? 20 : 32)));
+	return push_inst(compiler, XOR | RD(dst_r) | RS1(dst_r) | RS2(tmp_r));
+}
+
+static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins)
+{
+	sljit_sw high;
+
+	if ((init_value & 0x800) != 0)
+		init_value += 0x1000;
+
+	high = init_value >> 32;
+
+	if ((init_value & 0x80000000l) != 0)
+		high = ~high;
+
+	if ((high & 0x800) != 0)
+		high += 0x1000;
+
+	FAIL_IF(push_inst(compiler, LUI | RD(TMP_REG3) | (sljit_ins)(high & ~0xfff)));
+	FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG3) | RS1(TMP_REG3) | IMM_I(high)));
+	FAIL_IF(push_inst(compiler, LUI | RD(dst) | (sljit_ins)(init_value & ~0xfff)));
+	FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG3) | RS1(TMP_REG3) | IMM_I(32)));
+	FAIL_IF(push_inst(compiler, XOR | RD(dst) | RS1(dst) | RS2(TMP_REG3)));
+	return push_inst(compiler, last_ins | RS1(dst) | IMM_I(init_value));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
+{
+	sljit_ins *inst = (sljit_ins*)addr;
+	sljit_sw high;
+	SLJIT_UNUSED_ARG(executable_offset);
+
+	if ((new_target & 0x800) != 0)
+		new_target += 0x1000;
+
+	high = (sljit_sw)new_target >> 32;
+
+	if ((new_target & 0x80000000l) != 0)
+		high = ~high;
+
+	if ((high & 0x800) != 0)
+		high += 0x1000;
+
+	SLJIT_UPDATE_WX_FLAGS(inst, inst + 5, 0);
+
+	SLJIT_ASSERT((inst[0] & 0x7f) == LUI);
+	inst[0] = (inst[0] & 0xfff) | (sljit_ins)(high & ~0xfff);
+	SLJIT_ASSERT((inst[1] & 0x707f) == ADDI);
+	inst[1] = (inst[1] & 0xfffff) | IMM_I(high);
+	SLJIT_ASSERT((inst[2] & 0x7f) == LUI);
+	inst[2] = (inst[2] & 0xfff) | (sljit_ins)((sljit_sw)new_target & ~0xfff);
+	SLJIT_ASSERT((inst[5] & 0x707f) == ADDI || (inst[5] & 0x707f) == JALR);
+	inst[5] = (inst[5] & 0xfffff) | IMM_I(new_target);
+	SLJIT_UPDATE_WX_FLAGS(inst, inst + 5, 1);
+
+	inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
+	SLJIT_CACHE_FLUSH(inst, inst + 5);
+}
diff --git a/src/sljit/sljitNativeRISCV_common.c b/src/sljit/sljitNativeRISCV_common.c
new file mode 100644
index 0000000..58a48c6
--- /dev/null
+++ b/src/sljit/sljitNativeRISCV_common.c
@@ -0,0 +1,2762 @@
+/*
+ *    Stack-less Just-In-Time compiler
+ *
+ *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice, this list of
+ *      conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *      of conditions and the following disclaimer in the documentation and/or other materials
+ *      provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
+{
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	return "RISC-V-32" SLJIT_CPUINFO;
+#else /* !SLJIT_CONFIG_RISCV_32 */
+	return "RISC-V-64" SLJIT_CPUINFO;
+#endif /* SLJIT_CONFIG_RISCV_32 */
+}
+
+/* Length of an instruction word
+   Both for riscv-32 and riscv-64 */
+typedef sljit_u32 sljit_ins;
+
+#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
+#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
+#define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
+#define TMP_ZERO	0
+
+/* Flags are kept in volatile registers. */
+#define EQUAL_FLAG	(SLJIT_NUMBER_OF_REGISTERS + 5)
+#define RETURN_ADDR_REG	TMP_REG2
+#define OTHER_FLAG	(SLJIT_NUMBER_OF_REGISTERS + 6)
+
+#define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
+
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = {
+	0, 10, 11, 12, 13, 14, 15, 16, 17, 29, 30, 31, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 9, 8, 2, 6, 1, 7, 5, 28
+};
+
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
+	0, 10, 11, 12, 13, 14, 15, 16, 17, 2, 3, 4, 5, 6, 7, 28, 29, 30, 31, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 9, 8, 0, 1,
+};
+
+/* --------------------------------------------------------------------- */
+/*  Instrucion forms                                                     */
+/* --------------------------------------------------------------------- */
+
+#define RD(rd)		((sljit_ins)reg_map[rd] << 7)
+#define RS1(rs1)	((sljit_ins)reg_map[rs1] << 15)
+#define RS2(rs2)	((sljit_ins)reg_map[rs2] << 20)
+#define FRD(rd)		((sljit_ins)freg_map[rd] << 7)
+#define FRS1(rs1)	((sljit_ins)freg_map[rs1] << 15)
+#define FRS2(rs2)	((sljit_ins)freg_map[rs2] << 20)
+#define IMM_I(imm)	((sljit_ins)(imm) << 20)
+#define IMM_S(imm)	((((sljit_ins)(imm) & 0xfe0) << 20) | (((sljit_ins)(imm) & 0x1f) << 7))
+
+/* Represents funct(i) parts of the instructions. */
+#define OPC(o)		((sljit_ins)(o))
+#define F3(f)		((sljit_ins)(f) << 12)
+#define F12(f)		((sljit_ins)(f) << 20)
+#define F7(f)		((sljit_ins)(f) << 25)
+
+#define ADD		(F7(0x0) | F3(0x0) | OPC(0x33))
+#define ADDI		(F3(0x0) | OPC(0x13))
+#define AND		(F7(0x0) | F3(0x7) | OPC(0x33))
+#define ANDI		(F3(0x7) | OPC(0x13))
+#define AUIPC		(OPC(0x17))
+#define BEQ		(F3(0x0) | OPC(0x63))
+#define BNE		(F3(0x1) | OPC(0x63))
+#define BLT		(F3(0x4) | OPC(0x63))
+#define BGE		(F3(0x5) | OPC(0x63))
+#define BLTU		(F3(0x6) | OPC(0x63))
+#define BGEU		(F3(0x7) | OPC(0x63))
+#define DIV		(F7(0x1) | F3(0x4) | OPC(0x33))
+#define DIVU		(F7(0x1) | F3(0x5) | OPC(0x33))
+#define EBREAK		(F12(0x1) | F3(0x0) | OPC(0x73))
+#define FADD_S		(F7(0x0) | F3(0x7) | OPC(0x53))
+#define FDIV_S		(F7(0xc) | F3(0x7) | OPC(0x53))
+#define FEQ_S		(F7(0x50) | F3(0x2) | OPC(0x53))
+#define FLD		(F3(0x3) | OPC(0x7))
+#define FLE_S		(F7(0x50) | F3(0x0) | OPC(0x53))
+#define FLT_S		(F7(0x50) | F3(0x1) | OPC(0x53))
+#define FSD		(F3(0x3) | OPC(0x27))
+/* These conversion opcodes are partly defined. */
+#define FCVT_S_D	(F7(0x20) | OPC(0x53))
+#define FCVT_S_W	(F7(0x68) | OPC(0x53))
+#define FCVT_W_S	(F7(0x60) | F3(0x1) | OPC(0x53))
+#define FMUL_S		(F7(0x8) | F3(0x7) | OPC(0x53))
+#define FSGNJ_S		(F7(0x10) | F3(0x0) | OPC(0x53))
+#define FSGNJN_S	(F7(0x10) | F3(0x1) | OPC(0x53))
+#define FSGNJX_S	(F7(0x10) | F3(0x2) | OPC(0x53))
+#define FSUB_S		(F7(0x4) | F3(0x7) | OPC(0x53))
+#define JAL		(OPC(0x6f))
+#define JALR		(F3(0x0) | OPC(0x67))
+#define LD		(F3(0x3) | OPC(0x3))
+#define LUI		(OPC(0x37))
+#define LW		(F3(0x2) | OPC(0x3))
+#define MUL		(F7(0x1) | F3(0x0) | OPC(0x33))
+#define MULH		(F7(0x1) | F3(0x1) | OPC(0x33))
+#define MULHU		(F7(0x1) | F3(0x3) | OPC(0x33))
+#define OR		(F7(0x0) | F3(0x6) | OPC(0x33))
+#define ORI		(F3(0x6) | OPC(0x13))
+#define REM		(F7(0x1) | F3(0x6) | OPC(0x33))
+#define REMU		(F7(0x1) | F3(0x7) | OPC(0x33))
+#define SD		(F3(0x3) | OPC(0x23))
+#define SLL		(F7(0x0) | F3(0x1) | OPC(0x33))
+#define SLLI		(IMM_I(0x0) | F3(0x1) | OPC(0x13))
+#define SLT		(F7(0x0) | F3(0x2) | OPC(0x33))
+#define SLTI		(F3(0x2) | OPC(0x13))
+#define SLTU		(F7(0x0) | F3(0x3) | OPC(0x33))
+#define SLTUI		(F3(0x3) | OPC(0x13))
+#define SRL		(F7(0x0) | F3(0x5) | OPC(0x33))
+#define SRLI		(IMM_I(0x0) | F3(0x5) | OPC(0x13))
+#define SRA		(F7(0x20) | F3(0x5) | OPC(0x33))
+#define SRAI		(IMM_I(0x400) | F3(0x5) | OPC(0x13))
+#define SUB		(F7(0x20) | F3(0x0) | OPC(0x33))
+#define SW		(F3(0x2) | OPC(0x23))
+#define XOR		(F7(0x0) | F3(0x4) | OPC(0x33))
+#define XORI		(F3(0x4) | OPC(0x13))
+
+#define SIMM_MAX	(0x7ff)
+#define SIMM_MIN	(-0x800)
+#define BRANCH_MAX	(0xfff)
+#define BRANCH_MIN	(-0x1000)
+#define JUMP_MAX	(0xfffff)
+#define JUMP_MIN	(-0x100000)
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+#define S32_MAX		(0x7ffff7ffl)
+#define S32_MIN		(-0x80000000l)
+#define S44_MAX		(0x7fffffff7ffl)
+#define S52_MAX		(0x7ffffffffffffl)
+#endif
+
+static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
+{
+	sljit_ins *ptr = (sljit_ins*)ensure_buf(compiler, sizeof(sljit_ins));
+	FAIL_IF(!ptr);
+	*ptr = ins;
+	compiler->size++;
+	return SLJIT_SUCCESS;
+}
+
+static sljit_s32 push_imm_s_inst(struct sljit_compiler *compiler, sljit_ins ins, sljit_sw imm)
+{
+	return push_inst(compiler, ins | IMM_S(imm));
+}
+
+static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_ins *code, sljit_sw executable_offset)
+{
+	sljit_sw diff;
+	sljit_uw target_addr;
+	sljit_ins *inst;
+
+	inst = (sljit_ins *)jump->addr;
+
+	if (jump->flags & SLJIT_REWRITABLE_JUMP)
+		goto exit;
+
+	if (jump->flags & JUMP_ADDR)
+		target_addr = jump->u.target;
+	else {
+		SLJIT_ASSERT(jump->flags & JUMP_LABEL);
+		target_addr = (sljit_uw)(code + jump->u.label->size) + (sljit_uw)executable_offset;
+	}
+
+	diff = (sljit_sw)target_addr - (sljit_sw)inst - executable_offset;
+
+	if (jump->flags & IS_COND) {
+		inst--;
+		diff += SSIZE_OF(ins);
+
+		if (diff >= BRANCH_MIN && diff <= BRANCH_MAX) {
+			jump->flags |= PATCH_B;
+			inst[0] = (inst[0] & 0x1fff07f) ^ 0x1000;
+			jump->addr = (sljit_uw)inst;
+			return inst;
+		}
+
+		inst++;
+		diff -= SSIZE_OF(ins);
+	}
+
+	if (diff >= JUMP_MIN && diff <= JUMP_MAX) {
+		if (jump->flags & IS_COND) {
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+			inst[-1] -= (sljit_ins)(1 * sizeof(sljit_ins)) << 7;
+#else
+			inst[-1] -= (sljit_ins)(5 * sizeof(sljit_ins)) << 7;
+#endif
+		}
+
+		jump->flags |= PATCH_J;
+		return inst;
+	}
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	if (diff >= S32_MIN && diff <= S32_MAX) {
+		if (jump->flags & IS_COND)
+			inst[-1] -= (sljit_ins)(4 * sizeof(sljit_ins)) << 7;
+
+		jump->flags |= PATCH_REL32;
+		inst[1] = inst[0];
+		return inst + 1;
+	}
+
+	if (target_addr <= (sljit_uw)S32_MAX) {
+		if (jump->flags & IS_COND)
+			inst[-1] -= (sljit_ins)(4 * sizeof(sljit_ins)) << 7;
+
+		jump->flags |= PATCH_ABS32;
+		inst[1] = inst[0];
+		return inst + 1;
+	}
+
+	if (target_addr <= S44_MAX) {
+		if (jump->flags & IS_COND)
+			inst[-1] -= (sljit_ins)(2 * sizeof(sljit_ins)) << 7;
+
+		jump->flags |= PATCH_ABS44;
+		inst[3] = inst[0];
+		return inst + 3;
+	}
+
+	if (target_addr <= S52_MAX) {
+		if (jump->flags & IS_COND)
+			inst[-1] -= (sljit_ins)(1 * sizeof(sljit_ins)) << 7;
+
+		jump->flags |= PATCH_ABS52;
+		inst[4] = inst[0];
+		return inst + 4;
+	}
+#endif
+
+exit:
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	inst[1] = inst[0];
+	return inst + 1;
+#else
+	inst[5] = inst[0];
+	return inst + 5;
+#endif
+}
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+
+static SLJIT_INLINE sljit_sw put_label_get_length(struct sljit_put_label *put_label, sljit_uw max_label)
+{
+	if (max_label <= (sljit_uw)S32_MAX) {
+		put_label->flags = PATCH_ABS32;
+		return 1;
+	}
+
+	if (max_label <= S44_MAX) {
+		put_label->flags = PATCH_ABS44;
+		return 3;
+	}
+
+	if (max_label <= S52_MAX) {
+		put_label->flags = PATCH_ABS52;
+		return 4;
+	}
+
+	put_label->flags = 0;
+	return 5;
+}
+
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+static SLJIT_INLINE void load_addr_to_reg(void *dst, sljit_u32 reg)
+{
+	struct sljit_jump *jump = NULL;
+	struct sljit_put_label *put_label;
+	sljit_uw flags;
+	sljit_ins *inst;
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_sw high;
+#endif
+	sljit_uw addr;
+
+	if (reg != 0) {
+		jump = (struct sljit_jump*)dst;
+		flags = jump->flags;
+		inst = (sljit_ins*)jump->addr;
+		addr = (flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target;
+	} else {
+		put_label = (struct sljit_put_label*)dst;
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+		flags = put_label->flags;
+#endif
+		inst = (sljit_ins*)put_label->addr;
+		addr = put_label->label->addr;
+		reg = *inst;
+	}
+
+	if ((addr & 0x800) != 0)
+		addr += 0x1000;
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	inst[0] = LUI | RD(reg) | (sljit_ins)((sljit_sw)addr & ~0xfff);
+#else /* !SLJIT_CONFIG_RISCV_32 */
+
+	if (flags & PATCH_ABS32) {
+		SLJIT_ASSERT(addr <= S32_MAX);
+		inst[0] = LUI | RD(reg) | (sljit_ins)((sljit_sw)addr & ~0xfff);
+	} else if (flags & PATCH_ABS44) {
+		high = (sljit_sw)addr >> 12;
+		SLJIT_ASSERT((sljit_uw)high <= 0x7fffffff);
+
+		if (high > S32_MAX) {
+			SLJIT_ASSERT((high & 0x800) != 0);
+			inst[0] = LUI | RD(reg) | (sljit_ins)0x80000000u;
+			inst[1] = XORI | RD(reg) | RS1(reg) | IMM_I(high);
+		} else {
+			if ((high & 0x800) != 0)
+				high += 0x1000;
+
+			inst[0] = LUI | RD(reg) | (sljit_ins)(high & ~0xfff);
+			inst[1] = ADDI | RD(reg) | RS1(reg) | IMM_I(high);
+		}
+
+		inst[2] = SLLI | RD(reg) | RS1(reg) | IMM_I(12);
+		inst += 2;
+	} else {
+		high = (sljit_sw)addr >> 32;
+
+		if ((addr & 0x80000000l) != 0)
+			high = ~high;
+
+		if ((high & 0x800) != 0)
+			high += 0x1000;
+
+		if (flags & PATCH_ABS52) {
+			SLJIT_ASSERT(addr <= S52_MAX);
+			inst[0] = LUI | RD(TMP_REG3) | (sljit_ins)(high << 12);
+		} else {
+			inst[0] = LUI | RD(TMP_REG3) | (sljit_ins)(high & ~0xfff);
+			inst[1] = ADDI | RD(TMP_REG3) | RS1(TMP_REG3) | IMM_I(high);
+			inst++;
+		}
+
+		inst[1] = LUI | RD(reg) | (sljit_ins)((sljit_sw)addr & ~0xfff);
+		inst[2] = SLLI | RD(TMP_REG3) | RS1(TMP_REG3) | IMM_I((flags & PATCH_ABS52) ? 20 : 32);
+		inst[3] = XOR | RD(reg) | RS1(reg) | RS2(TMP_REG3);
+		inst += 3;
+	}
+#endif /* !SLJIT_CONFIG_RISCV_32 */
+
+	if (jump != NULL) {
+		SLJIT_ASSERT((inst[1] & 0x707f) == JALR);
+		inst[1] = (inst[1] & 0xfffff) | IMM_I(addr);
+	} else
+		inst[1] = ADDI | RD(reg) | RS1(reg) | IMM_I(addr);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
+{
+	struct sljit_memory_fragment *buf;
+	sljit_ins *code;
+	sljit_ins *code_ptr;
+	sljit_ins *buf_ptr;
+	sljit_ins *buf_end;
+	sljit_uw word_count;
+	sljit_uw next_addr;
+	sljit_sw executable_offset;
+	sljit_uw addr;
+
+	struct sljit_label *label;
+	struct sljit_jump *jump;
+	struct sljit_const *const_;
+	struct sljit_put_label *put_label;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_generate_code(compiler));
+	reverse_buf(compiler);
+
+	code = (sljit_ins*)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins), compiler->exec_allocator_data);
+	PTR_FAIL_WITH_EXEC_IF(code);
+	buf = compiler->buf;
+
+	code_ptr = code;
+	word_count = 0;
+	next_addr = 0;
+	executable_offset = SLJIT_EXEC_OFFSET(code);
+
+	label = compiler->labels;
+	jump = compiler->jumps;
+	const_ = compiler->consts;
+	put_label = compiler->put_labels;
+
+	do {
+		buf_ptr = (sljit_ins*)buf->memory;
+		buf_end = buf_ptr + (buf->used_size >> 2);
+		do {
+			*code_ptr = *buf_ptr++;
+			if (next_addr == word_count) {
+				SLJIT_ASSERT(!label || label->size >= word_count);
+				SLJIT_ASSERT(!jump || jump->addr >= word_count);
+				SLJIT_ASSERT(!const_ || const_->addr >= word_count);
+				SLJIT_ASSERT(!put_label || put_label->addr >= word_count);
+
+				/* These structures are ordered by their address. */
+				if (label && label->size == word_count) {
+					label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
+					label->size = (sljit_uw)(code_ptr - code);
+					label = label->next;
+				}
+				if (jump && jump->addr == word_count) {
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+					word_count += 1;
+#else
+					word_count += 5;
+#endif
+					jump->addr = (sljit_uw)code_ptr;
+					code_ptr = detect_jump_type(jump, code, executable_offset);
+					jump = jump->next;
+				}
+				if (const_ && const_->addr == word_count) {
+					const_->addr = (sljit_uw)code_ptr;
+					const_ = const_->next;
+				}
+				if (put_label && put_label->addr == word_count) {
+					SLJIT_ASSERT(put_label->label);
+					put_label->addr = (sljit_uw)code_ptr;
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+					code_ptr += 1;
+					word_count += 1;
+#else
+					code_ptr += put_label_get_length(put_label, (sljit_uw)(SLJIT_ADD_EXEC_OFFSET(code, executable_offset) + put_label->label->size));
+					word_count += 5;
+#endif
+					put_label = put_label->next;
+				}
+				next_addr = compute_next_addr(label, jump, const_, put_label);
+			}
+			code_ptr++;
+			word_count++;
+		} while (buf_ptr < buf_end);
+
+		buf = buf->next;
+	} while (buf);
+
+	if (label && label->size == word_count) {
+		label->addr = (sljit_uw)code_ptr;
+		label->size = (sljit_uw)(code_ptr - code);
+		label = label->next;
+	}
+
+	SLJIT_ASSERT(!label);
+	SLJIT_ASSERT(!jump);
+	SLJIT_ASSERT(!const_);
+	SLJIT_ASSERT(!put_label);
+	SLJIT_ASSERT(code_ptr - code <= (sljit_sw)compiler->size);
+
+	jump = compiler->jumps;
+	while (jump) {
+		do {
+			if (!(jump->flags & (PATCH_B | PATCH_J | PATCH_REL32))) {
+				load_addr_to_reg(jump, TMP_REG1);
+				break;
+			}
+
+			addr = (jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target;
+			buf_ptr = (sljit_ins *)jump->addr;
+			addr -= (sljit_uw)SLJIT_ADD_EXEC_OFFSET(buf_ptr, executable_offset);
+
+			if (jump->flags & PATCH_B) {
+				SLJIT_ASSERT((sljit_sw)addr >= BRANCH_MIN && (sljit_sw)addr <= BRANCH_MAX);
+				addr = ((addr & 0x800) >> 4) | ((addr & 0x1e) << 7) | ((addr & 0x7e0) << 20) | ((addr & 0x1000) << 19);
+				buf_ptr[0] |= (sljit_ins)addr;
+				break;
+			}
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+			if (jump->flags & PATCH_REL32) {
+				SLJIT_ASSERT((sljit_sw)addr >= S32_MIN && (sljit_sw)addr <= S32_MAX);
+
+				if ((addr & 0x800) != 0)
+					addr += 0x1000;
+
+				buf_ptr[0] = AUIPC | RD(TMP_REG1) | (sljit_ins)((sljit_sw)addr & ~0xfff);
+				SLJIT_ASSERT((buf_ptr[1] & 0x707f) == JALR);
+				buf_ptr[1] |= IMM_I(addr);
+				break;
+			}
+#endif
+
+			SLJIT_ASSERT((sljit_sw)addr >= JUMP_MIN && (sljit_sw)addr <= JUMP_MAX);
+			addr = (addr & 0xff000) | ((addr & 0x800) << 9) | ((addr & 0x7fe) << 20) | ((addr & 0x100000) << 11);
+			buf_ptr[0] = JAL | RD((jump->flags & IS_CALL) ? RETURN_ADDR_REG : TMP_ZERO) | (sljit_ins)addr;
+		} while (0);
+		jump = jump->next;
+	}
+
+	put_label = compiler->put_labels;
+	while (put_label) {
+		load_addr_to_reg(put_label, 0);
+		put_label = put_label->next;
+	}
+
+	compiler->error = SLJIT_ERR_COMPILED;
+	compiler->executable_offset = executable_offset;
+	compiler->executable_size = (sljit_uw)(code_ptr - code) * sizeof(sljit_ins);
+
+	code = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
+	code_ptr = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
+
+	SLJIT_CACHE_FLUSH(code, code_ptr);
+	SLJIT_UPDATE_WX_FLAGS(code, code_ptr, 1);
+	return code;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
+{
+	switch (feature_type) {
+	case SLJIT_HAS_FPU:
+	case SLJIT_HAS_ZERO_REGISTER:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	return (type >= SLJIT_ORDERED_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
+}
+
+/* --------------------------------------------------------------------- */
+/*  Entry, exit                                                          */
+/* --------------------------------------------------------------------- */
+
+/* Creates an index in data_transfer_insts array. */
+#define LOAD_DATA	0x01
+#define WORD_DATA	0x00
+#define BYTE_DATA	0x02
+#define HALF_DATA	0x04
+#define INT_DATA	0x06
+#define SIGNED_DATA	0x08
+/* Separates integer and floating point registers */
+#define GPR_REG		0x0f
+#define DOUBLE_DATA	0x10
+#define SINGLE_DATA	0x12
+
+#define MEM_MASK	0x1f
+
+#define ARG_TEST	0x00020
+#define ALT_KEEP_CACHE	0x00040
+#define CUMULATIVE_OP	0x00080
+#define IMM_OP		0x00100
+#define MOVE_OP		0x00200
+#define SRC2_IMM	0x00400
+
+#define UNUSED_DEST	0x00800
+#define REG_DEST	0x01000
+#define REG1_SOURCE	0x02000
+#define REG2_SOURCE	0x04000
+#define SLOW_SRC1	0x08000
+#define SLOW_SRC2	0x10000
+#define SLOW_DEST	0x20000
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#define STACK_STORE	SW
+#define STACK_LOAD	LW
+#else
+#define STACK_STORE	SD
+#define STACK_LOAD	LD
+#endif
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#include "sljitNativeRISCV_32.c"
+#else
+#include "sljitNativeRISCV_64.c"
+#endif
+
+#define STACK_MAX_DISTANCE (-SIMM_MIN)
+
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw);
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
+	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
+	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
+{
+	sljit_s32 i, tmp, offset;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
+
+	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1);
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
+		if ((local_size & SSIZE_OF(sw)) != 0)
+			local_size += SSIZE_OF(sw);
+		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	}
+#else
+	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+#endif
+	local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
+	compiler->local_size = local_size;
+
+	if (local_size <= STACK_MAX_DISTANCE) {
+		/* Frequent case. */
+		FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(-local_size)));
+		offset = local_size - SSIZE_OF(sw);
+		local_size = 0;
+	} else {
+		FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(STACK_MAX_DISTANCE)));
+		local_size -= STACK_MAX_DISTANCE;
+
+		if (local_size > STACK_MAX_DISTANCE)
+			FAIL_IF(load_immediate(compiler, TMP_REG1, local_size, TMP_REG3));
+		offset = STACK_MAX_DISTANCE - SSIZE_OF(sw);
+	}
+
+	FAIL_IF(push_imm_s_inst(compiler, STACK_STORE | RS1(SLJIT_SP) | RS2(RETURN_ADDR_REG), offset));
+
+	tmp = SLJIT_S0 - saveds;
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) {
+		offset -= SSIZE_OF(sw);
+		FAIL_IF(push_imm_s_inst(compiler, STACK_STORE | RS1(SLJIT_SP) | RS2(i), offset));
+	}
+
+	for (i = scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
+		offset -= SSIZE_OF(sw);
+		FAIL_IF(push_imm_s_inst(compiler, STACK_STORE | RS1(SLJIT_SP) | RS2(i), offset));
+	}
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	/* This alignment is valid because offset is not used after storing FPU regs. */
+	if ((offset & SSIZE_OF(sw)) != 0)
+		offset -= SSIZE_OF(sw);
+#endif
+
+	tmp = SLJIT_FS0 - fsaveds;
+	for (i = SLJIT_FS0; i > tmp; i--) {
+		offset -= SSIZE_OF(f64);
+		FAIL_IF(push_imm_s_inst(compiler, FSD | RS1(SLJIT_SP) | FRS2(i), offset));
+	}
+
+	for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
+		offset -= SSIZE_OF(f64);
+		FAIL_IF(push_imm_s_inst(compiler, FSD | RS1(SLJIT_SP) | FRS2(i), offset));
+	}
+
+	if (local_size > STACK_MAX_DISTANCE)
+		FAIL_IF(push_inst(compiler, SUB | RD(SLJIT_SP) | RS1(SLJIT_SP) | RS2(TMP_REG1)));
+	else if (local_size > 0)
+		FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(-local_size)));
+
+	if (options & SLJIT_ENTER_REG_ARG)
+		return SLJIT_SUCCESS;
+
+	arg_types >>= SLJIT_ARG_SHIFT;
+	saved_arg_count = 0;
+	tmp = SLJIT_R0;
+
+	while (arg_types > 0) {
+		if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64) {
+			if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
+				FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_S0 - saved_arg_count) | RS1(tmp) | IMM_I(0)));
+				saved_arg_count++;
+			}
+			tmp++;
+		}
+
+		arg_types >>= SLJIT_ARG_SHIFT;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
+#undef STACK_MAX_DISTANCE
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
+	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
+	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
+	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
+
+	local_size += GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	if (fsaveds > 0 || fscratches >= SLJIT_FIRST_SAVED_FLOAT_REG) {
+		if ((local_size & SSIZE_OF(sw)) != 0)
+			local_size += SSIZE_OF(sw);
+		local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+	}
+#else
+	local_size += GET_SAVED_FLOAT_REGISTERS_SIZE(fscratches, fsaveds, sizeof(sljit_f64));
+#endif
+	compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 15) & ~0xf;
+
+	return SLJIT_SUCCESS;
+}
+
+#define STACK_MAX_DISTANCE (-SIMM_MIN - 16)
+
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
+{
+	sljit_s32 i, tmp, offset;
+	sljit_s32 local_size = compiler->local_size;
+
+	if (local_size > STACK_MAX_DISTANCE) {
+		local_size -= STACK_MAX_DISTANCE;
+
+		if (local_size > STACK_MAX_DISTANCE) {
+			FAIL_IF(load_immediate(compiler, TMP_REG2, local_size, TMP_REG3));
+			FAIL_IF(push_inst(compiler, ADD | RD(SLJIT_SP) | RS1(SLJIT_SP) | RS2(TMP_REG2)));
+		} else
+			FAIL_IF(push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(local_size)));
+
+		local_size = STACK_MAX_DISTANCE;
+	}
+
+	SLJIT_ASSERT(local_size > 0);
+
+	offset = local_size - SSIZE_OF(sw);
+	if (!is_return_to)
+		FAIL_IF(push_inst(compiler, STACK_LOAD | RD(RETURN_ADDR_REG) | RS1(SLJIT_SP) | IMM_I(offset)));
+
+	tmp = SLJIT_S0 - compiler->saveds;
+	for (i = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options); i > tmp; i--) {
+		offset -= SSIZE_OF(sw);
+		FAIL_IF(push_inst(compiler, STACK_LOAD | RD(i) | RS1(SLJIT_SP) | IMM_I(offset)));
+	}
+
+	for (i = compiler->scratches; i >= SLJIT_FIRST_SAVED_REG; i--) {
+		offset -= SSIZE_OF(sw);
+		FAIL_IF(push_inst(compiler, STACK_LOAD | RD(i) | RS1(SLJIT_SP) | IMM_I(offset)));
+	}
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	/* This alignment is valid because offset is not used after storing FPU regs. */
+	if ((offset & SSIZE_OF(sw)) != 0)
+		offset -= SSIZE_OF(sw);
+#endif
+
+	tmp = SLJIT_FS0 - compiler->fsaveds;
+	for (i = SLJIT_FS0; i > tmp; i--) {
+		offset -= SSIZE_OF(f64);
+		FAIL_IF(push_inst(compiler, FLD | FRD(i) | RS1(SLJIT_SP) | IMM_I(offset)));
+	}
+
+	for (i = compiler->fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
+		offset -= SSIZE_OF(f64);
+		FAIL_IF(push_inst(compiler, FLD | FRD(i) | RS1(SLJIT_SP) | IMM_I(offset)));
+	}
+
+	return push_inst(compiler, ADDI | RD(SLJIT_SP) | RS1(SLJIT_SP) | IMM_I(local_size));
+}
+
+#undef STACK_MAX_DISTANCE
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_void(compiler));
+
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
+	return push_inst(compiler, JALR | RD(TMP_ZERO) | RS1(RETURN_ADDR_REG) | IMM_I(0));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
+		src = TMP_REG1;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(src) | IMM_I(0)));
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
+/* --------------------------------------------------------------------- */
+/*  Operators                                                            */
+/* --------------------------------------------------------------------- */
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#define ARCH_32_64(a, b)	a
+#else
+#define ARCH_32_64(a, b)	b
+#endif
+
+static const sljit_ins data_transfer_insts[16 + 4] = {
+/* u w s */ ARCH_32_64(F3(0x2) | OPC(0x23) /* sw */, F3(0x3) | OPC(0x23) /* sd */),
+/* u w l */ ARCH_32_64(F3(0x2) | OPC(0x3) /* lw */, F3(0x3) | OPC(0x3) /* ld */),
+/* u b s */ F3(0x0) | OPC(0x23) /* sb */,
+/* u b l */ F3(0x4) | OPC(0x3) /* lbu */,
+/* u h s */ F3(0x1) | OPC(0x23) /* sh */,
+/* u h l */ F3(0x5) | OPC(0x3) /* lhu */,
+/* u i s */ F3(0x2) | OPC(0x23) /* sw */,
+/* u i l */ ARCH_32_64(F3(0x2) | OPC(0x3) /* lw */, F3(0x6) | OPC(0x3) /* lwu */),
+
+/* s w s */ ARCH_32_64(F3(0x2) | OPC(0x23) /* sw */, F3(0x3) | OPC(0x23) /* sd */),
+/* s w l */ ARCH_32_64(F3(0x2) | OPC(0x3) /* lw */, F3(0x3) | OPC(0x3) /* ld */),
+/* s b s */ F3(0x0) | OPC(0x23) /* sb */,
+/* s b l */ F3(0x0) | OPC(0x3) /* lb */,
+/* s h s */ F3(0x1) | OPC(0x23) /* sh */,
+/* s h l */ F3(0x1) | OPC(0x3) /* lh */,
+/* s i s */ F3(0x2) | OPC(0x23) /* sw */,
+/* s i l */ F3(0x2) | OPC(0x3) /* lw */,
+
+/* d   s */ F3(0x3) | OPC(0x27) /* fsd */,
+/* d   l */ F3(0x3) | OPC(0x7) /* fld */,
+/* s   s */ F3(0x2) | OPC(0x27) /* fsw */,
+/* s   l */ F3(0x2) | OPC(0x7) /* flw */,
+};
+
+#undef ARCH_32_64
+
+static sljit_s32 push_mem_inst(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 base, sljit_sw offset)
+{
+	sljit_ins ins;
+
+	SLJIT_ASSERT(FAST_IS_REG(base) && offset <= 0xfff && offset >= SIMM_MIN);
+
+	ins = data_transfer_insts[flags & MEM_MASK] | RS1(base);
+	if (flags & LOAD_DATA)
+		ins |= ((flags & MEM_MASK) <= GPR_REG ? RD(reg) : FRD(reg)) | IMM_I(offset);
+	else
+		ins |= ((flags & MEM_MASK) <= GPR_REG ? RS2(reg) : FRS2(reg)) | IMM_S(offset);
+
+	return push_inst(compiler, ins);
+}
+
+/* Can perform an operation using at most 1 instruction. */
+static sljit_s32 getput_arg_fast(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
+{
+
+	SLJIT_ASSERT(arg & SLJIT_MEM);
+
+	if (!(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN) {
+		/* Works for both absoulte and relative addresses. */
+		if (SLJIT_UNLIKELY(flags & ARG_TEST))
+			return 1;
+
+		FAIL_IF(push_mem_inst(compiler, flags, reg, arg & REG_MASK, argw));
+		return -1;
+	}
+	return 0;
+}
+
+#define TO_ARGW_HI(argw) (((argw) & ~0xfff) + (((argw) & 0x800) ? 0x1000 : 0))
+
+/* See getput_arg below.
+   Note: can_cache is called only for binary operators. */
+static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
+{
+	SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));
+
+	/* Simple operation except for updates. */
+	if (arg & OFFS_REG_MASK) {
+		argw &= 0x3;
+		next_argw &= 0x3;
+		if (argw && argw == next_argw && (arg == next_arg || (arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK)))
+			return 1;
+		return 0;
+	}
+
+	if (arg == next_arg) {
+		if (((next_argw - argw) <= SIMM_MAX && (next_argw - argw) >= SIMM_MIN)
+				|| TO_ARGW_HI(argw) == TO_ARGW_HI(next_argw))
+			return 1;
+		return 0;
+	}
+
+	return 0;
+}
+
+/* Emit the necessary instructions. See can_cache above. */
+static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
+{
+	sljit_s32 base = arg & REG_MASK;
+	sljit_s32 tmp_r = TMP_REG1;
+	sljit_sw offset, argw_hi;
+
+	SLJIT_ASSERT(arg & SLJIT_MEM);
+	if (!(next_arg & SLJIT_MEM)) {
+		next_arg = 0;
+		next_argw = 0;
+	}
+
+	/* Since tmp can be the same as base or offset registers,
+	 * these might be unavailable after modifying tmp. */
+	if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA))
+		tmp_r = reg;
+
+	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+		argw &= 0x3;
+
+		/* Using the cache. */
+		if (argw == compiler->cache_argw) {
+			if (arg == compiler->cache_arg)
+				return push_mem_inst(compiler, flags, reg, TMP_REG3, 0);
+
+			if ((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) {
+				if (arg == next_arg && argw == (next_argw & 0x3)) {
+					compiler->cache_arg = arg;
+					compiler->cache_argw = argw;
+					FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG3) | RS1(TMP_REG3) | RS2(base)));
+					return push_mem_inst(compiler, flags, reg, TMP_REG3, 0);
+				}
+				FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(TMP_REG3)));
+				return push_mem_inst(compiler, flags, reg, tmp_r, 0);
+			}
+		}
+
+		if (SLJIT_UNLIKELY(argw)) {
+			compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
+			compiler->cache_argw = argw;
+			FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG3) | RS1(OFFS_REG(arg)) | IMM_I(argw)));
+		}
+
+		if (arg == next_arg && argw == (next_argw & 0x3)) {
+			compiler->cache_arg = arg;
+			compiler->cache_argw = argw;
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG3) | RS1(base) | RS2(!argw ? OFFS_REG(arg) : TMP_REG3)));
+			tmp_r = TMP_REG3;
+		}
+		else
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(!argw ? OFFS_REG(arg) : TMP_REG3)));
+		return push_mem_inst(compiler, flags, reg, tmp_r, 0);
+	}
+
+	if (compiler->cache_arg == arg && argw - compiler->cache_argw <= SIMM_MAX && argw - compiler->cache_argw >= SIMM_MIN)
+		return push_mem_inst(compiler, flags, reg, TMP_REG3, argw - compiler->cache_argw);
+
+	if (compiler->cache_arg == SLJIT_MEM && (argw - compiler->cache_argw <= SIMM_MAX) && (argw - compiler->cache_argw >= SIMM_MIN)) {
+		offset = argw - compiler->cache_argw;
+	} else {
+		compiler->cache_arg = SLJIT_MEM;
+
+		argw_hi = TO_ARGW_HI(argw);
+
+		if (next_arg && next_argw - argw <= SIMM_MAX && next_argw - argw >= SIMM_MIN && argw_hi != TO_ARGW_HI(next_argw)) {
+			FAIL_IF(load_immediate(compiler, TMP_REG3, argw, tmp_r));
+			compiler->cache_argw = argw;
+			offset = 0;
+		} else {
+			FAIL_IF(load_immediate(compiler, TMP_REG3, argw_hi, tmp_r));
+			compiler->cache_argw = argw_hi;
+			offset = argw & 0xfff;
+			argw = argw_hi;
+		}
+	}
+
+	if (!base)
+		return push_mem_inst(compiler, flags, reg, TMP_REG3, offset);
+
+	if (arg == next_arg && next_argw - argw <= SIMM_MAX && next_argw - argw >= SIMM_MIN) {
+		compiler->cache_arg = arg;
+		FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG3) | RS1(TMP_REG3) | RS2(base)));
+		return push_mem_inst(compiler, flags, reg, TMP_REG3, offset);
+	}
+
+	FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(TMP_REG3) | RS2(base)));
+	return push_mem_inst(compiler, flags, reg, tmp_r, offset);
+}
+
+static sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
+{
+	sljit_s32 base = arg & REG_MASK;
+	sljit_s32 tmp_r = TMP_REG1;
+
+	if (getput_arg_fast(compiler, flags, reg, arg, argw))
+		return compiler->error;
+
+	if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA))
+		tmp_r = reg;
+
+	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
+		argw &= 0x3;
+
+		if (SLJIT_UNLIKELY(argw)) {
+			FAIL_IF(push_inst(compiler, SLLI | RD(tmp_r) | RS1(OFFS_REG(arg)) | IMM_I(argw)));
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(tmp_r) | RS2(base)));
+		}
+		else
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(base) | RS2(OFFS_REG(arg))));
+
+		argw = 0;
+	} else {
+		FAIL_IF(load_immediate(compiler, tmp_r, TO_ARGW_HI(argw), TMP_REG3));
+
+		if (base != 0)
+			FAIL_IF(push_inst(compiler, ADD | RD(tmp_r) | RS1(tmp_r) | RS2(base)));
+	}
+
+	return push_mem_inst(compiler, flags, reg, tmp_r, argw & 0xfff);
+}
+
+static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_s32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
+{
+	if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
+		return compiler->error;
+	return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
+}
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#define WORD 0
+#define IMM_EXTEND(v) (IMM_I(v))
+#else /* !SLJIT_CONFIG_RISCV_32 */
+#define WORD word
+#define IMM_EXTEND(v) (IMM_I((op & SLJIT_32) ? (v) : (32 + (v))))
+#endif /* SLJIT_CONFIG_RISCV_32 */
+
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
+{
+	sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ);
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins max = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+	sljit_ins max = 32;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
+
+	/* The OTHER_FLAG is the counter. */
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(OTHER_FLAG) | RS1(TMP_ZERO) | IMM_I(max)));
+
+	/* The TMP_REG2 is the next value. */
+	if (src != TMP_REG2)
+		FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG2) | RS1(src) | IMM_I(0)));
+
+	FAIL_IF(push_inst(compiler, BEQ | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)((is_clz ? 4 : 5) * SSIZE_OF(ins)) << 7) | ((sljit_ins)(8 * SSIZE_OF(ins)) << 20)));
+
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(OTHER_FLAG) | RS1(TMP_ZERO) | IMM_I(0)));
+	if (!is_clz) {
+		FAIL_IF(push_inst(compiler, ANDI | RD(TMP_REG1) | RS1(TMP_REG2) | IMM_I(1)));
+		FAIL_IF(push_inst(compiler, BNE | RS1(TMP_REG1) | RS2(TMP_ZERO) | ((sljit_ins)(2 * SSIZE_OF(ins)) << 7) | ((sljit_ins)(8 * SSIZE_OF(ins)) << 20)));
+	} else
+		FAIL_IF(push_inst(compiler, BLT | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)(2 * SSIZE_OF(ins)) << 7) | ((sljit_ins)(8 * SSIZE_OF(ins)) << 20)));
+
+	/* The TMP_REG1 is the next shift. */
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG1) | RS1(TMP_ZERO) | IMM_I(max)));
+
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(EQUAL_FLAG) | RS1(TMP_REG2) | IMM_I(0)));
+	FAIL_IF(push_inst(compiler, SRLI | WORD | RD(TMP_REG1) | RS1(TMP_REG1) | IMM_I(1)));
+
+	FAIL_IF(push_inst(compiler, (is_clz ? SRL : SLL) | WORD | RD(TMP_REG2) | RS1(EQUAL_FLAG) | RS2(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, BNE | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)0xfe000e80 - ((2 * SSIZE_OF(ins)) << 7))));
+	FAIL_IF(push_inst(compiler, ADDI | WORD | RD(TMP_REG2) | RS1(TMP_REG1) | IMM_I(-1)));
+	FAIL_IF(push_inst(compiler, (is_clz ? SRL : SLL) | WORD | RD(TMP_REG2) | RS1(EQUAL_FLAG) | RS2(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, OR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, BEQ | RS1(TMP_REG2) | RS2(TMP_ZERO) | ((sljit_ins)0xfe000e80 - ((5 * SSIZE_OF(ins)) << 7))));
+
+	return push_inst(compiler, ADDI | WORD | RD(dst) | RS1(OTHER_FLAG) | IMM_I(0));
+}
+
+#define EMIT_LOGICAL(op_imm, op_reg) \
+	if (flags & SRC2_IMM) { \
+		if (op & SLJIT_SET_Z) \
+			FAIL_IF(push_inst(compiler, op_imm | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(src2))); \
+		if (!(flags & UNUSED_DEST)) \
+			FAIL_IF(push_inst(compiler, op_imm | RD(dst) | RS1(src1) | IMM_I(src2))); \
+	} \
+	else { \
+		if (op & SLJIT_SET_Z) \
+			FAIL_IF(push_inst(compiler, op_reg | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2))); \
+		if (!(flags & UNUSED_DEST)) \
+			FAIL_IF(push_inst(compiler, op_reg | RD(dst) | RS1(src1) | RS2(src2))); \
+	}
+
+#define EMIT_SHIFT(imm, reg) \
+	op_imm = (imm); \
+	op_reg = (reg);
+
+static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
+	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
+{
+	sljit_s32 is_overflow, is_carry, carry_src_r, is_handled;
+	sljit_ins op_imm, op_reg;
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_ins word = (op & SLJIT_32) >> 5;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_MOV:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if (dst != src2)
+			return push_inst(compiler, ADDI | RD(dst) | RS1(src2) | IMM_I(0));
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_U8:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
+			return push_inst(compiler, ANDI | RD(dst) | RS1(src2) | IMM_I(0xff));
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_S8:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+			FAIL_IF(push_inst(compiler, SLLI | WORD | RD(dst) | RS1(src2) | IMM_EXTEND(24)));
+			return push_inst(compiler, SRAI | WORD | RD(dst) | RS1(dst) | IMM_EXTEND(24));
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_U16:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+			FAIL_IF(push_inst(compiler, SLLI | WORD | RD(dst) | RS1(src2) | IMM_EXTEND(16)));
+			return push_inst(compiler, SRLI | WORD | RD(dst) | RS1(dst) | IMM_EXTEND(16));
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_S16:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+			FAIL_IF(push_inst(compiler, SLLI | WORD | RD(dst) | RS1(src2) | IMM_EXTEND(16)));
+			return push_inst(compiler, SRAI | WORD | RD(dst) | RS1(dst) | IMM_EXTEND(16));
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	case SLJIT_MOV_U32:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
+			FAIL_IF(push_inst(compiler, SLLI | RD(dst) | RS1(src2) | IMM_I(32)));
+			return push_inst(compiler, SRLI | RD(dst) | RS1(dst) | IMM_I(32));
+		}
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_MOV_S32:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE))
+			return push_inst(compiler, ADDI | 0x8 | RD(dst) | RS1(src2) | IMM_I(0));
+		SLJIT_ASSERT(dst == src2);
+		return SLJIT_SUCCESS;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
+		return emit_clz_ctz(compiler, op, dst, src2);
+
+	case SLJIT_ADD:
+		/* Overflow computation (both add and sub): overflow = src1_sign ^ src2_sign ^ result_sign ^ carry_flag */
+		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
+		carry_src_r = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			if (is_overflow) {
+				if (src2 >= 0)
+					FAIL_IF(push_inst(compiler, ADDI | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(0)));
+				else
+					FAIL_IF(push_inst(compiler, XORI | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(-1)));
+			}
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, ADDI | WORD | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(src2)));
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(src2)));
+		}
+		else {
+			if (is_overflow)
+				FAIL_IF(push_inst(compiler, XOR | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, ADD | WORD | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+
+			if (is_overflow || carry_src_r != 0) {
+				if (src1 != dst)
+					carry_src_r = (sljit_s32)src1;
+				else if (src2 != dst)
+					carry_src_r = (sljit_s32)src2;
+				else {
+					FAIL_IF(push_inst(compiler, ADDI | RD(OTHER_FLAG) | RS1(src1) | IMM_I(0)));
+					carry_src_r = OTHER_FLAG;
+				}
+			}
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, ADD | WORD | RD(dst) | RS1(src1) | RS2(src2)));
+		}
+
+		/* Carry is zero if a + b >= a or a + b >= b, otherwise it is 1. */
+		if (is_overflow || carry_src_r != 0) {
+			if (flags & SRC2_IMM)
+				FAIL_IF(push_inst(compiler, SLTUI | RD(OTHER_FLAG) | RS1(dst) | IMM_I(src2)));
+			else
+				FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(dst) | RS2(carry_src_r)));
+		}
+
+		if (!is_overflow)
+			return SLJIT_SUCCESS;
+
+		FAIL_IF(push_inst(compiler, XOR | RD(TMP_REG1) | RS1(dst) | RS2(EQUAL_FLAG)));
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, ADDI | RD(EQUAL_FLAG) | RS1(dst) | IMM_I(0)));
+		FAIL_IF(push_inst(compiler, SRLI | WORD | RD(TMP_REG1) | RS1(TMP_REG1) | IMM_EXTEND(31)));
+		return push_inst(compiler, XOR | RD(OTHER_FLAG) | RS1(TMP_REG1) | RS2(OTHER_FLAG));
+
+	case SLJIT_ADDC:
+		carry_src_r = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			FAIL_IF(push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(src2)));
+		} else {
+			if (carry_src_r != 0) {
+				if (src1 != dst)
+					carry_src_r = (sljit_s32)src1;
+				else if (src2 != dst)
+					carry_src_r = (sljit_s32)src2;
+				else {
+					FAIL_IF(push_inst(compiler, ADDI | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(0)));
+					carry_src_r = EQUAL_FLAG;
+				}
+			}
+
+			FAIL_IF(push_inst(compiler, ADD | WORD | RD(dst) | RS1(src1) | RS2(src2)));
+		}
+
+		/* Carry is zero if a + b >= a or a + b >= b, otherwise it is 1. */
+		if (carry_src_r != 0) {
+			if (flags & SRC2_IMM)
+				FAIL_IF(push_inst(compiler, SLTUI | RD(EQUAL_FLAG) | RS1(dst) | IMM_I(src2)));
+			else
+				FAIL_IF(push_inst(compiler, SLTU | RD(EQUAL_FLAG) | RS1(dst) | RS2(carry_src_r)));
+		}
+
+		FAIL_IF(push_inst(compiler, ADD | WORD | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)));
+
+		if (carry_src_r == 0)
+			return SLJIT_SUCCESS;
+
+		/* Set ULESS_FLAG (dst == 0) && (OTHER_FLAG == 1). */
+		FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(dst) | RS2(OTHER_FLAG)));
+		/* Set carry flag. */
+		return push_inst(compiler, OR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(EQUAL_FLAG));
+
+	case SLJIT_SUB:
+		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
+			FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG2) | RS1(TMP_ZERO) | IMM_I(src2)));
+			src2 = TMP_REG2;
+			flags &= ~SRC2_IMM;
+		}
+
+		is_handled = 0;
+
+		if (flags & SRC2_IMM) {
+			if (GET_FLAG_TYPE(op) == SLJIT_LESS || GET_FLAG_TYPE(op) == SLJIT_GREATER_EQUAL) {
+				FAIL_IF(push_inst(compiler, SLTUI | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
+				is_handled = 1;
+			}
+			else if (GET_FLAG_TYPE(op) == SLJIT_SIG_LESS || GET_FLAG_TYPE(op) == SLJIT_SIG_GREATER_EQUAL) {
+				FAIL_IF(push_inst(compiler, SLTI | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
+				is_handled = 1;
+			}
+		}
+
+		if (!is_handled && GET_FLAG_TYPE(op) >= SLJIT_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL) {
+			is_handled = 1;
+
+			if (flags & SRC2_IMM) {
+				FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG2) | RS1(TMP_ZERO) | IMM_I(src2)));
+				src2 = TMP_REG2;
+				flags &= ~SRC2_IMM;
+			}
+
+			switch (GET_FLAG_TYPE(op)) {
+			case SLJIT_LESS:
+			case SLJIT_GREATER_EQUAL:
+				FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
+				break;
+			case SLJIT_GREATER:
+			case SLJIT_LESS_EQUAL:
+				FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(src2) | RS2(src1)));
+				break;
+			case SLJIT_SIG_LESS:
+			case SLJIT_SIG_GREATER_EQUAL:
+				FAIL_IF(push_inst(compiler, SLT | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
+				break;
+			case SLJIT_SIG_GREATER:
+			case SLJIT_SIG_LESS_EQUAL:
+				FAIL_IF(push_inst(compiler, SLT | RD(OTHER_FLAG) | RS1(src2) | RS2(src1)));
+				break;
+			}
+		}
+
+		if (is_handled) {
+			if (flags & SRC2_IMM) {
+				if (op & SLJIT_SET_Z)
+					FAIL_IF(push_inst(compiler, ADDI | WORD | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(-src2)));
+				if (!(flags & UNUSED_DEST))
+					return push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(-src2));
+			}
+			else {
+				if (op & SLJIT_SET_Z)
+					FAIL_IF(push_inst(compiler, SUB | WORD | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+				if (!(flags & UNUSED_DEST))
+					return push_inst(compiler, SUB | WORD | RD(dst) | RS1(src1) | RS2(src2));
+			}
+			return SLJIT_SUCCESS;
+		}
+
+		is_overflow = GET_FLAG_TYPE(op) == SLJIT_OVERFLOW;
+		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			if (is_overflow) {
+				if (src2 >= 0)
+					FAIL_IF(push_inst(compiler, ADDI | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(0)));
+				else
+					FAIL_IF(push_inst(compiler, XORI | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(-1)));
+			}
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, ADDI | WORD | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(-src2)));
+
+			if (is_overflow || is_carry)
+				FAIL_IF(push_inst(compiler, SLTUI | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(-src2)));
+		}
+		else {
+			if (is_overflow)
+				FAIL_IF(push_inst(compiler, XOR | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+			else if (op & SLJIT_SET_Z)
+				FAIL_IF(push_inst(compiler, SUB | WORD | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+
+			if (is_overflow || is_carry)
+				FAIL_IF(push_inst(compiler, SLTU | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
+
+			/* Only the zero flag is needed. */
+			if (!(flags & UNUSED_DEST) || (op & VARIABLE_FLAG_MASK))
+				FAIL_IF(push_inst(compiler, SUB | WORD | RD(dst) | RS1(src1) | RS2(src2)));
+		}
+
+		if (!is_overflow)
+			return SLJIT_SUCCESS;
+
+		FAIL_IF(push_inst(compiler, XOR | RD(TMP_REG1) | RS1(dst) | RS2(EQUAL_FLAG)));
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, ADDI | RD(EQUAL_FLAG) | RS1(dst) | IMM_I(0)));
+		FAIL_IF(push_inst(compiler, SRLI | WORD | RD(TMP_REG1) | RS1(TMP_REG1) | IMM_EXTEND(31)));
+		return push_inst(compiler, XOR | RD(OTHER_FLAG) | RS1(TMP_REG1) | RS2(OTHER_FLAG));
+
+	case SLJIT_SUBC:
+		if ((flags & SRC2_IMM) && src2 == SIMM_MIN) {
+			FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG2) | RS1(TMP_ZERO) | IMM_I(src2)));
+			src2 = TMP_REG2;
+			flags &= ~SRC2_IMM;
+		}
+
+		is_carry = GET_FLAG_TYPE(op) == GET_FLAG_TYPE(SLJIT_SET_CARRY);
+
+		if (flags & SRC2_IMM) {
+			if (is_carry)
+				FAIL_IF(push_inst(compiler, SLTUI | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(src2)));
+
+			FAIL_IF(push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(-src2)));
+		}
+		else {
+			if (is_carry)
+				FAIL_IF(push_inst(compiler, SLTU | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+
+			FAIL_IF(push_inst(compiler, SUB | WORD | RD(dst) | RS1(src1) | RS2(src2)));
+		}
+
+		if (is_carry)
+			FAIL_IF(push_inst(compiler, SLTU | RD(TMP_REG1) | RS1(dst) | RS2(OTHER_FLAG)));
+
+		FAIL_IF(push_inst(compiler, SUB | WORD | RD(dst) | RS1(dst) | RS2(OTHER_FLAG)));
+
+		if (!is_carry)
+			return SLJIT_SUCCESS;
+
+		return push_inst(compiler, OR | RD(OTHER_FLAG) | RS1(EQUAL_FLAG) | RS2(TMP_REG1));
+
+	case SLJIT_MUL:
+		SLJIT_ASSERT(!(flags & SRC2_IMM));
+
+		if (GET_FLAG_TYPE(op) != SLJIT_OVERFLOW)
+			return push_inst(compiler, MUL | WORD | RD(dst) | RS1(src1) | RS2(src2));
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+		if (word) {
+			FAIL_IF(push_inst(compiler, MUL | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
+			FAIL_IF(push_inst(compiler, MUL | 0x8 | RD(dst) | RS1(src1) | RS2(src2)));
+			return push_inst(compiler, SUB | RD(OTHER_FLAG) | RS1(dst) | RS2(OTHER_FLAG));
+		}
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+		FAIL_IF(push_inst(compiler, MULH | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+		FAIL_IF(push_inst(compiler, MUL | RD(dst) | RS1(src1) | RS2(src2)));
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+		FAIL_IF(push_inst(compiler, SRAI | RD(OTHER_FLAG) | RS1(dst) | IMM_I(31)));
+#else /* !SLJIT_CONFIG_RISCV_32 */
+		FAIL_IF(push_inst(compiler, SRAI | RD(OTHER_FLAG) | RS1(dst) | IMM_I(63)));
+#endif /* SLJIT_CONFIG_RISCV_32 */
+		return push_inst(compiler, SUB | RD(OTHER_FLAG) | RS1(EQUAL_FLAG) | RS2(OTHER_FLAG));
+
+	case SLJIT_AND:
+		EMIT_LOGICAL(ANDI, AND);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_OR:
+		EMIT_LOGICAL(ORI, OR);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_XOR:
+		EMIT_LOGICAL(XORI, XOR);
+		return SLJIT_SUCCESS;
+
+	case SLJIT_SHL:
+	case SLJIT_MSHL:
+		EMIT_SHIFT(SLLI, SLL);
+		break;
+
+	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
+		EMIT_SHIFT(SRLI, SRL);
+		break;
+
+	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+		EMIT_SHIFT(SRAI, SRA);
+		break;
+
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (flags & SRC2_IMM) {
+			SLJIT_ASSERT(src2 != 0);
+
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SLLI : SRLI;
+			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(OTHER_FLAG) | RS1(src1) | IMM_I(src2)));
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+			src2 = ((op & SLJIT_32) ? 32 : 64) - src2;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+			src2 = 32 - src2;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+			op_imm = (GET_OPCODE(op) == SLJIT_ROTL) ? SRLI : SLLI;
+			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(dst) | RS1(src1) | IMM_I(src2)));
+			return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(OTHER_FLAG));
+		}
+
+		if (src2 == TMP_ZERO) {
+			if (dst != src1)
+				return push_inst(compiler, ADDI | WORD | RD(dst) | RS1(src1) | IMM_I(0));
+			return SLJIT_SUCCESS;
+		}
+
+		FAIL_IF(push_inst(compiler, SUB | WORD | RD(EQUAL_FLAG) | RS1(TMP_ZERO) | RS2(src2)));
+		op_reg = (GET_OPCODE(op) == SLJIT_ROTL) ? SLL : SRL;
+		FAIL_IF(push_inst(compiler, op_reg | WORD | RD(OTHER_FLAG) | RS1(src1) | RS2(src2)));
+		op_reg = (GET_OPCODE(op) == SLJIT_ROTL) ? SRL : SLL;
+		FAIL_IF(push_inst(compiler, op_reg | WORD | RD(dst) | RS1(src1) | RS2(EQUAL_FLAG)));
+		return push_inst(compiler, OR | RD(dst) | RS1(dst) | RS2(OTHER_FLAG));
+
+	default:
+		SLJIT_UNREACHABLE();
+		return SLJIT_SUCCESS;
+	}
+
+	if (flags & SRC2_IMM) {
+		if (op & SLJIT_SET_Z)
+			FAIL_IF(push_inst(compiler, op_imm | WORD | RD(EQUAL_FLAG) | RS1(src1) | IMM_I(src2)));
+
+		if (flags & UNUSED_DEST)
+			return SLJIT_SUCCESS;
+		return push_inst(compiler, op_imm | WORD | RD(dst) | RS1(src1) | IMM_I(src2));
+	}
+
+	if (op & SLJIT_SET_Z)
+		FAIL_IF(push_inst(compiler, op_reg | WORD | RD(EQUAL_FLAG) | RS1(src1) | RS2(src2)));
+
+	if (flags & UNUSED_DEST)
+		return SLJIT_SUCCESS;
+	return push_inst(compiler, op_reg | WORD | RD(dst) | RS1(src1) | RS2(src2));
+}
+
+#undef IMM_EXTEND
+
+static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	/* arg1 goes to TMP_REG1 or src reg
+	   arg2 goes to TMP_REG2, imm or src reg
+	   TMP_REG3 can be used for caching
+	   result goes to TMP_REG2, so put result can use TMP_REG1 and TMP_REG3. */
+	sljit_s32 dst_r = TMP_REG2;
+	sljit_s32 src1_r;
+	sljit_sw src2_r = 0;
+	sljit_s32 sugg_src2_r = TMP_REG2;
+
+	if (!(flags & ALT_KEEP_CACHE)) {
+		compiler->cache_arg = 0;
+		compiler->cache_argw = 0;
+	}
+
+	if (dst == TMP_REG2) {
+		SLJIT_ASSERT(HAS_FLAGS(op));
+		flags |= UNUSED_DEST;
+	}
+	else if (FAST_IS_REG(dst)) {
+		dst_r = dst;
+		flags |= REG_DEST;
+		if (flags & MOVE_OP)
+			sugg_src2_r = dst_r;
+	}
+	else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, TMP_REG1, dst, dstw))
+		flags |= SLOW_DEST;
+
+	if (flags & IMM_OP) {
+		if ((src2 & SLJIT_IMM) && src2w != 0 && src2w <= SIMM_MAX && src2w >= SIMM_MIN) {
+			flags |= SRC2_IMM;
+			src2_r = src2w;
+		}
+		else if ((flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w != 0 && src1w <= SIMM_MAX && src1w >= SIMM_MIN) {
+			flags |= SRC2_IMM;
+			src2_r = src1w;
+
+			/* And swap arguments. */
+			src1 = src2;
+			src1w = src2w;
+			src2 = SLJIT_IMM;
+			/* src2w = src2_r unneeded. */
+		}
+	}
+
+	/* Source 1. */
+	if (FAST_IS_REG(src1)) {
+		src1_r = src1;
+		flags |= REG1_SOURCE;
+	}
+	else if (src1 & SLJIT_IMM) {
+		if (src1w) {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, src1w, TMP_REG3));
+			src1_r = TMP_REG1;
+		}
+		else
+			src1_r = TMP_ZERO;
+	}
+	else {
+		if (getput_arg_fast(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w))
+			FAIL_IF(compiler->error);
+		else
+			flags |= SLOW_SRC1;
+		src1_r = TMP_REG1;
+	}
+
+	/* Source 2. */
+	if (FAST_IS_REG(src2)) {
+		src2_r = src2;
+		flags |= REG2_SOURCE;
+		if ((flags & (REG_DEST | MOVE_OP)) == MOVE_OP)
+			dst_r = (sljit_s32)src2_r;
+	}
+	else if (src2 & SLJIT_IMM) {
+		if (!(flags & SRC2_IMM)) {
+			if (src2w) {
+				FAIL_IF(load_immediate(compiler, sugg_src2_r, src2w, TMP_REG3));
+				src2_r = sugg_src2_r;
+			}
+			else {
+				src2_r = TMP_ZERO;
+				if (flags & MOVE_OP) {
+					if (dst & SLJIT_MEM)
+						dst_r = 0;
+					else
+						op = SLJIT_MOV;
+				}
+			}
+		}
+	}
+	else {
+		if (getput_arg_fast(compiler, flags | LOAD_DATA, sugg_src2_r, src2, src2w))
+			FAIL_IF(compiler->error);
+		else
+			flags |= SLOW_SRC2;
+		src2_r = sugg_src2_r;
+	}
+
+	if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
+		SLJIT_ASSERT(src2_r == TMP_REG2);
+		if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
+			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG2, src2, src2w, src1, src1w));
+			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w, dst, dstw));
+		}
+		else {
+			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w, src2, src2w));
+			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG2, src2, src2w, dst, dstw));
+		}
+	}
+	else if (flags & SLOW_SRC1)
+		FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w, dst, dstw));
+	else if (flags & SLOW_SRC2)
+		FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, sugg_src2_r, src2, src2w, dst, dstw));
+
+	FAIL_IF(emit_single_op(compiler, op, flags, dst_r, src1_r, src2_r));
+
+	if (dst & SLJIT_MEM) {
+		if (!(flags & SLOW_DEST)) {
+			getput_arg_fast(compiler, flags, dst_r, dst, dstw);
+			return compiler->error;
+		}
+		return getput_arg(compiler, flags, dst_r, dst, dstw, 0, 0);
+	}
+
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
+{
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_ins word = (op & SLJIT_32) >> 5;
+
+	SLJIT_ASSERT(word == 0 || word == 0x8);
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op0(compiler, op));
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_BREAKPOINT:
+		return push_inst(compiler, EBREAK);
+	case SLJIT_NOP:
+		return push_inst(compiler, ADDI | RD(TMP_ZERO) | RS1(TMP_ZERO) | IMM_I(0));
+	case SLJIT_LMUL_UW:
+		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(SLJIT_R1) | IMM_I(0)));
+		FAIL_IF(push_inst(compiler, MULHU | RD(SLJIT_R1) | RS1(SLJIT_R0) | RS2(SLJIT_R1)));
+		return push_inst(compiler, MUL | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(TMP_REG1));
+	case SLJIT_LMUL_SW:
+		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(SLJIT_R1) | IMM_I(0)));
+		FAIL_IF(push_inst(compiler, MULH | RD(SLJIT_R1) | RS1(SLJIT_R0) | RS2(SLJIT_R1)));
+		return push_inst(compiler, MUL | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(TMP_REG1));
+	case SLJIT_DIVMOD_UW:
+		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(SLJIT_R0) | IMM_I(0)));
+		FAIL_IF(push_inst(compiler, DIVU | WORD | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(SLJIT_R1)));
+		return push_inst(compiler, REMU | WORD | RD(SLJIT_R1) | RS1(TMP_REG1) | RS2(SLJIT_R1));
+	case SLJIT_DIVMOD_SW:
+		FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(SLJIT_R0) | IMM_I(0)));
+		FAIL_IF(push_inst(compiler, DIV | WORD | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(SLJIT_R1)));
+		return push_inst(compiler, REM | WORD | RD(SLJIT_R1) | RS1(TMP_REG1) | RS2(SLJIT_R1));
+	case SLJIT_DIV_UW:
+		return push_inst(compiler, DIVU | WORD | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(SLJIT_R1));
+	case SLJIT_DIV_SW:
+		return push_inst(compiler, DIV | WORD | RD(SLJIT_R0) | RS1(SLJIT_R0) | RS2(SLJIT_R1));
+	case SLJIT_ENDBR:
+	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
+		return SLJIT_SUCCESS;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 flags = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	if (op & SLJIT_32)
+		flags = INT_DATA | SIGNED_DATA;
+#endif
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_MOV:
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	case SLJIT_MOV_U32:
+	case SLJIT_MOV_S32:
+	case SLJIT_MOV32:
+#endif
+	case SLJIT_MOV_P:
+		return emit_op(compiler, SLJIT_MOV, WORD_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, srcw);
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	case SLJIT_MOV_U32:
+		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u32)srcw : srcw);
+
+	case SLJIT_MOV_S32:
+	/* Logical operators have no W variant, so sign extended input is necessary for them. */
+	case SLJIT_MOV32:
+		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s32)srcw : srcw);
+#endif
+
+	case SLJIT_MOV_U8:
+		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
+
+	case SLJIT_MOV_S8:
+		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
+
+	case SLJIT_MOV_U16:
+		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
+
+	case SLJIT_MOV_S16:
+		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
+
+	case SLJIT_NOT:
+		return emit_op(compiler, SLJIT_XOR | (op & (SLJIT_32 | SLJIT_SET_Z)), flags, dst, dstw, src, srcw, SLJIT_IMM, -1);
+
+	case SLJIT_CLZ:
+	case SLJIT_CTZ:
+		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
+	}
+
+	SLJIT_UNREACHABLE();
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 flags = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	if (op & SLJIT_32) {
+		flags |= INT_DATA | SIGNED_DATA;
+		if (src1 & SLJIT_IMM)
+			src1w = (sljit_s32)src1w;
+		if (src2 & SLJIT_IMM)
+			src2w = (sljit_s32)src2w;
+	}
+#endif
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_ADD:
+	case SLJIT_ADDC:
+		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
+		return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
+
+	case SLJIT_SUB:
+	case SLJIT_SUBC:
+		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
+		return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
+
+	case SLJIT_MUL:
+		compiler->status_flags_state = 0;
+		return emit_op(compiler, op, flags | CUMULATIVE_OP, dst, dstw, src1, src1w, src2, src2w);
+
+	case SLJIT_AND:
+	case SLJIT_OR:
+	case SLJIT_XOR:
+		return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
+
+	case SLJIT_SHL:
+	case SLJIT_MSHL:
+	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
+	case SLJIT_ASHR:
+	case SLJIT_MASHR:
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		if (src2 & SLJIT_IMM) {
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+			src2w &= 0x1f;
+#else /* !SLJIT_CONFIG_RISCV_32 */
+			if (op & SLJIT_32)
+				src2w &= 0x1f;
+			else
+				src2w &= 0x3f;
+#endif /* SLJIT_CONFIG_RISCV_32 */
+		}
+
+		return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
+	}
+
+	SLJIT_UNREACHABLE();
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 is_left;
+	sljit_ins ins1, ins2, ins3;
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+#else /* !SLJIT_CONFIG_RISCV_64 */
+	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA;
+	sljit_sw bit_length = 32;
+#endif /* SLJIT_CONFIG_RISCV_64 */
+
+	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_left ? SLJIT_ROTL : SLJIT_ROTR) | (op & SLJIT_32), src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	if (src2 & SLJIT_IMM) {
+		src2w &= bit_length - 1;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG2, src2, src2w));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem(compiler, inp_flags, TMP_REG1, src1, src1w));
+		src1 = TMP_REG1;
+	} else if (src1 & SLJIT_IMM) {
+		FAIL_IF(load_immediate(compiler, TMP_REG1, src1w, TMP_REG3));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_IMM) {
+		if (is_left) {
+			ins1 = SLLI | WORD | IMM_I(src2w);
+			src2w = bit_length - src2w;
+			ins2 = SRLI | WORD | IMM_I(src2w);
+		} else {
+			ins1 = SRLI | WORD | IMM_I(src2w);
+			src2w = bit_length - src2w;
+			ins2 = SLLI | WORD | IMM_I(src2w);
+		}
+
+		FAIL_IF(push_inst(compiler, ins1 | RD(src_dst) | RS1(src_dst)));
+		FAIL_IF(push_inst(compiler, ins2 | RD(TMP_REG1) | RS1(src1)));
+		return push_inst(compiler, OR | RD(src_dst) | RS1(src_dst) | RS2(TMP_REG1));
+	}
+
+	if (is_left) {
+		ins1 = SLL;
+		ins2 = SRLI;
+		ins3 = SRL;
+	} else {
+		ins1 = SRL;
+		ins2 = SLLI;
+		ins3 = SLL;
+	}
+
+	FAIL_IF(push_inst(compiler, ins1 | WORD | RD(src_dst) | RS1(src_dst) | RS2(src2)));
+
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) {
+		FAIL_IF(push_inst(compiler, ins2 | WORD | RD(TMP_REG1) | RS1(src1) | IMM_I(1)));
+		FAIL_IF(push_inst(compiler, XORI | RD(TMP_REG2) | RS1(src2) | IMM_I((sljit_ins)bit_length - 1)));
+		src1 = TMP_REG1;
+	} else
+		FAIL_IF(push_inst(compiler, SUB | WORD | RD(TMP_REG2) | RS1(TMP_ZERO) | RS2(src2)));
+
+	FAIL_IF(push_inst(compiler, ins3 | WORD | RD(TMP_REG1) | RS1(src1) | RS2(TMP_REG2)));
+	return push_inst(compiler, OR | RD(src_dst) | RS1(src_dst) | RS2(TMP_REG1));
+}
+
+#undef WORD
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	switch (op) {
+	case SLJIT_FAST_RETURN:
+		if (FAST_IS_REG(src))
+			FAIL_IF(push_inst(compiler, ADDI | RD(RETURN_ADDR_REG) | RS1(src) | IMM_I(0)));
+		else
+			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, RETURN_ADDR_REG, src, srcw));
+
+		return push_inst(compiler, JALR | RD(TMP_ZERO) | RS1(RETURN_ADDR_REG) | IMM_I(0));
+	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
+		return SLJIT_SUCCESS;
+	case SLJIT_PREFETCH_L1:
+	case SLJIT_PREFETCH_L2:
+	case SLJIT_PREFETCH_L3:
+	case SLJIT_PREFETCH_ONCE:
+		return SLJIT_SUCCESS;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
+{
+	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
+	return reg_map[reg];
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
+{
+	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
+	return freg_map[reg];
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
+	void *instruction, sljit_u32 size)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
+
+	return push_inst(compiler, *(sljit_ins*)instruction);
+}
+
+/* --------------------------------------------------------------------- */
+/*  Floating point operators                                             */
+/* --------------------------------------------------------------------- */
+
+#define FLOAT_DATA(op) (DOUBLE_DATA | ((op & SLJIT_32) >> 7))
+#define FMT(op) ((sljit_ins)((op & SLJIT_32) ^ SLJIT_32) << 17)
+
+static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#	define flags (sljit_u32)0
+#else
+	sljit_u32 flags = ((sljit_u32)(GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)) << 21;
+#endif
+	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
+		src = TMP_FREG1;
+	}
+
+	FAIL_IF(push_inst(compiler, FCVT_W_S | FMT(op) | flags | RD(dst_r) | FRS1(src)));
+
+	/* Store the integer value from a VFP register. */
+	if (dst & SLJIT_MEM) {
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+		return emit_op_mem2(compiler, WORD_DATA, TMP_REG2, dst, dstw, 0, 0);
+#else
+		return emit_op_mem2(compiler, flags ? WORD_DATA : INT_DATA, TMP_REG2, dst, dstw, 0, 0);
+#endif
+	}
+	return SLJIT_SUCCESS;
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#	undef flags
+#endif
+}
+
+static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_ins inst;
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+	sljit_u32 flags = ((sljit_u32)(GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)) << 21;
+#endif
+
+	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
+
+	if (src & SLJIT_MEM) {
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+		FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
+#else
+		FAIL_IF(emit_op_mem2(compiler, (flags ? WORD_DATA : INT_DATA) | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
+#endif
+		src = TMP_REG1;
+	} else if (src & SLJIT_IMM) {
+#if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
+		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
+			srcw = (sljit_s32)srcw;
+#endif
+
+		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw, TMP_REG3));
+		src = TMP_REG1;
+	}
+
+	inst = FCVT_S_W | FMT(op) | FRD(dst_r) | RS1(src);
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	if (op & SLJIT_32)
+		inst |= F3(0x7);
+#else
+	inst |= flags;
+
+	if (op != SLJIT_CONV_F64_FROM_S32)
+		inst |= F3(0x7);
+#endif
+
+	FAIL_IF(push_inst(compiler, inst));
+
+	if (dst & SLJIT_MEM)
+		return emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, 0, 0);
+	return SLJIT_SUCCESS;
+}
+
+static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_ins inst;
+
+	if (src1 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
+		src1 = TMP_FREG1;
+	}
+
+	if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, 0, 0));
+		src2 = TMP_FREG2;
+	}
+
+	switch (GET_FLAG_TYPE(op)) {
+	case SLJIT_F_EQUAL:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+		inst = FEQ_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2);
+		break;
+	case SLJIT_F_LESS:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		inst = FLT_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2);
+		break;
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
+		inst = FLT_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src2) | FRS2(src1);
+		break;
+	case SLJIT_F_GREATER:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_ORDERED_LESS_EQUAL:
+		inst = FLE_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2);
+		break;
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+		inst = FLE_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src2) | FRS2(src1);
+		break;
+	case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
+	case SLJIT_ORDERED_NOT_EQUAL: /* Not supported. */
+		FAIL_IF(push_inst(compiler, FLT_S | FMT(op) | RD(OTHER_FLAG) | FRS1(src1) | FRS2(src2)));
+		FAIL_IF(push_inst(compiler, FLT_S | FMT(op) | RD(TMP_REG1) | FRS1(src2) | FRS2(src1)));
+		inst = OR | RD(OTHER_FLAG) | RS1(OTHER_FLAG) | RS2(TMP_REG1);
+		break;
+	default: /* SLJIT_UNORDERED, SLJIT_ORDERED */
+		FAIL_IF(push_inst(compiler, FADD_S | FMT(op) | FRD(TMP_FREG1) | FRS1(src1) | FRS2(src2)));
+		inst = FEQ_S | FMT(op) | RD(OTHER_FLAG) | FRS1(TMP_FREG1) | FRS2(TMP_FREG1);
+		break;
+	}
+
+	return push_inst(compiler, inst);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 dst_r;
+
+	CHECK_ERROR();
+	compiler->cache_arg = 0;
+	compiler->cache_argw = 0;
+
+	SLJIT_COMPILE_ASSERT((SLJIT_32 == 0x100) && !(DOUBLE_DATA & 0x2), float_transfer_bit_error);
+	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
+
+	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32)
+		op ^= SLJIT_32;
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_r, src, srcw, dst, dstw));
+		src = dst_r;
+	}
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_MOV_F64:
+		if (src != dst_r) {
+			if (dst_r != TMP_FREG1)
+				FAIL_IF(push_inst(compiler, FSGNJ_S | FMT(op) | FRD(dst_r) | FRS1(src) | FRS2(src)));
+			else
+				dst_r = src;
+		}
+		break;
+	case SLJIT_NEG_F64:
+		FAIL_IF(push_inst(compiler, FSGNJN_S | FMT(op) | FRD(dst_r) | FRS1(src) | FRS2(src)));
+		break;
+	case SLJIT_ABS_F64:
+		FAIL_IF(push_inst(compiler, FSGNJX_S | FMT(op) | FRD(dst_r) | FRS1(src) | FRS2(src)));
+		break;
+	case SLJIT_CONV_F64_FROM_F32:
+		/* The SLJIT_32 bit is inverted because sljit_f32 needs to be loaded from the memory. */
+		FAIL_IF(push_inst(compiler, FCVT_S_D | ((op & SLJIT_32) ? (1 << 25) : ((1 << 20) | F3(7))) | FRD(dst_r) | FRS1(src)));
+		op ^= SLJIT_32;
+		break;
+	}
+
+	if (dst & SLJIT_MEM)
+		return emit_op_mem2(compiler, FLOAT_DATA(op), dst_r, dst, dstw, 0, 0);
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 dst_r, flags = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	compiler->cache_arg = 0;
+	compiler->cache_argw = 0;
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG2;
+
+	if (src1 & SLJIT_MEM) {
+		if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
+			FAIL_IF(compiler->error);
+			src1 = TMP_FREG1;
+		} else
+			flags |= SLOW_SRC1;
+	}
+
+	if (src2 & SLJIT_MEM) {
+		if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
+			FAIL_IF(compiler->error);
+			src2 = TMP_FREG2;
+		} else
+			flags |= SLOW_SRC2;
+	}
+
+	if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
+		if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
+			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, src1, src1w));
+			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
+		}
+		else {
+			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
+			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
+		}
+	}
+	else if (flags & SLOW_SRC1)
+		FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
+	else if (flags & SLOW_SRC2)
+		FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
+
+	if (flags & SLOW_SRC1)
+		src1 = TMP_FREG1;
+	if (flags & SLOW_SRC2)
+		src2 = TMP_FREG2;
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_ADD_F64:
+		FAIL_IF(push_inst(compiler, FADD_S | FMT(op) | FRD(dst_r) | FRS1(src1) | FRS2(src2)));
+		break;
+
+	case SLJIT_SUB_F64:
+		FAIL_IF(push_inst(compiler, FSUB_S | FMT(op) | FRD(dst_r) | FRS1(src1) | FRS2(src2)));
+		break;
+
+	case SLJIT_MUL_F64:
+		FAIL_IF(push_inst(compiler, FMUL_S | FMT(op) | FRD(dst_r) | FRS1(src1) | FRS2(src2)));
+		break;
+
+	case SLJIT_DIV_F64:
+		FAIL_IF(push_inst(compiler, FDIV_S | FMT(op) | FRD(dst_r) | FRS1(src1) | FRS2(src2)));
+		break;
+	}
+
+	if (dst_r == TMP_FREG2)
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG2, dst, dstw, 0, 0));
+
+	return SLJIT_SUCCESS;
+}
+
+#undef FLOAT_DATA
+#undef FMT
+
+/* --------------------------------------------------------------------- */
+/*  Other instructions                                                   */
+/* --------------------------------------------------------------------- */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	if (FAST_IS_REG(dst))
+		return push_inst(compiler, ADDI | RD(dst) | RS1(RETURN_ADDR_REG) | IMM_I(0));
+
+	/* Memory. */
+	return emit_op_mem(compiler, WORD_DATA, RETURN_ADDR_REG, dst, dstw);
+}
+
+/* --------------------------------------------------------------------- */
+/*  Conditional instructions                                             */
+/* --------------------------------------------------------------------- */
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
+{
+	struct sljit_label *label;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_label(compiler));
+
+	if (compiler->last_label && compiler->last_label->size == compiler->size)
+		return compiler->last_label;
+
+	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
+	PTR_FAIL_IF(!label);
+	set_label(label, compiler);
+	return label;
+}
+
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+#define BRANCH_LENGTH	((sljit_ins)(3 * sizeof(sljit_ins)) << 7)
+#else
+#define BRANCH_LENGTH	((sljit_ins)(7 * sizeof(sljit_ins)) << 7)
+#endif
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
+{
+	struct sljit_jump *jump;
+	sljit_ins inst;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_jump(compiler, type));
+
+	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+	PTR_FAIL_IF(!jump);
+	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
+	type &= 0xff;
+
+	switch (type) {
+	case SLJIT_EQUAL:
+		inst = BNE | RS1(EQUAL_FLAG) | RS2(TMP_ZERO) | BRANCH_LENGTH;
+		break;
+	case SLJIT_NOT_EQUAL:
+		inst = BEQ | RS1(EQUAL_FLAG) | RS2(TMP_ZERO) | BRANCH_LENGTH;
+		break;
+	case SLJIT_LESS:
+	case SLJIT_GREATER:
+	case SLJIT_SIG_LESS:
+	case SLJIT_SIG_GREATER:
+	case SLJIT_OVERFLOW:
+	case SLJIT_CARRY:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL: /* Not supported. */
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_ORDERED_GREATER:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_ORDERED:
+		inst = BEQ | RS1(OTHER_FLAG) | RS2(TMP_ZERO) | BRANCH_LENGTH;
+		break;
+	case SLJIT_GREATER_EQUAL:
+	case SLJIT_LESS_EQUAL:
+	case SLJIT_SIG_GREATER_EQUAL:
+	case SLJIT_SIG_LESS_EQUAL:
+	case SLJIT_NOT_OVERFLOW:
+	case SLJIT_NOT_CARRY:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
+	case SLJIT_F_GREATER:
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_UNORDERED:
+		inst = BNE | RS1(OTHER_FLAG) | RS2(TMP_ZERO) | BRANCH_LENGTH;
+		break;
+	default:
+		/* Not conditional branch. */
+		inst = 0;
+		break;
+	}
+
+	if (inst != 0) {
+		PTR_FAIL_IF(push_inst(compiler, inst));
+		jump->flags |= IS_COND;
+	}
+
+	jump->addr = compiler->size;
+	inst = JALR | RS1(TMP_REG1) | IMM_I(0);
+
+	if (type >= SLJIT_FAST_CALL) {
+		jump->flags |= IS_CALL;
+		inst |= RD(RETURN_ADDR_REG);
+	}
+
+	PTR_FAIL_IF(push_inst(compiler, inst));
+
+	/* Maximum number of instructions required for generating a constant. */
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	compiler->size += 1;
+#else
+	compiler->size += 5;
+#endif
+	return jump;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 arg_types)
+{
+	SLJIT_UNUSED_ARG(arg_types);
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
+
+	if (type & SLJIT_CALL_RETURN) {
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
+		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
+	}
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_jump(compiler, type);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_cmp(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	struct sljit_jump *jump;
+	sljit_s32 flags;
+	sljit_ins inst;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_cmp(compiler, type, src1, src1w, src2, src2w));
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	compiler->cache_arg = 0;
+	compiler->cache_argw = 0;
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	flags = WORD_DATA | LOAD_DATA;
+#else /* !SLJIT_CONFIG_RISCV_32 */
+	flags = ((type & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
+#endif /* SLJIT_CONFIG_RISCV_32 */
+
+	if (src1 & SLJIT_MEM) {
+		PTR_FAIL_IF(emit_op_mem2(compiler, flags, TMP_REG1, src1, src1w, src2, src2w));
+		src1 = TMP_REG1;
+	}
+
+	if (src2 & SLJIT_MEM) {
+		PTR_FAIL_IF(emit_op_mem2(compiler, flags, TMP_REG2, src2, src2w, 0, 0));
+		src2 = TMP_REG2;
+	}
+
+	if (src1 & SLJIT_IMM) {
+		if (src1w != 0) {
+			PTR_FAIL_IF(load_immediate(compiler, TMP_REG1, src1w, TMP_REG3));
+			src1 = TMP_REG1;
+		}
+		else
+			src1 = TMP_ZERO;
+	}
+
+	if (src2 & SLJIT_IMM) {
+		if (src2w != 0) {
+			PTR_FAIL_IF(load_immediate(compiler, TMP_REG2, src2w, TMP_REG3));
+			src2 = TMP_REG2;
+		}
+		else
+			src2 = TMP_ZERO;
+	}
+
+	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+	PTR_FAIL_IF(!jump);
+	set_jump(jump, compiler, (sljit_u32)((type & SLJIT_REWRITABLE_JUMP) | IS_COND));
+	type &= 0xff;
+
+	switch (type) {
+	case SLJIT_EQUAL:
+		inst = BNE | RS1(src1) | RS2(src2) | BRANCH_LENGTH;
+		break;
+	case SLJIT_NOT_EQUAL:
+		inst = BEQ | RS1(src1) | RS2(src2) | BRANCH_LENGTH;
+		break;
+	case SLJIT_LESS:
+		inst = BGEU | RS1(src1) | RS2(src2) | BRANCH_LENGTH;
+		break;
+	case SLJIT_GREATER_EQUAL:
+		inst = BLTU | RS1(src1) | RS2(src2) | BRANCH_LENGTH;
+		break;
+	case SLJIT_GREATER:
+		inst = BGEU | RS1(src2) | RS2(src1) | BRANCH_LENGTH;
+		break;
+	case SLJIT_LESS_EQUAL:
+		inst = BLTU | RS1(src2) | RS2(src1) | BRANCH_LENGTH;
+		break;
+	case SLJIT_SIG_LESS:
+		inst = BGE | RS1(src1) | RS2(src2) | BRANCH_LENGTH;
+		break;
+	case SLJIT_SIG_GREATER_EQUAL:
+		inst = BLT | RS1(src1) | RS2(src2) | BRANCH_LENGTH;
+		break;
+	case SLJIT_SIG_GREATER:
+		inst = BGE | RS1(src2) | RS2(src1) | BRANCH_LENGTH;
+		break;
+	case SLJIT_SIG_LESS_EQUAL:
+		inst = BLT | RS1(src2) | RS2(src1) | BRANCH_LENGTH;
+		break;
+	}
+
+	PTR_FAIL_IF(push_inst(compiler, inst));
+
+	jump->addr = compiler->size;
+	PTR_FAIL_IF(push_inst(compiler, JALR | RD(TMP_ZERO) | RS1(TMP_REG1) | IMM_I(0)));
+
+	/* Maximum number of instructions required for generating a constant. */
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	compiler->size += 1;
+#else
+	compiler->size += 5;
+#endif
+	return jump;
+}
+
+#undef BRANCH_LENGTH
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
+{
+	struct sljit_jump *jump;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
+
+	if (!(src & SLJIT_IMM)) {
+		if (src & SLJIT_MEM) {
+			ADJUST_LOCAL_OFFSET(src, srcw);
+			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
+			src = TMP_REG1;
+		}
+		return push_inst(compiler, JALR | RD((type >= SLJIT_FAST_CALL) ? RETURN_ADDR_REG : TMP_ZERO) | RS1(src) | IMM_I(0));
+	}
+
+	/* These jumps are converted to jump/call instructions when possible. */
+	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
+	FAIL_IF(!jump);
+	set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_CALL : 0));
+	jump->u.target = (sljit_uw)srcw;
+
+	jump->addr = compiler->size;
+	FAIL_IF(push_inst(compiler, JALR | RD((type >= SLJIT_FAST_CALL) ? RETURN_ADDR_REG : TMP_ZERO) | RS1(TMP_REG1) | IMM_I(0)));
+
+	/* Maximum number of instructions required for generating a constant. */
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	compiler->size += 1;
+#else
+	compiler->size += 5;
+#endif
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 arg_types,
+	sljit_s32 src, sljit_sw srcw)
+{
+	SLJIT_UNUSED_ARG(arg_types);
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
+		src = TMP_REG1;
+	}
+
+	if (type & SLJIT_CALL_RETURN) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+			FAIL_IF(push_inst(compiler, ADDI | RD(TMP_REG1) | RS1(src) | IMM_I(0)));
+			src = TMP_REG1;
+		}
+
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
+		type = SLJIT_JUMP;
+	}
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, type, src, srcw);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 type)
+{
+	sljit_s32 src_r, dst_r, invert;
+	sljit_s32 saved_op = op;
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	sljit_s32 mem_type = WORD_DATA;
+#else
+	sljit_s32 mem_type = ((op & SLJIT_32) || op == SLJIT_MOV32) ? (INT_DATA | SIGNED_DATA) : WORD_DATA;
+#endif
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	op = GET_OPCODE(op);
+	dst_r = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;
+
+	compiler->cache_arg = 0;
+	compiler->cache_argw = 0;
+
+	if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
+		FAIL_IF(emit_op_mem2(compiler, mem_type | LOAD_DATA, TMP_REG1, dst, dstw, dst, dstw));
+
+	if (type < SLJIT_F_EQUAL) {
+		src_r = OTHER_FLAG;
+		invert = type & 0x1;
+
+		switch (type) {
+		case SLJIT_EQUAL:
+		case SLJIT_NOT_EQUAL:
+			FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RS1(EQUAL_FLAG) | IMM_I(1)));
+			src_r = dst_r;
+			break;
+		case SLJIT_OVERFLOW:
+		case SLJIT_NOT_OVERFLOW:
+			if (compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)) {
+				src_r = OTHER_FLAG;
+				break;
+			}
+			FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RS1(OTHER_FLAG) | IMM_I(1)));
+			src_r = dst_r;
+			invert ^= 0x1;
+			break;
+		}
+	} else {
+		invert = 0;
+		src_r = OTHER_FLAG;
+
+		switch (type) {
+		case SLJIT_F_NOT_EQUAL:
+		case SLJIT_UNORDERED_OR_NOT_EQUAL:
+		case SLJIT_UNORDERED_OR_EQUAL: /* Not supported. */
+		case SLJIT_F_GREATER_EQUAL:
+		case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+		case SLJIT_UNORDERED_OR_LESS_EQUAL:
+		case SLJIT_F_GREATER:
+		case SLJIT_UNORDERED_OR_GREATER:
+		case SLJIT_UNORDERED_OR_LESS:
+		case SLJIT_UNORDERED:
+			invert = 1;
+			break;
+		}
+	}
+
+	if (invert) {
+		FAIL_IF(push_inst(compiler, XORI | RD(dst_r) | RS1(src_r) | IMM_I(1)));
+		src_r = dst_r;
+	}
+
+	if (op < SLJIT_ADD) {
+		if (dst & SLJIT_MEM)
+			return emit_op_mem(compiler, mem_type, src_r, dst, dstw);
+
+		if (src_r != dst_r)
+			return push_inst(compiler, ADDI | RD(dst_r) | RS1(src_r) | IMM_I(0));
+		return SLJIT_SUCCESS;
+	}
+
+	mem_type |= CUMULATIVE_OP | IMM_OP | ALT_KEEP_CACHE;
+
+	if (dst & SLJIT_MEM)
+		return emit_op(compiler, saved_op, mem_type, dst, dstw, TMP_REG1, 0, src_r, 0);
+	return emit_op(compiler, saved_op, mem_type, dst, dstw, dst, dstw, src_r, 0);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_reg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
+
+	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_s32 flags;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	if (SLJIT_UNLIKELY(mem & OFFS_REG_MASK)) {
+		memw &= 0x3;
+
+		if (SLJIT_UNLIKELY(memw != 0)) {
+			FAIL_IF(push_inst(compiler, SLLI | RD(TMP_REG1) | RS1(OFFS_REG(mem)) | IMM_I(memw)));
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(mem & REG_MASK)));
+		} else
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(mem & REG_MASK) | RS2(OFFS_REG(mem))));
+
+		mem = TMP_REG1;
+		memw = 0;
+	} else if (memw > SIMM_MAX - SSIZE_OF(sw) || memw < SIMM_MIN) {
+		if (((memw + 0x800) & 0xfff) <= 0xfff - SSIZE_OF(sw)) {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, TO_ARGW_HI(memw), TMP_REG3));
+			memw &= 0xfff;
+		} else {
+			FAIL_IF(load_immediate(compiler, TMP_REG1, memw, TMP_REG3));
+			memw = 0;
+		}
+
+		if (mem & REG_MASK)
+			FAIL_IF(push_inst(compiler, ADD | RD(TMP_REG1) | RS1(TMP_REG1) | RS2(mem & REG_MASK)));
+
+		mem = TMP_REG1;
+	} else {
+		mem &= REG_MASK;
+		memw &= 0xfff;
+	}
+
+	SLJIT_ASSERT((memw >= 0 && memw <= SIMM_MAX - SSIZE_OF(sw)) || (memw > SIMM_MAX && memw <= 0xfff));
+
+	if (!(type & SLJIT_MEM_STORE) && mem == REG_PAIR_FIRST(reg)) {
+		FAIL_IF(push_mem_inst(compiler, WORD_DATA | LOAD_DATA, REG_PAIR_SECOND(reg), mem, (memw + SSIZE_OF(sw)) & 0xfff));
+		return push_mem_inst(compiler, WORD_DATA | LOAD_DATA, REG_PAIR_FIRST(reg), mem, memw);
+	}
+
+	flags = WORD_DATA | (!(type & SLJIT_MEM_STORE) ? LOAD_DATA : 0);
+
+	FAIL_IF(push_mem_inst(compiler, flags, REG_PAIR_FIRST(reg), mem, memw));
+	return push_mem_inst(compiler, flags, REG_PAIR_SECOND(reg), mem, (memw + SSIZE_OF(sw)) & 0xfff);
+}
+
+#undef TO_ARGW_HI
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
+{
+	struct sljit_const *const_;
+	sljit_s32 dst_r;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
+	PTR_FAIL_IF(!const_);
+	set_const(const_, compiler);
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
+	PTR_FAIL_IF(emit_const(compiler, dst_r, init_value, ADDI | RD(dst_r)));
+
+	if (dst & SLJIT_MEM)
+		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw));
+
+	return const_;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
+{
+	struct sljit_put_label *put_label;
+	sljit_s32 dst_r;
+
+	CHECK_ERROR_PTR();
+	CHECK_PTR(check_sljit_emit_put_label(compiler, dst, dstw));
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	put_label = (struct sljit_put_label*)ensure_abuf(compiler, sizeof(struct sljit_put_label));
+	PTR_FAIL_IF(!put_label);
+	set_put_label(put_label, compiler, 0);
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
+	PTR_FAIL_IF(push_inst(compiler, (sljit_ins)dst_r));
+#if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
+	compiler->size += 1;
+#else
+	compiler->size += 5;
+#endif
+
+	if (dst & SLJIT_MEM)
+		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw));
+
+	return put_label;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
+{
+	sljit_set_jump_addr(addr, (sljit_uw)new_constant, executable_offset);
+}
diff --git a/src/sljit/sljitNativeS390X.c b/src/sljit/sljitNativeS390X.c
index 8eef910..8b51bad 100644
--- a/src/sljit/sljitNativeS390X.c
+++ b/src/sljit/sljitNativeS390X.c
@@ -103,11 +103,8 @@
 /* When reg cannot be unused. */
 #define IS_GPR_REG(reg)		((reg > 0) && (reg) <= SLJIT_SP)
 
-/* Link registers. The normal link register is r14, but since
-   we use that for flags we need to use r0 instead to do fast
-   calls so that flags are preserved. */
+/* Link register. */
 static const sljit_gpr link_r = 14;     /* r14 */
-static const sljit_gpr fast_link_r = 0; /* r0 */
 
 #define TMP_FREG1	(0)
 
@@ -220,7 +217,8 @@
 		}
 		/* fallthrough */
 
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_ORDERED_EQUAL:
 		return cc0;
 
 	case SLJIT_NOT_EQUAL:
@@ -234,13 +232,14 @@
 		}
 		/* fallthrough */
 
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
 		return (cc1 | cc2 | cc3);
 
 	case SLJIT_LESS:
 		return cc1;
 
 	case SLJIT_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
 		return (cc0 | cc2 | cc3);
 
 	case SLJIT_GREATER:
@@ -254,7 +253,8 @@
 		return (cc0 | cc1 | cc2);
 
 	case SLJIT_SIG_LESS:
-	case SLJIT_LESS_F64:
+	case SLJIT_F_LESS:
+	case SLJIT_ORDERED_LESS:
 		return cc1;
 
 	case SLJIT_NOT_CARRY:
@@ -263,7 +263,8 @@
 		/* fallthrough */
 
 	case SLJIT_SIG_LESS_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
 		return (cc0 | cc1);
 
 	case SLJIT_CARRY:
@@ -272,6 +273,7 @@
 		/* fallthrough */
 
 	case SLJIT_SIG_GREATER:
+	case SLJIT_UNORDERED_OR_GREATER:
 		/* Overflow is considered greater, see SLJIT_SUB. */
 		return cc2 | cc3;
 
@@ -283,7 +285,7 @@
 			return (cc2 | cc3);
 		/* fallthrough */
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		return cc3;
 
 	case SLJIT_NOT_OVERFLOW:
@@ -291,14 +293,29 @@
 			return (cc0 | cc1);
 		/* fallthrough */
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		return (cc0 | cc1 | cc2);
 
-	case SLJIT_GREATER_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+		return (cc1 | cc2);
+
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_GREATER:
 		return cc2;
 
-	case SLJIT_GREATER_EQUAL_F64:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
 		return (cc0 | cc2);
+
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
+		return (cc0 | cc1 | cc3);
+
+	case SLJIT_UNORDERED_OR_EQUAL:
+		return (cc0 | cc3);
+
+	case SLJIT_UNORDERED_OR_LESS:
+		return (cc1 | cc3);
 	}
 
 	SLJIT_UNREACHABLE();
@@ -978,7 +995,7 @@
 	(cond) ? EVAL(i1, r, addr) : EVAL(i2, r, addr)
 
 /* May clobber tmp1. */
-static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst,
+static sljit_s32 load_word(struct sljit_compiler *compiler, sljit_gpr dst_r,
 		sljit_s32 src, sljit_sw srcw,
 		sljit_s32 is_32bit)
 {
@@ -986,21 +1003,36 @@
 	sljit_ins ins;
 
 	SLJIT_ASSERT(src & SLJIT_MEM);
-	if (have_ldisp() || !is_32bit)
-		FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
-	else
+
+	if (is_32bit && ((src & OFFS_REG_MASK) || is_u12(srcw) || !is_s20(srcw))) {
 		FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1));
+		return push_inst(compiler, 0x58000000 /* l */ | R20A(dst_r) | R16A(addr.index) | R12A(addr.base) | (sljit_ins)addr.offset);
+	}
 
-	if (is_32bit)
-		ins = WHEN(is_u12(addr.offset), dst, l, ly, addr);
-	else
-		ins = lg(dst, addr.offset, addr.index, addr.base);
+	FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
 
-	return push_inst(compiler, ins);
+	ins = is_32bit ? 0xe30000000058 /* ly */ : 0xe30000000004 /* lg */;
+	return push_inst(compiler, ins | R36A(dst_r) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
 }
 
 /* May clobber tmp1. */
-static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src,
+static sljit_s32 load_unsigned_word(struct sljit_compiler *compiler, sljit_gpr dst_r,
+		sljit_s32 src, sljit_sw srcw,
+		sljit_s32 is_32bit)
+{
+	struct addr addr;
+	sljit_ins ins;
+
+	SLJIT_ASSERT(src & SLJIT_MEM);
+
+	FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
+
+	ins = is_32bit ? 0xe30000000016 /* llgf */ : 0xe30000000004 /* lg */;
+	return push_inst(compiler, ins | R36A(dst_r) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
+}
+
+/* May clobber tmp1. */
+static sljit_s32 store_word(struct sljit_compiler *compiler, sljit_gpr src_r,
 		sljit_s32 dst, sljit_sw dstw,
 		sljit_s32 is_32bit)
 {
@@ -1008,17 +1040,16 @@
 	sljit_ins ins;
 
 	SLJIT_ASSERT(dst & SLJIT_MEM);
-	if (have_ldisp() || !is_32bit)
-		FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp1));
-	else
+
+	if (is_32bit && ((dst & OFFS_REG_MASK) || is_u12(dstw) || !is_s20(dstw))) {
 		FAIL_IF(make_addr_bx(compiler, &addr, dst, dstw, tmp1));
+		return push_inst(compiler, 0x50000000 /* st */ | R20A(src_r) | R16A(addr.index) | R12A(addr.base) | (sljit_ins)addr.offset);
+	}
 
-	if (is_32bit)
-		ins = WHEN(is_u12(addr.offset), src, st, sty, addr);
-	else
-		ins = stg(src, addr.offset, addr.index, addr.base);
+	FAIL_IF(make_addr_bxy(compiler, &addr, dst, dstw, tmp1));
 
-	return push_inst(compiler, ins);
+	ins = is_32bit ? 0xe30000000050 /* sty */ : 0xe30000000024 /* stg */;
+	return push_inst(compiler, ins | R36A(src_r) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
 }
 
 #undef WHEN
@@ -1618,16 +1649,24 @@
 {
 	/* TODO(mundaym): implement all */
 	switch (feature_type) {
+	case SLJIT_HAS_FPU:
 	case SLJIT_HAS_CLZ:
-		return have_eimm() ? 1 : 0; /* FLOGR instruction */
+	case SLJIT_HAS_ROT:
+	case SLJIT_HAS_PREFETCH:
+		return 1;
+	case SLJIT_HAS_CTZ:
+		return 2;
 	case SLJIT_HAS_CMOV:
 		return have_lscond1() ? 1 : 0;
-	case SLJIT_HAS_FPU:
-		return 1;
 	}
 	return 0;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	return (type >= SLJIT_UNORDERED && type <= SLJIT_ORDERED_LESS_EQUAL);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Entry, exit                                                          */
 /* --------------------------------------------------------------------- */
@@ -1636,7 +1675,7 @@
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-	sljit_s32 word_arg_count = 0;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 	sljit_s32 offset, i, tmp;
 
 	CHECK_ERROR();
@@ -1648,8 +1687,13 @@
 
 	offset = 2 * SSIZE_OF(sw);
 	if (saveds + scratches >= SLJIT_NUMBER_OF_REGISTERS) {
-		FAIL_IF(push_inst(compiler, stmg(r6, r14, offset, r15))); /* save registers TODO(MGM): optimize */
-		offset += 9 * SSIZE_OF(sw);
+		if (saved_arg_count == 0) {
+			FAIL_IF(push_inst(compiler, stmg(r6, r14, offset, r15)));
+			offset += 9 * SSIZE_OF(sw);
+		} else {
+			FAIL_IF(push_inst(compiler, stmg(r6, r13 - (sljit_gpr)saved_arg_count, offset, r15)));
+			offset += (8 - saved_arg_count) * SSIZE_OF(sw);
+		}
 	} else {
 		if (scratches == SLJIT_FIRST_SAVED_REG) {
 			FAIL_IF(push_inst(compiler, stg(r6, offset, 0, r15)));
@@ -1659,15 +1703,30 @@
 			offset += (scratches - (SLJIT_FIRST_SAVED_REG - 1)) * SSIZE_OF(sw);
 		}
 
-		if (saveds == 0) {
-			FAIL_IF(push_inst(compiler, stg(r14, offset, 0, r15)));
-			offset += SSIZE_OF(sw);
-		} else {
-			FAIL_IF(push_inst(compiler, stmg(r14 - (sljit_gpr)saveds, r14, offset, r15)));
-			offset += (saveds + 1) * SSIZE_OF(sw);
+		if (saved_arg_count == 0) {
+			if (saveds == 0) {
+				FAIL_IF(push_inst(compiler, stg(r14, offset, 0, r15)));
+				offset += SSIZE_OF(sw);
+			} else {
+				FAIL_IF(push_inst(compiler, stmg(r14 - (sljit_gpr)saveds, r14, offset, r15)));
+				offset += (saveds + 1) * SSIZE_OF(sw);
+			}
+		} else if (saveds > saved_arg_count) {
+			if (saveds == saved_arg_count + 1) {
+				FAIL_IF(push_inst(compiler, stg(r14 - (sljit_gpr)saveds, offset, 0, r15)));
+				offset += SSIZE_OF(sw);
+			} else {
+				FAIL_IF(push_inst(compiler, stmg(r14 - (sljit_gpr)saveds, r13 - (sljit_gpr)saved_arg_count, offset, r15)));
+				offset += (saveds - saved_arg_count) * SSIZE_OF(sw);
+			}
 		}
 	}
 
+	if (saved_arg_count > 0) {
+		FAIL_IF(push_inst(compiler, stg(r14, offset, 0, r15)));
+		offset += SSIZE_OF(sw);
+	}
+
 	tmp = SLJIT_FS0 - fsaveds;
 	for (i = SLJIT_FS0; i > tmp; i--) {
 		FAIL_IF(push_inst(compiler, 0x60000000 /* std */ | F20(i) | R12A(r15) | (sljit_ins)offset));
@@ -1684,15 +1743,19 @@
 
 	FAIL_IF(push_inst(compiler, 0xe30000000071 /* lay */ | R36A(r15) | R28A(r15) | disp_s20(-local_size)));
 
+	if (options & SLJIT_ENTER_REG_ARG)
+		return SLJIT_SUCCESS;
+
 	arg_types >>= SLJIT_ARG_SHIFT;
+	saved_arg_count = 0;
 	tmp = 0;
 	while (arg_types > 0) {
 		if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64) {
 			if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
-				FAIL_IF(push_inst(compiler, lgr(gpr(SLJIT_S0 - tmp), gpr(SLJIT_R0 + word_arg_count))));
-				tmp++;
+				FAIL_IF(push_inst(compiler, lgr(gpr(SLJIT_S0 - saved_arg_count), gpr(SLJIT_R0 + tmp))));
+				saved_arg_count++;
 			}
-			word_arg_count++;
+			tmp++;
 		}
 
 		arg_types >>= SLJIT_ARG_SHIFT;
@@ -1713,12 +1776,13 @@
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_gpr last_reg)
 {
 	sljit_s32 offset, i, tmp;
 	sljit_s32 local_size = compiler->local_size;
 	sljit_s32 saveds = compiler->saveds;
 	sljit_s32 scratches = compiler->scratches;
+	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
 
 	if (is_u12(local_size))
 		FAIL_IF(push_inst(compiler, 0x41000000 /* ly */ | R20A(r15) | R12A(r15) | (sljit_ins)local_size));
@@ -1727,8 +1791,13 @@
 
 	offset = 2 * SSIZE_OF(sw);
 	if (saveds + scratches >= SLJIT_NUMBER_OF_REGISTERS) {
-		FAIL_IF(push_inst(compiler, lmg(r6, r14, offset, r15))); /* save registers TODO(MGM): optimize */
-		offset += 9 * SSIZE_OF(sw);
+		if (kept_saveds_count == 0) {
+			FAIL_IF(push_inst(compiler, lmg(r6, last_reg, offset, r15)));
+			offset += 9 * SSIZE_OF(sw);
+		} else {
+			FAIL_IF(push_inst(compiler, lmg(r6, r13 - (sljit_gpr)kept_saveds_count, offset, r15)));
+			offset += (8 - kept_saveds_count) * SSIZE_OF(sw);
+		}
 	} else {
 		if (scratches == SLJIT_FIRST_SAVED_REG) {
 			FAIL_IF(push_inst(compiler, lg(r6, offset, 0, r15)));
@@ -1738,15 +1807,35 @@
 			offset += (scratches - (SLJIT_FIRST_SAVED_REG - 1)) * SSIZE_OF(sw);
 		}
 
-		if (saveds == 0) {
-			FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
-			offset += SSIZE_OF(sw);
-		} else {
-			FAIL_IF(push_inst(compiler, lmg(r14 - (sljit_gpr)saveds, r14, offset, r15)));
-			offset += (saveds + 1) * SSIZE_OF(sw);
+		if (kept_saveds_count == 0) {
+			if (saveds == 0) {
+				if (last_reg == r14)
+					FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
+				offset += SSIZE_OF(sw);
+			} else if (saveds == 1 && last_reg == r13) {
+				FAIL_IF(push_inst(compiler, lg(r13, offset, 0, r15)));
+				offset += 2 * SSIZE_OF(sw);
+			} else {
+				FAIL_IF(push_inst(compiler, lmg(r14 - (sljit_gpr)saveds, last_reg, offset, r15)));
+				offset += (saveds + 1) * SSIZE_OF(sw);
+			}
+		} else if (saveds > kept_saveds_count) {
+			if (saveds == kept_saveds_count + 1) {
+				FAIL_IF(push_inst(compiler, lg(r14 - (sljit_gpr)saveds, offset, 0, r15)));
+				offset += SSIZE_OF(sw);
+			} else {
+				FAIL_IF(push_inst(compiler, lmg(r14 - (sljit_gpr)saveds, r13 - (sljit_gpr)kept_saveds_count, offset, r15)));
+				offset += (saveds - kept_saveds_count) * SSIZE_OF(sw);
+			}
 		}
 	}
 
+	if (kept_saveds_count > 0) {
+		if (last_reg == r14)
+			FAIL_IF(push_inst(compiler, lg(r14, offset, 0, r15)));
+		offset += SSIZE_OF(sw);
+	}
+
 	tmp = SLJIT_FS0 - compiler->fsaveds;
 	for (i = SLJIT_FS0; i > tmp; i--) {
 		FAIL_IF(push_inst(compiler, 0x68000000 /* ld */ | F20(i) | R12A(r15) | (sljit_ins)offset));
@@ -1766,10 +1855,33 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	FAIL_IF(emit_stack_frame_release(compiler, r14));
 	return push_inst(compiler, br(r14)); /* return */
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if (src & SLJIT_MEM) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		FAIL_IF(load_word(compiler, tmp1, src, srcw, 0 /* 64-bit */));
+		src = TMP_REG2;
+		srcw = 0;
+	} else if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		FAIL_IF(push_inst(compiler, lgr(tmp1, gpr(src))));
+		src = TMP_REG2;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, r13));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1858,6 +1970,47 @@
 	return push_inst(compiler, lgr(arg1, tmp0));
 }
 
+static sljit_s32 sljit_emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 op, sljit_gpr dst_r, sljit_gpr src_r)
+{
+	sljit_s32 is_ctz = (GET_OPCODE(op) == SLJIT_CTZ);
+
+	if ((op & SLJIT_32) && src_r != tmp0) {
+		FAIL_IF(push_inst(compiler, 0xb9160000 /* llgfr */ | R4A(tmp0) | R0A(src_r)));
+		src_r = tmp0;
+	}
+
+	if (is_ctz) {
+		FAIL_IF(push_inst(compiler, ((op & SLJIT_32) ? 0x1300 /* lcr */ : 0xb9030000 /* lcgr */) | R4A(tmp1) | R0A(src_r)));
+
+		if (src_r == tmp0)
+			FAIL_IF(push_inst(compiler, ((op & SLJIT_32) ? 0x1400 /* nr */ : 0xb9800000 /* ngr */) | R4A(tmp0) | R0A(tmp1)));
+		else
+			FAIL_IF(push_inst(compiler, 0xb9e40000 /* ngrk */ | R12A(tmp1) | R4A(tmp0) | R0A(src_r)));
+
+		src_r = tmp0;
+	}
+
+	FAIL_IF(push_inst(compiler, 0xb9830000 /* flogr */ | R4A(tmp0) | R0A(src_r)));
+
+	if (is_ctz)
+		FAIL_IF(push_inst(compiler, 0xec00000000d9 /* aghik */ | R36A(tmp1) | R32A(tmp0) | ((sljit_ins)(-64 & 0xffff) << 16)));
+
+	if (op & SLJIT_32) {
+		if (!is_ctz && dst_r != tmp0)
+			return push_inst(compiler, 0xec00000000d9 /* aghik */ | R36A(dst_r) | R32A(tmp0) | ((sljit_ins)(-32 & 0xffff) << 16));
+
+		FAIL_IF(push_inst(compiler, 0xc20800000000 /* agfi */ | R36A(tmp0) | (sljit_u32)-32));
+	}
+
+	if (is_ctz)
+		FAIL_IF(push_inst(compiler, 0xec0000000057 /* rxsbg */ | R36A(tmp0) | R32A(tmp1) | ((sljit_ins)((op & SLJIT_32) ? 59 : 58) << 24) | (63 << 16) | ((sljit_ins)((op & SLJIT_32) ? 5 : 6) << 8)));
+
+	if (dst_r == tmp0)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ((op & SLJIT_32) ? 0x1800 /* lr */ : 0xb9040000 /* lgr */) | R4A(dst_r) | R0A(tmp0));
+}
+
 /* LEVAL will be defined later with different parameters as needed */
 #define WHEN2(cond, i1, i2) (cond) ? LEVAL(i1) : LEVAL(i2)
 
@@ -2091,23 +2244,25 @@
 
 	dst_r = FAST_IS_REG(dst) ? gpr(REG_MASK & dst) : tmp0;
 	src_r = FAST_IS_REG(src) ? gpr(REG_MASK & src) : tmp0;
-	if (src & SLJIT_MEM)
-		FAIL_IF(load_word(compiler, src_r, src, srcw, src & SLJIT_32));
 
 	compiler->status_flags_state = op & (VARIABLE_FLAG_MASK | SLJIT_SET_Z);
 
 	/* TODO(mundaym): optimize loads and stores */
-	switch (opcode | (op & SLJIT_32)) {
+	switch (opcode) {
 	case SLJIT_NOT:
-		/* emulate ~x with x^-1 */
-		FAIL_IF(push_load_imm_inst(compiler, tmp1, -1));
-		if (src_r != dst_r)
-			FAIL_IF(push_inst(compiler, lgr(dst_r, src_r)));
+		if (src & SLJIT_MEM)
+			FAIL_IF(load_word(compiler, src_r, src, srcw, op & SLJIT_32));
 
-		FAIL_IF(push_inst(compiler, xgr(dst_r, tmp1)));
-		break;
-	case SLJIT_NOT32:
 		/* emulate ~x with x^-1 */
+		if (!(op & SLJIT_32)) {
+			FAIL_IF(push_load_imm_inst(compiler, tmp1, -1));
+			if (src_r != dst_r)
+				FAIL_IF(push_inst(compiler, lgr(dst_r, src_r)));
+
+			FAIL_IF(push_inst(compiler, xgr(dst_r, tmp1)));
+			break;
+		}
+
 		if (have_eimm())
 			FAIL_IF(push_inst(compiler, xilf(dst_r, 0xffffffff)));
 		else {
@@ -2119,24 +2274,11 @@
 		}
 		break;
 	case SLJIT_CLZ:
-		if (have_eimm()) {
-			FAIL_IF(push_inst(compiler, flogr(tmp0, src_r))); /* clobbers tmp1 */
-			if (dst_r != tmp0)
-				FAIL_IF(push_inst(compiler, lgr(dst_r, tmp0)));
-		} else {
-			abort(); /* TODO(mundaym): no eimm (?) */
-		}
-		break;
-	case SLJIT_CLZ32:
-		if (have_eimm()) {
-			FAIL_IF(push_inst(compiler, sllg(tmp1, src_r, 32, 0)));
-			FAIL_IF(push_inst(compiler, iilf(tmp1, 0xffffffff)));
-			FAIL_IF(push_inst(compiler, flogr(tmp0, tmp1))); /* clobbers tmp1 */
-			if (dst_r != tmp0)
-				FAIL_IF(push_inst(compiler, lr(dst_r, tmp0)));
-		} else {
-			abort(); /* TODO(mundaym): no eimm (?) */
-		}
+	case SLJIT_CTZ:
+		if (src & SLJIT_MEM)
+			FAIL_IF(load_unsigned_word(compiler, src_r, src, srcw, op & SLJIT_32));
+
+		FAIL_IF(sljit_emit_clz_ctz(compiler, op, dst_r, src_r));
 		break;
 	default:
 		SLJIT_UNREACHABLE();
@@ -2145,9 +2287,8 @@
 	if ((op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_SET_Z | SLJIT_SET_OVERFLOW))
 		FAIL_IF(update_zero_overflow(compiler, op, dst_r));
 
-	/* TODO(carenas): doesn't need FAIL_IF */
 	if (dst & SLJIT_MEM)
-		FAIL_IF(store_word(compiler, dst_r, dst, dstw, op & SLJIT_32));
+		return store_word(compiler, dst_r, dst, dstw, op & SLJIT_32);
 
 	return SLJIT_SUCCESS;
 }
@@ -2166,11 +2307,6 @@
 	return 0;
 }
 
-static SLJIT_INLINE int is_shift(sljit_s32 op) {
-	sljit_s32 v = GET_OPCODE(op);
-	return (v == SLJIT_SHL || v == SLJIT_ASHR || v == SLJIT_LSHR) ? 1 : 0;
-}
-
 static const struct ins_forms add_forms = {
 	0x1a00, /* ar */
 	0xb9080000, /* agr */
@@ -2604,33 +2740,41 @@
 	sljit_ins ins;
 
 	if (FAST_IS_REG(src1))
-		src_r = gpr(src1 & REG_MASK);
+		src_r = gpr(src1);
 	else
 		FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
 
-	if (src2 & SLJIT_IMM)
+	if (!(src2 & SLJIT_IMM)) {
+		if (FAST_IS_REG(src2))
+			base_r = gpr(src2);
+		else {
+			FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+			base_r = tmp1;
+		}
+
+		if ((op & SLJIT_32) && (type == SLJIT_MSHL || type == SLJIT_MLSHR || type == SLJIT_MASHR)) {
+			if (base_r != tmp1) {
+				FAIL_IF(push_inst(compiler, 0xec0000000055 /* risbg */ | R36A(tmp1) | R32A(base_r) | (59 << 24) | (1 << 23) | (63 << 16)));
+				base_r = tmp1;
+			} else
+				FAIL_IF(push_inst(compiler, 0xa5070000 /* nill */ | R20A(tmp1) | 0x1f));
+		}
+	} else
 		imm = (sljit_ins)(src2w & ((op & SLJIT_32) ? 0x1f : 0x3f));
-	else if (FAST_IS_REG(src2))
-		base_r = gpr(src2 & REG_MASK);
-	else {
-		FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
-		base_r = tmp1;
-	}
 
 	if ((op & SLJIT_32) && dst_r == src_r) {
-		if (type == SLJIT_SHL)
+		if (type == SLJIT_SHL || type == SLJIT_MSHL)
 			ins = 0x89000000 /* sll */;
-		else if (type == SLJIT_LSHR)
+		else if (type == SLJIT_LSHR || type == SLJIT_MLSHR)
 			ins = 0x88000000 /* srl */;
 		else
 			ins = 0x8a000000 /* sra */;
 
 		FAIL_IF(push_inst(compiler, ins | R20A(dst_r) | R12A(base_r) | imm));
-	}
-	else {
-		if (type == SLJIT_SHL)
+	} else {
+		if (type == SLJIT_SHL || type == SLJIT_MSHL)
 			ins = (op & SLJIT_32) ? 0xeb00000000df /* sllk */ : 0xeb000000000d /* sllg */;
-		else if (type == SLJIT_LSHR)
+		else if (type == SLJIT_LSHR || type == SLJIT_MLSHR)
 			ins = (op & SLJIT_32) ? 0xeb00000000de /* srlk */ : 0xeb000000000c /* srlg */;
 		else
 			ins = (op & SLJIT_32) ? 0xeb00000000dc /* srak */ : 0xeb000000000a /* srag */;
@@ -2644,6 +2788,47 @@
 	return SLJIT_SUCCESS;
 }
 
+static sljit_s32 sljit_emit_rotate(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_gpr dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+	sljit_gpr src_r = tmp0;
+	sljit_gpr base_r = tmp0;
+	sljit_ins imm = 0;
+	sljit_ins ins;
+
+	if (FAST_IS_REG(src1))
+		src_r = gpr(src1);
+	else
+		FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
+
+	if (!(src2 & SLJIT_IMM)) {
+		if (FAST_IS_REG(src2))
+			base_r = gpr(src2);
+		else {
+			FAIL_IF(emit_move(compiler, tmp1, src2, src2w));
+			base_r = tmp1;
+		}
+	}
+
+	if (GET_OPCODE(op) == SLJIT_ROTR) {
+		if (!(src2 & SLJIT_IMM)) {
+			ins = (op & SLJIT_32) ? 0x1300 /* lcr */ : 0xb9030000 /* lcgr */;
+			FAIL_IF(push_inst(compiler, ins | R4A(tmp1) | R0A(base_r)));
+			base_r = tmp1;
+		} else
+			src2w = -src2w;
+	}
+
+	if (src2 & SLJIT_IMM)
+		imm = (sljit_ins)(src2w & ((op & SLJIT_32) ? 0x1f : 0x3f));
+
+	ins = (op & SLJIT_32) ? 0xeb000000001d /* rll */ : 0xeb000000001c /* rllg */;
+	return push_inst(compiler, ins | R36A(dst_r) | R32A(src_r) | R28A(base_r) | (imm << 16));
+}
+
 static const struct ins_forms addc_forms = {
 	0xb9980000, /* alcr */
 	0xb9880000, /* alcgr */
@@ -2716,10 +2901,17 @@
 		FAIL_IF(sljit_emit_bitwise(compiler, op, dst, src1, src1w, src2, src2w));
 		break;
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		FAIL_IF(sljit_emit_shift(compiler, op, dst, src1, src1w, src2, src2w));
 		break;
+	case SLJIT_ROTL:
+	case SLJIT_ROTR:
+		FAIL_IF(sljit_emit_rotate(compiler, op, dst, src1, src1w, src2, src2w));
+		break;
 	}
 
 	if (dst & SLJIT_MEM)
@@ -2734,18 +2926,130 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, (sljit_s32)tmp0, 0, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 is_right;
+	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
+	sljit_gpr src_dst_r = gpr(src_dst);
+	sljit_gpr src1_r = tmp0;
+	sljit_gpr src2_r = tmp1;
+	sljit_ins ins;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+
+	is_right = (GET_OPCODE(op) == SLJIT_LSHR || GET_OPCODE(op) == SLJIT_MLSHR);
+
+	if (src_dst == src1) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_op2(compiler, (is_right ? SLJIT_ROTR : SLJIT_ROTL) | (op & SLJIT_32), src_dst, 0, src_dst, 0, src2, src2w);
+	}
+
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	if (src1 & SLJIT_MEM)
+		FAIL_IF(load_word(compiler, tmp0, src1, src1w, op & SLJIT_32));
+	else if (src1 & SLJIT_IMM)
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, src1w));
+	else
+		src1_r = gpr(src1);
+
+	if (src2 & SLJIT_IMM) {
+		src2w &= bit_length - 1;
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	} else if (!(src2 & SLJIT_MEM))
+		src2_r = gpr(src2);
+	else
+		FAIL_IF(load_word(compiler, tmp1, src2, src2w, op & SLJIT_32));
+
+	if (src2 & SLJIT_IMM) {
+		if (op & SLJIT_32) {
+			ins = is_right ? 0x88000000 /* srl */ : 0x89000000 /* sll */;
+			FAIL_IF(push_inst(compiler, ins | R20A(src_dst_r) | (sljit_ins)src2w));
+		} else {
+			ins = is_right ? 0xeb000000000c /* srlg */ : 0xeb000000000d /* sllg */;
+			FAIL_IF(push_inst(compiler, ins | R36A(src_dst_r) | R32A(src_dst_r) | ((sljit_ins)src2w << 16)));
+		}
+
+		ins = 0xec0000000055 /* risbg */;
+
+		if (is_right) {
+			src2w = bit_length - src2w;
+			ins |= ((sljit_ins)(64 - bit_length) << 24) | ((sljit_ins)(63 - src2w) << 16) | ((sljit_ins)src2w << 8);
+		} else
+			ins |= ((sljit_ins)(64 - src2w) << 24) | ((sljit_ins)63 << 16) | ((sljit_ins)src2w << 8);
+
+		return push_inst(compiler, ins | R36A(src_dst_r) | R32A(src1_r));
+	}
+
+	if (op & SLJIT_32) {
+		if (GET_OPCODE(op) == SLJIT_MSHL || GET_OPCODE(op) == SLJIT_MLSHR) {
+			if (src2_r != tmp1) {
+				FAIL_IF(push_inst(compiler, 0xec0000000055 /* risbg */ | R36A(tmp1) | R32A(src2_r) | (59 << 24) | (1 << 23) | (63 << 16)));
+				src2_r = tmp1;
+			} else
+				FAIL_IF(push_inst(compiler, 0xa5070000 /* nill */ | R20A(tmp1) | 0x1f));
+		}
+
+		ins = is_right ? 0x88000000 /* srl */ : 0x89000000 /* sll */;
+		FAIL_IF(push_inst(compiler, ins | R20A(src_dst_r) | R12A(src2_r)));
+
+		if (src2_r != tmp1) {
+			FAIL_IF(push_inst(compiler, 0xa50f0000 /* llill */ | R20A(tmp1) | 0x1f));
+			FAIL_IF(push_inst(compiler, 0x1700 /* xr */ | R4A(tmp1) | R0A(src2_r)));
+		} else
+			FAIL_IF(push_inst(compiler, 0xc00700000000 /* xilf */ | R36A(tmp1) | 0x1f));
+
+		if (src1_r == tmp0) {
+			ins = is_right ? 0x89000000 /* sll */ : 0x88000000 /* srl */;
+			FAIL_IF(push_inst(compiler, ins | R20A(tmp0) | R12A(tmp1) | 0x1));
+		} else {
+			ins = is_right ? 0xeb00000000df /* sllk */ : 0xeb00000000de /* srlk */;
+			FAIL_IF(push_inst(compiler, ins | R36A(tmp0) | R32A(src1_r) | R28A(tmp1) | (0x1 << 16)));
+		}
+
+		return push_inst(compiler, 0x1600 /* or */ | R4A(src_dst_r) | R0A(tmp0));
+	}
+
+	ins = is_right ? 0xeb000000000c /* srlg */ : 0xeb000000000d /* sllg */;
+	FAIL_IF(push_inst(compiler, ins | R36A(src_dst_r) | R32A(src_dst_r) | R28A(src2_r)));
+
+	ins = is_right ? 0xeb000000000d /* sllg */ : 0xeb000000000c /* srlg */;
+
+	if (!(op & SLJIT_SHIFT_INTO_NON_ZERO)) {
+		if (src2_r != tmp1)
+			FAIL_IF(push_inst(compiler, 0xa50f0000 /* llill */ | R20A(tmp1) | 0x3f));
+
+		FAIL_IF(push_inst(compiler, ins | R36A(tmp0) | R32A(src1_r) | (0x1 << 16)));
+		src1_r = tmp0;
+
+		if (src2_r != tmp1)
+			FAIL_IF(push_inst(compiler, 0xb9820000 /* xgr */ | R4A(tmp1) | R0A(src2_r)));
+		else
+			FAIL_IF(push_inst(compiler, 0xc00700000000 /* xilf */ | R36A(tmp1) | 0x3f));
+	} else
+		FAIL_IF(push_inst(compiler, 0xb9030000 /* lcgr */ | R4A(tmp1) | R0A(src2_r)));
+
+	FAIL_IF(push_inst(compiler, ins | R36A(tmp0) | R32A(src1_r) | R28A(tmp1)));
+	return push_inst(compiler, 0xb9810000 /* ogr */ | R4A(src_dst_r) | R0A(tmp0));
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(
 	struct sljit_compiler *compiler,
 	sljit_s32 op, sljit_s32 src, sljit_sw srcw)
 {
 	sljit_gpr src_r;
+	struct addr addr;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
@@ -2759,16 +3063,14 @@
 
 		return push_inst(compiler, br(src_r));
 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
-		/* TODO(carenas): implement? */
 		return SLJIT_SUCCESS;
 	case SLJIT_PREFETCH_L1:
 	case SLJIT_PREFETCH_L2:
 	case SLJIT_PREFETCH_L3:
 	case SLJIT_PREFETCH_ONCE:
-		/* TODO(carenas): implement */
-		return SLJIT_SUCCESS;
+		FAIL_IF(make_addr_bxy(compiler, &addr, src, srcw, tmp1));
+		return push_inst(compiler, 0xe31000000036 /* pfd */ | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset));
 	default:
-                /* TODO(carenas): probably should not success by default */
 		return SLJIT_SUCCESS;
 	}
 
@@ -3064,10 +3366,10 @@
 	ADJUST_LOCAL_OFFSET(dst, dstw);
 
 	if (FAST_IS_REG(dst))
-		return push_inst(compiler, lgr(gpr(dst), fast_link_r));
+		return push_inst(compiler, lgr(gpr(dst), link_r));
 
 	/* memory */
-	return store_word(compiler, fast_link_r, dst, dstw, 0);
+	return store_word(compiler, link_r, dst, dstw, 0);
 }
 
 /* --------------------------------------------------------------------- */
@@ -3107,7 +3409,7 @@
 	/* emit jump instruction */
 	type &= 0xff;
 	if (type >= SLJIT_FAST_CALL)
-		PTR_FAIL_IF(push_inst(compiler, brasl(type == SLJIT_FAST_CALL ? fast_link_r : link_r, 0)));
+		PTR_FAIL_IF(push_inst(compiler, brasl(link_r, 0)));
 	else
 		PTR_FAIL_IF(push_inst(compiler, brcl(mask, 0)));
 
@@ -3117,19 +3419,16 @@
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 arg_types)
 {
+	SLJIT_UNUSED_ARG(arg_types);
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, r14));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
 }
 
@@ -3151,7 +3450,7 @@
 
 	/* emit jump instruction */
 	if (type >= SLJIT_FAST_CALL)
-		return push_inst(compiler, basr(type == SLJIT_FAST_CALL ? fast_link_r : link_r, src_r));
+		return push_inst(compiler, basr(link_r, src_r));
 
 	return push_inst(compiler, br(src_r));
 }
@@ -3169,23 +3468,21 @@
 		ADJUST_LOCAL_OFFSET(src, srcw);
 		FAIL_IF(load_word(compiler, tmp1, src, srcw, 0 /* 64-bit */));
 		src = TMP_REG2;
+		srcw = 0;
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			FAIL_IF(push_inst(compiler, lgr(tmp1, gpr(src))));
 			src = TMP_REG2;
+			srcw = 0;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
+		FAIL_IF(emit_stack_frame_release(compiler, r14));
 		type = SLJIT_JUMP;
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
@@ -3193,7 +3490,7 @@
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
 {
-	sljit_u8 mask = get_cc(compiler, type & 0xff);
+	sljit_u8 mask = get_cc(compiler, type);
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
@@ -3263,27 +3560,92 @@
 	sljit_s32 dst_reg,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_u8 mask = get_cc(compiler, type & 0xff);
-	sljit_gpr dst_r = gpr(dst_reg & ~SLJIT_32);
-	sljit_gpr src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;
+	sljit_ins mask = get_cc(compiler, type & ~SLJIT_32);
+	sljit_gpr src_r;
+	sljit_ins ins;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
-	if (src & SLJIT_IMM) {
-		/* TODO(mundaym): fast path with lscond2 */
-		FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
+	if (type & SLJIT_32)
+		srcw = (sljit_s32)srcw;
+
+	if (have_lscond2() && (src & SLJIT_IMM) && is_s16(srcw)) {
+		ins = (type & SLJIT_32) ? 0xec0000000042 /* lochi */ : 0xec0000000046 /* locghi */;
+		return push_inst(compiler, ins | R36A(gpr(dst_reg)) | (mask << 32) | (sljit_ins)(srcw & 0xffff) << 16);
 	}
 
-	#define LEVAL(i) i(dst_r, src_r, mask)
-	if (have_lscond1())
-		return push_inst(compiler,
-			WHEN2(dst_reg & SLJIT_32, locr, locgr));
+	if (src & SLJIT_IMM) {
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, srcw));
+		src_r = tmp0;
+	} else
+		src_r = gpr(src);
 
-	#undef LEVAL
+	if (have_lscond1()) {
+		ins = (type & SLJIT_32) ? 0xb9f20000 /* locr */ : 0xb9e20000 /* locgr */;
+		return push_inst(compiler, ins | (mask << 12) | R4A(gpr(dst_reg)) | R0A(src_r));
+	}
 
-	/* TODO(mundaym): implement */
-	return SLJIT_ERR_UNSUPPORTED;
+	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_ins ins, reg1, reg2, base, offs = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	base = gpr(mem & REG_MASK);
+	reg1 = gpr(REG_PAIR_FIRST(reg));
+	reg2 = gpr(REG_PAIR_SECOND(reg));
+
+	if (mem & OFFS_REG_MASK) {
+		memw &= 0x3;
+		offs = gpr(OFFS_REG(mem));
+
+		if (memw != 0) {
+			FAIL_IF(push_inst(compiler, 0xeb000000000d /* sllg */ | R36A(tmp1) | R32A(offs) | ((sljit_ins)memw << 16)));
+			offs = tmp1;
+		} else if (!(type & SLJIT_MEM_STORE) && (base == reg1 || base == reg2) && (offs == reg1 || offs == reg2)) {
+			FAIL_IF(push_inst(compiler, 0xb9f80000 | R12A(tmp1) | R4A(base) | R0A(offs)));
+			base = tmp1;
+			offs = 0;
+		}
+
+		memw = 0;
+	} else if (memw < -0x80000 || memw > 0x7ffff - ((reg2 == reg1 + 1) ? 0 : SSIZE_OF(sw))) {
+		FAIL_IF(push_load_imm_inst(compiler, tmp1, memw));
+
+		if (base == 0)
+			base = tmp1;
+		else
+			offs = tmp1;
+
+		memw = 0;
+	}
+
+	if (offs == 0 && reg2 == (reg1 + 1)) {
+		ins = (type & SLJIT_MEM_STORE) ? 0xeb0000000024 /* stmg */ : 0xeb0000000004 /* lmg */;
+		return push_inst(compiler, ins | R36A(reg1) | R32A(reg2) | R28A(base) | disp_s20((sljit_s32)memw));
+	}
+
+	ins = ((type & SLJIT_MEM_STORE) ? 0xe30000000024 /* stg */ : 0xe30000000004 /* lg */) | R32A(offs) | R28A(base);
+
+	if (!(type & SLJIT_MEM_STORE) && base == reg1) {
+		FAIL_IF(push_inst(compiler, ins | R36A(reg2) | disp_s20((sljit_s32)memw + SSIZE_OF(sw))));
+		return push_inst(compiler, ins | R36A(reg1) | disp_s20((sljit_s32)memw));
+	}
+
+	FAIL_IF(push_inst(compiler, ins | R36A(reg1) | disp_s20((sljit_s32)memw)));
+	return push_inst(compiler, ins | R36A(reg2) | disp_s20((sljit_s32)memw + SSIZE_OF(sw)));
 }
 
 /* --------------------------------------------------------------------- */
diff --git a/src/sljit/sljitNativeSPARC_32.c b/src/sljit/sljitNativeSPARC_32.c
deleted file mode 100644
index 218992b..0000000
--- a/src/sljit/sljitNativeSPARC_32.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- *    Stack-less Just-In-Time compiler
- *
- *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are
- * permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice, this list of
- *      conditions and the following disclaimer.
- *
- *   2. Redistributions in binary form must reproduce the above copyright notice, this list
- *      of conditions and the following disclaimer in the documentation and/or other materials
- *      provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw imm)
-{
-	if (imm <= SIMM_MAX && imm >= SIMM_MIN)
-		return push_inst(compiler, OR | D(dst) | S1(0) | IMM(imm), DR(dst));
-
-	FAIL_IF(push_inst(compiler, SETHI | D(dst) | ((imm >> 10) & 0x3fffff), DR(dst)));
-	return (imm & 0x3ff) ? push_inst(compiler, OR | D(dst) | S1(dst) | IMM_ARG | (imm & 0x3ff), DR(dst)) : SLJIT_SUCCESS;
-}
-
-#define ARG2(flags, src2) ((flags & SRC2_IMM) ? IMM(src2) : S2(src2))
-
-static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_u32 flags,
-	sljit_s32 dst, sljit_s32 src1, sljit_sw src2)
-{
-	SLJIT_COMPILE_ASSERT(ICC_IS_SET == SET_FLAGS, icc_is_set_and_set_flags_must_be_the_same);
-
-	switch (op) {
-	case SLJIT_MOV:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if (dst != src2)
-			return push_inst(compiler, OR | D(dst) | S1(0) | S2(src2), DR(dst));
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U8:
-	case SLJIT_MOV_S8:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-			if (op == SLJIT_MOV_U8)
-				return push_inst(compiler, AND | D(dst) | S1(src2) | IMM(0xff), DR(dst));
-			FAIL_IF(push_inst(compiler, SLL | D(dst) | S1(src2) | IMM(24), DR(dst)));
-			return push_inst(compiler, SRA | D(dst) | S1(dst) | IMM(24), DR(dst));
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_MOV_U16:
-	case SLJIT_MOV_S16:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
-			FAIL_IF(push_inst(compiler, SLL | D(dst) | S1(src2) | IMM(16), DR(dst)));
-			return push_inst(compiler, (op == SLJIT_MOV_S16 ? SRA : SRL) | D(dst) | S1(dst) | IMM(16), DR(dst));
-		}
-		SLJIT_ASSERT(dst == src2);
-		return SLJIT_SUCCESS;
-
-	case SLJIT_NOT:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		return push_inst(compiler, XNOR | (flags & SET_FLAGS) | D(dst) | S1(0) | S2(src2), DRF(dst, flags));
-
-	case SLJIT_CLZ:
-		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
-		FAIL_IF(push_inst(compiler, SUB | SET_FLAGS | D(0) | S1(src2) | S2(0), SET_FLAGS));
-		FAIL_IF(push_inst(compiler, OR | D(TMP_REG1) | S1(0) | S2(src2), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, BICC | DA(0x1) | (7 & DISP_MASK), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, OR | D(dst) | S1(0) | IMM(32), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, OR | D(dst) | S1(0) | IMM(-1), DR(dst)));
-
-		/* Loop. */
-		FAIL_IF(push_inst(compiler, SUB | SET_FLAGS | D(0) | S1(TMP_REG1) | S2(0), SET_FLAGS));
-		FAIL_IF(push_inst(compiler, SLL | D(TMP_REG1) | S1(TMP_REG1) | IMM(1), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, BICC | DA(0xe) | ((sljit_ins)-2 & DISP_MASK), UNMOVABLE_INS));
-		return push_inst(compiler, ADD | D(dst) | S1(dst) | IMM(1), UNMOVABLE_INS);
-
-	case SLJIT_ADD:
-		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
-		return push_inst(compiler, ADD | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_ADDC:
-		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
-		return push_inst(compiler, ADDC | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_SUB:
-		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
-		return push_inst(compiler, SUB | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_SUBC:
-		compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
-		return push_inst(compiler, SUBC | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_MUL:
-		compiler->status_flags_state = 0;
-		FAIL_IF(push_inst(compiler, SMUL | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst)));
-		if (!(flags & SET_FLAGS))
-			return SLJIT_SUCCESS;
-		FAIL_IF(push_inst(compiler, SRA | D(TMP_REG1) | S1(dst) | IMM(31), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, RDY | D(TMP_LINK), DR(TMP_LINK)));
-		return push_inst(compiler, SUB | SET_FLAGS | D(0) | S1(TMP_REG1) | S2(TMP_LINK), MOVABLE_INS | SET_FLAGS);
-
-	case SLJIT_AND:
-		return push_inst(compiler, AND | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_OR:
-		return push_inst(compiler, OR | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_XOR:
-		return push_inst(compiler, XOR | (flags & SET_FLAGS) | D(dst) | S1(src1) | ARG2(flags, src2), DRF(dst, flags));
-
-	case SLJIT_SHL:
-		FAIL_IF(push_inst(compiler, SLL | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst)));
-		return !(flags & SET_FLAGS) ? SLJIT_SUCCESS : push_inst(compiler, SUB | SET_FLAGS | D(0) | S1(dst) | S2(0), SET_FLAGS);
-
-	case SLJIT_LSHR:
-		FAIL_IF(push_inst(compiler, SRL | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst)));
-		return !(flags & SET_FLAGS) ? SLJIT_SUCCESS : push_inst(compiler, SUB | SET_FLAGS | D(0) | S1(dst) | S2(0), SET_FLAGS);
-
-	case SLJIT_ASHR:
-		FAIL_IF(push_inst(compiler, SRA | D(dst) | S1(src1) | ARG2(flags, src2), DR(dst)));
-		return !(flags & SET_FLAGS) ? SLJIT_SUCCESS : push_inst(compiler, SUB | SET_FLAGS | D(0) | S1(dst) | S2(0), SET_FLAGS);
-	}
-
-	SLJIT_UNREACHABLE();
-	return SLJIT_SUCCESS;
-}
-
-static sljit_s32 call_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *src)
-{
-	sljit_s32 reg_index = 8;
-	sljit_s32 word_reg_index = 8;
-	sljit_s32 float_arg_index = 1;
-	sljit_s32 double_arg_count = 0;
-	sljit_u32 float_offset = (16 + 6) * sizeof(sljit_sw);
-	sljit_s32 types = 0;
-	sljit_s32 reg = 0;
-	sljit_s32 move_to_tmp2 = 0;
-
-	if (src)
-		reg = reg_map[*src & REG_MASK];
-
-	arg_types >>= SLJIT_ARG_SHIFT;
-
-	while (arg_types) {
-		types = (types << SLJIT_ARG_SHIFT) | (arg_types & SLJIT_ARG_MASK);
-
-		switch (arg_types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			float_arg_index++;
-			double_arg_count++;
-			if (reg_index == reg || reg_index + 1 == reg)
-				move_to_tmp2 = 1;
-			reg_index += 2;
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			float_arg_index++;
-			if (reg_index == reg)
-				move_to_tmp2 = 1;
-			reg_index++;
-			break;
-		default:
-			if (reg_index != word_reg_index && reg_index == reg)
-				move_to_tmp2 = 1;
-			reg_index++;
-			word_reg_index++;
-			break;
-		}
-
-		arg_types >>= SLJIT_ARG_SHIFT;
-	}
-
-	if (move_to_tmp2) {
-		if (reg < 14)
-			FAIL_IF(push_inst(compiler, OR | D(TMP_REG1) | S1(0) | S2A(reg), DR(TMP_REG1)));
-		*src = TMP_REG1;
-	}
-
-	arg_types = types;
-
-	while (arg_types) {
-		switch (arg_types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			float_arg_index--;
-			if (float_arg_index == 4 && double_arg_count == 4) {
-				/* The address is not doubleword aligned, so two instructions are required to store the double. */
-				FAIL_IF(push_inst(compiler, STF | FD(float_arg_index) | S1(SLJIT_SP) | IMM((16 + 7) * sizeof(sljit_sw)), MOVABLE_INS));
-				FAIL_IF(push_inst(compiler, STF | FD(float_arg_index) | (1 << 25) | S1(SLJIT_SP) | IMM((16 + 8) * sizeof(sljit_sw)), MOVABLE_INS));
-			}
-			else
-				FAIL_IF(push_inst(compiler, STDF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-			float_offset -= sizeof(sljit_f64);
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			float_arg_index--;
-			FAIL_IF(push_inst(compiler, STF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-			float_offset -= sizeof(sljit_f64);
-			break;
-		default:
-			break;
-		}
-
-		arg_types >>= SLJIT_ARG_SHIFT;
-	}
-
-	float_offset = (16 + 6) * sizeof(sljit_sw);
-
-	while (types) {
-		switch (types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			reg_index -= 2;
-			if (reg_index < 14) {
-				if ((reg_index & 0x1) != 0) {
-					FAIL_IF(push_inst(compiler, LDUW | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), reg_index));
-					if (reg_index < 8 + 6 - 1)
-						FAIL_IF(push_inst(compiler, LDUW | DA(reg_index + 1) | S1(SLJIT_SP) | IMM(float_offset + sizeof(sljit_sw)), reg_index + 1));
-				}
-				else
-					FAIL_IF(push_inst(compiler, LDD | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), reg_index));
-			}
-			float_offset -= sizeof(sljit_f64);
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			reg_index--;
-			if (reg_index < 8 + 6)
-				FAIL_IF(push_inst(compiler, LDUW | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), reg_index));
-			float_offset -= sizeof(sljit_f64);
-			break;
-		default:
-			reg_index--;
-			word_reg_index--;
-
-			if (reg_index != word_reg_index) {
-				if (reg_index < 14)
-					FAIL_IF(push_inst(compiler, OR | DA(reg_index) | S1(0) | S2A(word_reg_index), reg_index));
-				else
-					FAIL_IF(push_inst(compiler, STW | DA(word_reg_index) | S1(SLJIT_SP) | IMM(92), word_reg_index));
-			}
-			break;
-		}
-
-		types >>= SLJIT_ARG_SHIFT;
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value)
-{
-	FAIL_IF(push_inst(compiler, SETHI | D(dst) | ((init_value >> 10) & 0x3fffff), DR(dst)));
-	return push_inst(compiler, OR | D(dst) | S1(dst) | IMM_ARG | (init_value & 0x3ff), DR(dst));
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
-{
-	sljit_ins *inst = (sljit_ins *)addr;
-	SLJIT_UNUSED_ARG(executable_offset);
-
-	SLJIT_UPDATE_WX_FLAGS(inst, inst + 2, 0);
-	SLJIT_ASSERT(((inst[0] & 0xc1c00000) == 0x01000000) && ((inst[1] & 0xc1f82000) == 0x80102000));
-	inst[0] = (inst[0] & 0xffc00000) | ((new_target >> 10) & 0x3fffff);
-	inst[1] = (inst[1] & 0xfffffc00) | (new_target & 0x3ff);
-	SLJIT_UPDATE_WX_FLAGS(inst, inst + 2, 1);
-	inst = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(inst, executable_offset);
-	SLJIT_CACHE_FLUSH(inst, inst + 2);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
-{
-	sljit_set_jump_addr(addr, (sljit_uw)new_constant, executable_offset);
-}
diff --git a/src/sljit/sljitNativeSPARC_common.c b/src/sljit/sljitNativeSPARC_common.c
deleted file mode 100644
index c8d19e1..0000000
--- a/src/sljit/sljitNativeSPARC_common.c
+++ /dev/null
@@ -1,1673 +0,0 @@
-/*
- *    Stack-less Just-In-Time compiler
- *
- *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are
- * permitted provided that the following conditions are met:
- *
- *   1. Redistributions of source code must retain the above copyright notice, this list of
- *      conditions and the following disclaimer.
- *
- *   2. Redistributions in binary form must reproduce the above copyright notice, this list
- *      of conditions and the following disclaimer in the documentation and/or other materials
- *      provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
-{
-	return "SPARC" SLJIT_CPUINFO;
-}
-
-/* Length of an instruction word
-   Both for sparc-32 and sparc-64 */
-typedef sljit_u32 sljit_ins;
-
-#if (defined SLJIT_CACHE_FLUSH_OWN_IMPL && SLJIT_CACHE_FLUSH_OWN_IMPL)
-
-static void sparc_cache_flush(sljit_ins *from, sljit_ins *to)
-{
-#if defined(__SUNPRO_C) && __SUNPRO_C < 0x590
-	__asm (
-		/* if (from == to) return */
-		"cmp %i0, %i1\n"
-		"be .leave\n"
-		"nop\n"
-
-		/* loop until from >= to */
-		".mainloop:\n"
-		"flush %i0\n"
-		"add %i0, 8, %i0\n"
-		"cmp %i0, %i1\n"
-		"bcs .mainloop\n"
-		"nop\n"
-
-		/* The comparison was done above. */
-		"bne .leave\n"
-		/* nop is not necessary here, since the
-		   sub operation has no side effect. */
-		"sub %i0, 4, %i0\n"
-		"flush %i0\n"
-		".leave:"
-	);
-#else
-	if (SLJIT_UNLIKELY(from == to))
-		return;
-
-	do {
-		__asm__ volatile (
-			"flush %0\n"
-			: : "r"(from)
-		);
-		/* Operates at least on doubleword. */
-		from += 2;
-	} while (from < to);
-
-	if (from == to) {
-		/* Flush the last word. */
-		from --;
-		__asm__ volatile (
-			"flush %0\n"
-			: : "r"(from)
-		);
-	}
-#endif
-}
-
-#endif /* (defined SLJIT_CACHE_FLUSH_OWN_IMPL && SLJIT_CACHE_FLUSH_OWN_IMPL) */
-
-/* TMP_REG2 is not used by getput_arg */
-#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
-#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
-#define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
-/* This register is modified by calls, which affects the instruction
-   in the delay slot if it is used as a source register. */
-#define TMP_LINK	(SLJIT_NUMBER_OF_REGISTERS + 5)
-
-#define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
-#define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
-
-static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 6] = {
-	0, 8, 9, 10, 11, 23, 22, 21, 20, 19, 18, 17, 16, 29, 28, 27, 26, 25, 24, 14, 1, 12, 13, 15
-};
-
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
-	0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-};
-
-/* --------------------------------------------------------------------- */
-/*  Instrucion forms                                                     */
-/* --------------------------------------------------------------------- */
-
-#define D(d)		((sljit_ins)reg_map[d] << 25)
-#define FD(d)		((sljit_ins)freg_map[d] << 25)
-#define FDN(d)		(((sljit_ins)freg_map[d] | 0x1) << 25)
-#define DA(d)		((sljit_ins)(d) << 25)
-#define S1(s1)		((sljit_ins)reg_map[s1] << 14)
-#define FS1(s1)		((sljit_ins)freg_map[s1] << 14)
-#define S1A(s1)		((sljit_ins)(s1) << 14)
-#define S2(s2)		((sljit_ins)reg_map[s2])
-#define FS2(s2)		((sljit_ins)freg_map[s2])
-#define FS2N(s2)	((sljit_ins)freg_map[s2] | 0x1)
-#define S2A(s2)		((sljit_ins)(s2))
-#define IMM_ARG		0x2000
-#define DOP(op)		((sljit_ins)(op) << 5)
-#define IMM(imm)	(((sljit_ins)(imm) & 0x1fff) | IMM_ARG)
-
-#define DR(dr)		(reg_map[dr])
-#define DRF(dr, flags)	((sljit_s32)(reg_map[dr] | ((flags) & SET_FLAGS)))
-#define OPC1(opcode)	((sljit_ins)(opcode) << 30)
-#define OPC2(opcode)	((sljit_ins)(opcode) << 22)
-#define OPC3(opcode)	((sljit_ins)(opcode) << 19)
-#define SET_FLAGS	OPC3(0x10)
-
-#define ADD		(OPC1(0x2) | OPC3(0x00))
-#define ADDC		(OPC1(0x2) | OPC3(0x08))
-#define AND		(OPC1(0x2) | OPC3(0x01))
-#define ANDN		(OPC1(0x2) | OPC3(0x05))
-#define CALL		(OPC1(0x1))
-#define FABSS		(OPC1(0x2) | OPC3(0x34) | DOP(0x09))
-#define FADDD		(OPC1(0x2) | OPC3(0x34) | DOP(0x42))
-#define FADDS		(OPC1(0x2) | OPC3(0x34) | DOP(0x41))
-#define FCMPD		(OPC1(0x2) | OPC3(0x35) | DOP(0x52))
-#define FCMPS		(OPC1(0x2) | OPC3(0x35) | DOP(0x51))
-#define FDIVD		(OPC1(0x2) | OPC3(0x34) | DOP(0x4e))
-#define FDIVS		(OPC1(0x2) | OPC3(0x34) | DOP(0x4d))
-#define FDTOI		(OPC1(0x2) | OPC3(0x34) | DOP(0xd2))
-#define FDTOS		(OPC1(0x2) | OPC3(0x34) | DOP(0xc6))
-#define FITOD		(OPC1(0x2) | OPC3(0x34) | DOP(0xc8))
-#define FITOS		(OPC1(0x2) | OPC3(0x34) | DOP(0xc4))
-#define FMOVS		(OPC1(0x2) | OPC3(0x34) | DOP(0x01))
-#define FMULD		(OPC1(0x2) | OPC3(0x34) | DOP(0x4a))
-#define FMULS		(OPC1(0x2) | OPC3(0x34) | DOP(0x49))
-#define FNEGS		(OPC1(0x2) | OPC3(0x34) | DOP(0x05))
-#define FSTOD		(OPC1(0x2) | OPC3(0x34) | DOP(0xc9))
-#define FSTOI		(OPC1(0x2) | OPC3(0x34) | DOP(0xd1))
-#define FSUBD		(OPC1(0x2) | OPC3(0x34) | DOP(0x46))
-#define FSUBS		(OPC1(0x2) | OPC3(0x34) | DOP(0x45))
-#define JMPL		(OPC1(0x2) | OPC3(0x38))
-#define LDD		(OPC1(0x3) | OPC3(0x03))
-#define LDDF		(OPC1(0x3) | OPC3(0x23))
-#define LDF		(OPC1(0x3) | OPC3(0x20))
-#define LDUW		(OPC1(0x3) | OPC3(0x00))
-#define NOP		(OPC1(0x0) | OPC2(0x04))
-#define OR		(OPC1(0x2) | OPC3(0x02))
-#define ORN		(OPC1(0x2) | OPC3(0x06))
-#define RDY		(OPC1(0x2) | OPC3(0x28) | S1A(0))
-#define RESTORE		(OPC1(0x2) | OPC3(0x3d))
-#define SAVE		(OPC1(0x2) | OPC3(0x3c))
-#define SETHI		(OPC1(0x0) | OPC2(0x04))
-#define SLL		(OPC1(0x2) | OPC3(0x25))
-#define SLLX		(OPC1(0x2) | OPC3(0x25) | (1 << 12))
-#define SRA		(OPC1(0x2) | OPC3(0x27))
-#define SRAX		(OPC1(0x2) | OPC3(0x27) | (1 << 12))
-#define SRL		(OPC1(0x2) | OPC3(0x26))
-#define SRLX		(OPC1(0x2) | OPC3(0x26) | (1 << 12))
-#define STD		(OPC1(0x3) | OPC3(0x07))
-#define STDF		(OPC1(0x3) | OPC3(0x27))
-#define STF		(OPC1(0x3) | OPC3(0x24))
-#define STW		(OPC1(0x3) | OPC3(0x04))
-#define SUB		(OPC1(0x2) | OPC3(0x04))
-#define SUBC		(OPC1(0x2) | OPC3(0x0c))
-#define TA		(OPC1(0x2) | OPC3(0x3a) | (8 << 25))
-#define WRY		(OPC1(0x2) | OPC3(0x30) | DA(0))
-#define XOR		(OPC1(0x2) | OPC3(0x03))
-#define XNOR		(OPC1(0x2) | OPC3(0x07))
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-#define MAX_DISP	(0x1fffff)
-#define MIN_DISP	(-0x200000)
-#define DISP_MASK	((sljit_ins)0x3fffff)
-
-#define BICC		(OPC1(0x0) | OPC2(0x2))
-#define FBFCC		(OPC1(0x0) | OPC2(0x6))
-#define SLL_W		SLL
-#define SDIV		(OPC1(0x2) | OPC3(0x0f))
-#define SMUL		(OPC1(0x2) | OPC3(0x0b))
-#define UDIV		(OPC1(0x2) | OPC3(0x0e))
-#define UMUL		(OPC1(0x2) | OPC3(0x0a))
-#else
-#define SLL_W		SLLX
-#endif
-
-#define SIMM_MAX	(0x0fff)
-#define SIMM_MIN	(-0x1000)
-
-/* dest_reg is the absolute name of the register
-   Useful for reordering instructions in the delay slot. */
-static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins, sljit_s32 delay_slot)
-{
-	sljit_ins *ptr;
-	SLJIT_ASSERT((delay_slot & DST_INS_MASK) == UNMOVABLE_INS
-		|| (delay_slot & DST_INS_MASK) == MOVABLE_INS
-		|| (delay_slot & DST_INS_MASK) == ((ins >> 25) & 0x1f));
-	ptr = (sljit_ins*)ensure_buf(compiler, sizeof(sljit_ins));
-	FAIL_IF(!ptr);
-	*ptr = ins;
-	compiler->size++;
-	compiler->delay_slot = delay_slot;
-	return SLJIT_SUCCESS;
-}
-
-static SLJIT_INLINE sljit_ins* detect_jump_type(struct sljit_jump *jump, sljit_ins *code_ptr, sljit_ins *code, sljit_sw executable_offset)
-{
-	sljit_sw diff;
-	sljit_uw target_addr;
-	sljit_ins *inst;
-	sljit_ins saved_inst;
-
-	if (jump->flags & SLJIT_REWRITABLE_JUMP)
-		return code_ptr;
-
-	if (jump->flags & JUMP_ADDR)
-		target_addr = jump->u.target;
-	else {
-		SLJIT_ASSERT(jump->flags & JUMP_LABEL);
-		target_addr = (sljit_uw)(code + jump->u.label->size) + (sljit_uw)executable_offset;
-	}
-	inst = (sljit_ins*)jump->addr;
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-	if (jump->flags & IS_CALL) {
-		/* Call is always patchable on sparc 32. */
-		jump->flags |= PATCH_CALL;
-		if (jump->flags & IS_MOVABLE) {
-			inst[0] = inst[-1];
-			inst[-1] = CALL;
-			jump->addr -= sizeof(sljit_ins);
-			return inst;
-		}
-		inst[0] = CALL;
-		inst[1] = NOP;
-		return inst + 1;
-	}
-#else
-	/* Both calls and BPr instructions shall not pass this point. */
-#error "Implementation required"
-#endif
-
-	if (jump->flags & IS_COND)
-		inst--;
-
-	diff = ((sljit_sw)target_addr - (sljit_sw)(inst - 1) - executable_offset) >> 2;
-
-	if (jump->flags & IS_MOVABLE) {
-		if (diff <= MAX_DISP && diff >= MIN_DISP) {
-			jump->flags |= PATCH_B;
-			inst--;
-			if (jump->flags & IS_COND) {
-				saved_inst = inst[0];
-				inst[0] = inst[1] ^ (1 << 28);
-				inst[1] = saved_inst;
-			} else {
-				inst[1] = inst[0];
-				inst[0] = BICC | DA(0x8);
-			}
-			jump->addr = (sljit_uw)inst;
-			return inst + 1;
-		}
-	}
-
-	diff += SSIZE_OF(ins);
-
-	if (diff <= MAX_DISP && diff >= MIN_DISP) {
-		jump->flags |= PATCH_B;
-		if (jump->flags & IS_COND)
-			inst[0] ^= (1 << 28);
-		else
-			inst[0] = BICC | DA(0x8);
-		inst[1] = NOP;
-		jump->addr = (sljit_uw)inst;
-		return inst + 1;
-	}
-
-	return code_ptr;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
-{
-	struct sljit_memory_fragment *buf;
-	sljit_ins *code;
-	sljit_ins *code_ptr;
-	sljit_ins *buf_ptr;
-	sljit_ins *buf_end;
-	sljit_uw word_count;
-	sljit_uw next_addr;
-	sljit_sw executable_offset;
-	sljit_sw addr;
-
-	struct sljit_label *label;
-	struct sljit_jump *jump;
-	struct sljit_const *const_;
-	struct sljit_put_label *put_label;
-
-	CHECK_ERROR_PTR();
-	CHECK_PTR(check_sljit_generate_code(compiler));
-	reverse_buf(compiler);
-
-	code = (sljit_ins*)SLJIT_MALLOC_EXEC(compiler->size * sizeof(sljit_ins), compiler->exec_allocator_data);
-	PTR_FAIL_WITH_EXEC_IF(code);
-	buf = compiler->buf;
-
-	code_ptr = code;
-	word_count = 0;
-	next_addr = 0;
-	executable_offset = SLJIT_EXEC_OFFSET(code);
-
-	label = compiler->labels;
-	jump = compiler->jumps;
-	const_ = compiler->consts;
-	put_label = compiler->put_labels;
-
-	do {
-		buf_ptr = (sljit_ins*)buf->memory;
-		buf_end = buf_ptr + (buf->used_size >> 2);
-		do {
-			*code_ptr = *buf_ptr++;
-			if (next_addr == word_count) {
-				SLJIT_ASSERT(!label || label->size >= word_count);
-				SLJIT_ASSERT(!jump || jump->addr >= word_count);
-				SLJIT_ASSERT(!const_ || const_->addr >= word_count);
-				SLJIT_ASSERT(!put_label || put_label->addr >= word_count);
-
-				/* These structures are ordered by their address. */
-				if (label && label->size == word_count) {
-					/* Just recording the address. */
-					label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
-					label->size = (sljit_uw)(code_ptr - code);
-					label = label->next;
-				}
-				if (jump && jump->addr == word_count) {
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-					jump->addr = (sljit_uw)(code_ptr - 3);
-#else
-					jump->addr = (sljit_uw)(code_ptr - 6);
-#endif
-					code_ptr = detect_jump_type(jump, code_ptr, code, executable_offset);
-					jump = jump->next;
-				}
-				if (const_ && const_->addr == word_count) {
-					/* Just recording the address. */
-					const_->addr = (sljit_uw)code_ptr;
-					const_ = const_->next;
-				}
-				if (put_label && put_label->addr == word_count) {
-					SLJIT_ASSERT(put_label->label);
-					put_label->addr = (sljit_uw)code_ptr;
-					put_label = put_label->next;
-				}
-				next_addr = compute_next_addr(label, jump, const_, put_label);
-			}
-			code_ptr ++;
-			word_count ++;
-		} while (buf_ptr < buf_end);
-
-		buf = buf->next;
-	} while (buf);
-
-	if (label && label->size == word_count) {
-		label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
-		label->size = (sljit_uw)(code_ptr - code);
-		label = label->next;
-	}
-
-	SLJIT_ASSERT(!label);
-	SLJIT_ASSERT(!jump);
-	SLJIT_ASSERT(!const_);
-	SLJIT_ASSERT(!put_label);
-	SLJIT_ASSERT(code_ptr - code <= (sljit_s32)compiler->size);
-
-	jump = compiler->jumps;
-	while (jump) {
-		do {
-			addr = (sljit_sw)((jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target);
-			buf_ptr = (sljit_ins *)jump->addr;
-
-			if (jump->flags & PATCH_CALL) {
-				addr = (addr - (sljit_sw)SLJIT_ADD_EXEC_OFFSET(buf_ptr, executable_offset)) >> 2;
-				SLJIT_ASSERT(addr <= 0x1fffffff && addr >= -0x20000000);
-				buf_ptr[0] = CALL | ((sljit_ins)addr & 0x3fffffff);
-				break;
-			}
-			if (jump->flags & PATCH_B) {
-				addr = (addr - (sljit_sw)SLJIT_ADD_EXEC_OFFSET(buf_ptr, executable_offset)) >> 2;
-				SLJIT_ASSERT(addr <= MAX_DISP && addr >= MIN_DISP);
-				buf_ptr[0] = (buf_ptr[0] & ~DISP_MASK) | ((sljit_ins)addr & DISP_MASK);
-				break;
-			}
-
-			/* Set the fields of immediate loads. */
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-			SLJIT_ASSERT(((buf_ptr[0] & 0xc1cfffff) == 0x01000000) && ((buf_ptr[1] & 0xc1f83fff) == 0x80102000));
-			buf_ptr[0] |= (sljit_ins)(addr >> 10) & 0x3fffff;
-			buf_ptr[1] |= (sljit_ins)addr & 0x3ff;
-#else
-#error "Implementation required"
-#endif
-		} while (0);
-		jump = jump->next;
-	}
-
-	put_label = compiler->put_labels;
-	while (put_label) {
-		addr = (sljit_sw)put_label->label->addr;
-		buf_ptr = (sljit_ins *)put_label->addr;
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-		SLJIT_ASSERT(((buf_ptr[0] & 0xc1cfffff) == 0x01000000) && ((buf_ptr[1] & 0xc1f83fff) == 0x80102000));
-		buf_ptr[0] |= (addr >> 10) & 0x3fffff;
-		buf_ptr[1] |= addr & 0x3ff;
-#else
-#error "Implementation required"
-#endif
-		put_label = put_label->next;
-	}
-
-	compiler->error = SLJIT_ERR_COMPILED;
-	compiler->executable_offset = executable_offset;
-	compiler->executable_size = (sljit_uw)(code_ptr - code) * sizeof(sljit_ins);
-
-	code = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
-	code_ptr = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
-
-	SLJIT_CACHE_FLUSH(code, code_ptr);
-	SLJIT_UPDATE_WX_FLAGS(code, code_ptr, 1);
-	return code;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
-{
-	switch (feature_type) {
-	case SLJIT_HAS_FPU:
-#ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
-#else
-		/* Available by default. */
-		return 1;
-#endif
-
-	case SLJIT_HAS_ZERO_REGISTER:
-		return 1;
-
-#if (defined SLJIT_CONFIG_SPARC_64 && SLJIT_CONFIG_SPARC_64)
-	case SLJIT_HAS_CMOV:
-		return 1;
-#endif
-
-	default:
-		return 0;
-	}
-}
-
-/* --------------------------------------------------------------------- */
-/*  Entry, exit                                                          */
-/* --------------------------------------------------------------------- */
-
-/* Creates an index in data_transfer_insts array. */
-#define LOAD_DATA	0x01
-#define WORD_DATA	0x00
-#define BYTE_DATA	0x02
-#define HALF_DATA	0x04
-#define INT_DATA	0x06
-#define SIGNED_DATA	0x08
-/* Separates integer and floating point registers */
-#define GPR_REG		0x0f
-#define DOUBLE_DATA	0x10
-#define SINGLE_DATA	0x12
-
-#define MEM_MASK	0x1f
-
-#define ARG_TEST	0x00020
-#define ALT_KEEP_CACHE	0x00040
-#define CUMULATIVE_OP	0x00080
-#define IMM_OP		0x00100
-#define MOVE_OP		0x00200
-#define SRC2_IMM	0x00400
-
-#define REG_DEST	0x00800
-#define REG2_SOURCE	0x01000
-#define SLOW_SRC1	0x02000
-#define SLOW_SRC2	0x04000
-#define SLOW_DEST	0x08000
-
-/* SET_FLAGS (0x10 << 19) also belong here! */
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-#include "sljitNativeSPARC_32.c"
-#else
-#include "sljitNativeSPARC_64.c"
-#endif
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
-	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
-	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
-{
-	sljit_s32 reg_index, types, tmp;
-	sljit_u32 float_offset, args_offset;
-	sljit_s32 saved_arg_index, scratch_arg_index, float_arg_index;
-
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
-	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
-
-	local_size = (local_size + SLJIT_LOCALS_OFFSET + 7) & ~0x7;
-	compiler->local_size = local_size;
-
-	if (local_size <= -SIMM_MIN) {
-		FAIL_IF(push_inst(compiler, SAVE | D(SLJIT_SP) | S1(SLJIT_SP) | IMM(-local_size), UNMOVABLE_INS));
-	}
-	else {
-		FAIL_IF(load_immediate(compiler, TMP_REG1, -local_size));
-		FAIL_IF(push_inst(compiler, SAVE | D(SLJIT_SP) | S1(SLJIT_SP) | S2(TMP_REG1), UNMOVABLE_INS));
-	}
-
-	arg_types >>= SLJIT_ARG_SHIFT;
-
-	types = arg_types;
-	float_offset = 16 * sizeof(sljit_sw);
-	reg_index = 24;
-
-	while (types && reg_index < 24 + 6) {
-		switch (types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			if (reg_index & 0x1) {
-				FAIL_IF(push_inst(compiler, STW | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-				if (reg_index >= 24 + 6 - 1)
-					break;
-				FAIL_IF(push_inst(compiler, STW | DA(reg_index + 1) | S1(SLJIT_SP) | IMM(float_offset + sizeof(sljit_sw)), MOVABLE_INS));
-			} else
-				FAIL_IF(push_inst(compiler, STD | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-
-			float_offset += sizeof(sljit_f64);
-			reg_index++;
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			FAIL_IF(push_inst(compiler, STW | DA(reg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-			float_offset += sizeof(sljit_f64);
-			break;
-		}
-
-		reg_index++;
-		types >>= SLJIT_ARG_SHIFT;
-	}
-
-	args_offset = (16 + 1 + 6) * sizeof(sljit_sw);
-	float_offset = 16 * sizeof(sljit_sw);
-	reg_index = 24;
-	saved_arg_index = 24;
-	scratch_arg_index = 8 - 1;
-	float_arg_index = 1;
-
-	while (arg_types) {
-		switch (arg_types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			if (reg_index < 24 + 6 - 1) {
-				FAIL_IF(push_inst(compiler, LDDF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-			} else if (reg_index < 24 + 6) {
-				FAIL_IF(push_inst(compiler, LDF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-				FAIL_IF(push_inst(compiler, LDF | FD(float_arg_index) | (1 << 25) | S1A(30) | IMM(args_offset), MOVABLE_INS));
-			} else {
-				FAIL_IF(push_inst(compiler, LDF | FD(float_arg_index) | S1A(30) | IMM(args_offset), MOVABLE_INS));
-				FAIL_IF(push_inst(compiler, LDF | FD(float_arg_index) | (1 << 25) | S1A(30) | IMM(args_offset + sizeof(sljit_sw)), MOVABLE_INS));
-			}
-
-			float_arg_index++;
-			float_offset += sizeof(sljit_f64);
-			reg_index++;
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			if (reg_index < 24 + 6)
-				FAIL_IF(push_inst(compiler, LDF | FD(float_arg_index) | S1(SLJIT_SP) | IMM(float_offset), MOVABLE_INS));
-			else
-				FAIL_IF(push_inst(compiler, LDF | FD(float_arg_index) | S1A(30) | IMM(args_offset), MOVABLE_INS));
-			float_arg_index++;
-			float_offset += sizeof(sljit_f64);
-			break;
-		default:
-			scratch_arg_index++;
-
-			if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
-				tmp = saved_arg_index++;
-				if (tmp == reg_index)
-					break;
-			} else
-				tmp = scratch_arg_index;
-
-			if (reg_index < 24 + 6)
-				FAIL_IF(push_inst(compiler, OR | DA(tmp) | S1(0) | S2A(reg_index), tmp));
-			else
-				FAIL_IF(push_inst(compiler, LDUW | DA(tmp) | S1A(30) | IMM(args_offset), tmp));
-			break;
-		}
-
-		reg_index++;
-		arg_types >>= SLJIT_ARG_SHIFT;
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_set_context(struct sljit_compiler *compiler,
-	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
-	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
-	set_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
-
-	compiler->local_size = (local_size + SLJIT_LOCALS_OFFSET + 7) & ~0x7;
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_return_void(compiler));
-
-	FAIL_IF(push_inst(compiler, JMPL | D(0) | S1A(31) | IMM(8), UNMOVABLE_INS));
-	return push_inst(compiler, RESTORE | D(SLJIT_R0) | S1(SLJIT_R0) | S2(0), UNMOVABLE_INS);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_return(compiler, op, src, srcw));
-
-	if (TYPE_CAST_NEEDED(op) || !FAST_IS_REG(src)) {
-		FAIL_IF(emit_mov_before_return(compiler, op, src, srcw));
-		src = SLJIT_R0;
-	}
-
-	FAIL_IF(push_inst(compiler, JMPL | D(0) | S1A(31) | IMM(8), UNMOVABLE_INS));
-	return push_inst(compiler, RESTORE | D(SLJIT_R0) | S1(src) | S2(0), UNMOVABLE_INS);
-}
-
-/* --------------------------------------------------------------------- */
-/*  Operators                                                            */
-/* --------------------------------------------------------------------- */
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-#define ARCH_32_64(a, b)	a
-#else
-#define ARCH_32_64(a, b)	b
-#endif
-
-static const sljit_ins data_transfer_insts[16 + 4] = {
-/* u w s */ ARCH_32_64(OPC1(3) | OPC3(0x04) /* stw */, OPC1(3) | OPC3(0x0e) /* stx */),
-/* u w l */ ARCH_32_64(OPC1(3) | OPC3(0x00) /* lduw */, OPC1(3) | OPC3(0x0b) /* ldx */),
-/* u b s */ OPC1(3) | OPC3(0x05) /* stb */,
-/* u b l */ OPC1(3) | OPC3(0x01) /* ldub */,
-/* u h s */ OPC1(3) | OPC3(0x06) /* sth */,
-/* u h l */ OPC1(3) | OPC3(0x02) /* lduh */,
-/* u i s */ OPC1(3) | OPC3(0x04) /* stw */,
-/* u i l */ OPC1(3) | OPC3(0x00) /* lduw */,
-
-/* s w s */ ARCH_32_64(OPC1(3) | OPC3(0x04) /* stw */, OPC1(3) | OPC3(0x0e) /* stx */),
-/* s w l */ ARCH_32_64(OPC1(3) | OPC3(0x00) /* lduw */, OPC1(3) | OPC3(0x0b) /* ldx */),
-/* s b s */ OPC1(3) | OPC3(0x05) /* stb */,
-/* s b l */ OPC1(3) | OPC3(0x09) /* ldsb */,
-/* s h s */ OPC1(3) | OPC3(0x06) /* sth */,
-/* s h l */ OPC1(3) | OPC3(0x0a) /* ldsh */,
-/* s i s */ OPC1(3) | OPC3(0x04) /* stw */,
-/* s i l */ ARCH_32_64(OPC1(3) | OPC3(0x00) /* lduw */, OPC1(3) | OPC3(0x08) /* ldsw */),
-
-/* d   s */ OPC1(3) | OPC3(0x27),
-/* d   l */ OPC1(3) | OPC3(0x23),
-/* s   s */ OPC1(3) | OPC3(0x24),
-/* s   l */ OPC1(3) | OPC3(0x20),
-};
-
-#undef ARCH_32_64
-
-/* Can perform an operation using at most 1 instruction. */
-static sljit_s32 getput_arg_fast(struct sljit_compiler *compiler, sljit_u32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
-{
-	SLJIT_ASSERT(arg & SLJIT_MEM);
-
-	if ((!(arg & OFFS_REG_MASK) && argw <= SIMM_MAX && argw >= SIMM_MIN)
-			|| ((arg & OFFS_REG_MASK) && (argw & 0x3) == 0)) {
-		/* Works for both absoulte and relative addresses (immediate case). */
-		if (SLJIT_UNLIKELY(flags & ARG_TEST))
-			return 1;
-		FAIL_IF(push_inst(compiler, data_transfer_insts[flags & MEM_MASK]
-			| ((flags & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg))
-			| S1(arg & REG_MASK) | ((arg & OFFS_REG_MASK) ? S2(OFFS_REG(arg)) : IMM(argw)),
-			((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS));
-		return -1;
-	}
-	return 0;
-}
-
-/* See getput_arg below.
-   Note: can_cache is called only for binary operators. Those
-   operators always uses word arguments without write back. */
-static sljit_s32 can_cache(sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
-{
-	SLJIT_ASSERT((arg & SLJIT_MEM) && (next_arg & SLJIT_MEM));
-
-	/* Simple operation except for updates. */
-	if (arg & OFFS_REG_MASK) {
-		argw &= 0x3;
-		SLJIT_ASSERT(argw);
-		next_argw &= 0x3;
-		if ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && argw == next_argw)
-			return 1;
-		return 0;
-	}
-
-	if (((next_argw - argw) <= SIMM_MAX && (next_argw - argw) >= SIMM_MIN))
-		return 1;
-	return 0;
-}
-
-/* Emit the necessary instructions. See can_cache above. */
-static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_u32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw, sljit_s32 next_arg, sljit_sw next_argw)
-{
-	sljit_s32 base, arg2, delay_slot;
-	sljit_ins dest;
-
-	SLJIT_ASSERT(arg & SLJIT_MEM);
-	if (!(next_arg & SLJIT_MEM)) {
-		next_arg = 0;
-		next_argw = 0;
-	}
-
-	base = arg & REG_MASK;
-	if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
-		argw &= 0x3;
-
-		/* Using the cache. */
-		if (((SLJIT_MEM | (arg & OFFS_REG_MASK)) == compiler->cache_arg) && (argw == compiler->cache_argw))
-			arg2 = TMP_REG3;
-		else {
-			if ((arg & OFFS_REG_MASK) == (next_arg & OFFS_REG_MASK) && argw == (next_argw & 0x3)) {
-				compiler->cache_arg = SLJIT_MEM | (arg & OFFS_REG_MASK);
-				compiler->cache_argw = argw;
-				arg2 = TMP_REG3;
-			}
-			else if ((flags & LOAD_DATA) && ((flags & MEM_MASK) <= GPR_REG) && reg != base && reg != OFFS_REG(arg))
-				arg2 = reg;
-			else /* It must be a mov operation, so tmp1 must be free to use. */
-				arg2 = TMP_REG1;
-			FAIL_IF(push_inst(compiler, SLL_W | D(arg2) | S1(OFFS_REG(arg)) | IMM_ARG | (sljit_ins)argw, DR(arg2)));
-		}
-	}
-	else {
-		/* Using the cache. */
-		if ((compiler->cache_arg == SLJIT_MEM) && (argw - compiler->cache_argw) <= SIMM_MAX && (argw - compiler->cache_argw) >= SIMM_MIN) {
-			if (argw != compiler->cache_argw) {
-				FAIL_IF(push_inst(compiler, ADD | D(TMP_REG3) | S1(TMP_REG3) | IMM(argw - compiler->cache_argw), DR(TMP_REG3)));
-				compiler->cache_argw = argw;
-			}
-			arg2 = TMP_REG3;
-		} else {
-			if ((next_argw - argw) <= SIMM_MAX && (next_argw - argw) >= SIMM_MIN) {
-				compiler->cache_arg = SLJIT_MEM;
-				compiler->cache_argw = argw;
-				arg2 = TMP_REG3;
-			}
-			else if ((flags & LOAD_DATA) && ((flags & MEM_MASK) <= GPR_REG) && reg != base)
-				arg2 = reg;
-			else /* It must be a mov operation, so tmp1 must be free to use. */
-				arg2 = TMP_REG1;
-			FAIL_IF(load_immediate(compiler, arg2, argw));
-		}
-	}
-
-	dest = ((flags & MEM_MASK) <= GPR_REG ? D(reg) : FD(reg));
-	delay_slot = ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) ? DR(reg) : MOVABLE_INS;
-	if (!base)
-		return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(arg2) | IMM(0), delay_slot);
-	return push_inst(compiler, data_transfer_insts[flags & MEM_MASK] | dest | S1(base) | S2(arg2), delay_slot);
-}
-
-static SLJIT_INLINE sljit_s32 emit_op_mem(struct sljit_compiler *compiler, sljit_u32 flags, sljit_s32 reg, sljit_s32 arg, sljit_sw argw)
-{
-	if (getput_arg_fast(compiler, flags, reg, arg, argw))
-		return compiler->error;
-	compiler->cache_arg = 0;
-	compiler->cache_argw = 0;
-	return getput_arg(compiler, flags, reg, arg, argw, 0, 0);
-}
-
-static SLJIT_INLINE sljit_s32 emit_op_mem2(struct sljit_compiler *compiler, sljit_u32 flags, sljit_s32 reg, sljit_s32 arg1, sljit_sw arg1w, sljit_s32 arg2, sljit_sw arg2w)
-{
-	if (getput_arg_fast(compiler, flags, reg, arg1, arg1w))
-		return compiler->error;
-	return getput_arg(compiler, flags, reg, arg1, arg1w, arg2, arg2w);
-}
-
-static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_u32 flags,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w)
-{
-	/* arg1 goes to TMP_REG1 or src reg
-	   arg2 goes to TMP_REG2, imm or src reg
-	   TMP_REG3 can be used for caching
-	   result goes to TMP_REG2, so put result can use TMP_REG1 and TMP_REG3. */
-	sljit_s32 dst_r = TMP_REG2;
-	sljit_s32 src1_r;
-	sljit_sw src2_r = 0;
-	sljit_s32 sugg_src2_r = TMP_REG2;
-
-	if (!(flags & ALT_KEEP_CACHE)) {
-		compiler->cache_arg = 0;
-		compiler->cache_argw = 0;
-	}
-
-	if (dst != TMP_REG2) {
-		if (FAST_IS_REG(dst)) {
-			dst_r = dst;
-			flags |= REG_DEST;
-			if (flags & MOVE_OP)
-				sugg_src2_r = dst_r;
-		}
-		else if ((dst & SLJIT_MEM) && !getput_arg_fast(compiler, flags | ARG_TEST, TMP_REG1, dst, dstw))
-			flags |= SLOW_DEST;
-	}
-
-	if (flags & IMM_OP) {
-		if ((src2 & SLJIT_IMM) && src2w) {
-			if (src2w <= SIMM_MAX && src2w >= SIMM_MIN) {
-				flags |= SRC2_IMM;
-				src2_r = src2w;
-			}
-		}
-		if (!(flags & SRC2_IMM) && (flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w) {
-			if (src1w <= SIMM_MAX && src1w >= SIMM_MIN) {
-				flags |= SRC2_IMM;
-				src2_r = src1w;
-
-				/* And swap arguments. */
-				src1 = src2;
-				src1w = src2w;
-				src2 = SLJIT_IMM;
-				/* src2w = src2_r unneeded. */
-			}
-		}
-	}
-
-	/* Source 1. */
-	if (FAST_IS_REG(src1))
-		src1_r = src1;
-	else if (src1 & SLJIT_IMM) {
-		if (src1w) {
-			FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
-			src1_r = TMP_REG1;
-		}
-		else
-			src1_r = 0;
-	}
-	else {
-		if (getput_arg_fast(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w))
-			FAIL_IF(compiler->error);
-		else
-			flags |= SLOW_SRC1;
-		src1_r = TMP_REG1;
-	}
-
-	/* Source 2. */
-	if (FAST_IS_REG(src2)) {
-		src2_r = src2;
-		flags |= REG2_SOURCE;
-		if ((flags & (REG_DEST | MOVE_OP)) == MOVE_OP)
-			dst_r = src2_r;
-	}
-	else if (src2 & SLJIT_IMM) {
-		if (!(flags & SRC2_IMM)) {
-			if (src2w) {
-				FAIL_IF(load_immediate(compiler, sugg_src2_r, src2w));
-				src2_r = sugg_src2_r;
-			}
-			else {
-				src2_r = 0;
-				if (flags & MOVE_OP) {
-					if (dst & SLJIT_MEM)
-						dst_r = 0;
-					else
-						op = SLJIT_MOV;
-				}
-			}
-		}
-	}
-	else {
-		if (getput_arg_fast(compiler, flags | LOAD_DATA, sugg_src2_r, src2, src2w))
-			FAIL_IF(compiler->error);
-		else
-			flags |= SLOW_SRC2;
-		src2_r = sugg_src2_r;
-	}
-
-	if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
-		SLJIT_ASSERT(src2_r == TMP_REG2);
-		if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
-			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG2, src2, src2w, src1, src1w));
-			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w, dst, dstw));
-		}
-		else {
-			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w, src2, src2w));
-			FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG2, src2, src2w, dst, dstw));
-		}
-	}
-	else if (flags & SLOW_SRC1)
-		FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, TMP_REG1, src1, src1w, dst, dstw));
-	else if (flags & SLOW_SRC2)
-		FAIL_IF(getput_arg(compiler, flags | LOAD_DATA, sugg_src2_r, src2, src2w, dst, dstw));
-
-	FAIL_IF(emit_single_op(compiler, op, flags, dst_r, src1_r, src2_r));
-
-	if (dst & SLJIT_MEM) {
-		if (!(flags & SLOW_DEST)) {
-			getput_arg_fast(compiler, flags, dst_r, dst, dstw);
-			return compiler->error;
-		}
-		return getput_arg(compiler, flags, dst_r, dst, dstw, 0, 0);
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op0(compiler, op));
-
-	op = GET_OPCODE(op);
-	switch (op) {
-	case SLJIT_BREAKPOINT:
-		return push_inst(compiler, TA, UNMOVABLE_INS);
-	case SLJIT_NOP:
-		return push_inst(compiler, NOP, UNMOVABLE_INS);
-	case SLJIT_LMUL_UW:
-	case SLJIT_LMUL_SW:
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-		FAIL_IF(push_inst(compiler, (op == SLJIT_LMUL_UW ? UMUL : SMUL) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0)));
-		return push_inst(compiler, RDY | D(SLJIT_R1), DR(SLJIT_R1));
-#else
-#error "Implementation required"
-#endif
-	case SLJIT_DIVMOD_UW:
-	case SLJIT_DIVMOD_SW:
-	case SLJIT_DIV_UW:
-	case SLJIT_DIV_SW:
-		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-		if ((op | 0x2) == SLJIT_DIV_UW)
-			FAIL_IF(push_inst(compiler, WRY | S1(0), MOVABLE_INS));
-		else {
-			FAIL_IF(push_inst(compiler, SRA | D(TMP_REG1) | S1(SLJIT_R0) | IMM(31), DR(TMP_REG1)));
-			FAIL_IF(push_inst(compiler, WRY | S1(TMP_REG1), MOVABLE_INS));
-		}
-		if (op <= SLJIT_DIVMOD_SW)
-			FAIL_IF(push_inst(compiler, OR | D(TMP_REG2) | S1(0) | S2(SLJIT_R0), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, ((op | 0x2) == SLJIT_DIV_UW ? UDIV : SDIV) | D(SLJIT_R0) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R0)));
-		if (op >= SLJIT_DIV_UW)
-			return SLJIT_SUCCESS;
-		FAIL_IF(push_inst(compiler, SMUL | D(SLJIT_R1) | S1(SLJIT_R0) | S2(SLJIT_R1), DR(SLJIT_R1)));
-		return push_inst(compiler, SUB | D(SLJIT_R1) | S1(TMP_REG2) | S2(SLJIT_R1), DR(SLJIT_R1));
-#else
-#error "Implementation required"
-#endif
-	case SLJIT_ENDBR:
-	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
-		return SLJIT_SUCCESS;
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src, sljit_sw srcw)
-{
-	sljit_u32 flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
-
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-	ADJUST_LOCAL_OFFSET(src, srcw);
-
-	op = GET_OPCODE(op);
-	switch (op) {
-	case SLJIT_MOV:
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-	case SLJIT_MOV_U32:
-	case SLJIT_MOV_S32:
-	case SLJIT_MOV32:
-#endif
-	case SLJIT_MOV_P:
-		return emit_op(compiler, SLJIT_MOV, flags | WORD_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, srcw);
-
-	case SLJIT_MOV_U8:
-		return emit_op(compiler, SLJIT_MOV_U8, flags | BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
-
-	case SLJIT_MOV_S8:
-		return emit_op(compiler, SLJIT_MOV_S8, flags | BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
-
-	case SLJIT_MOV_U16:
-		return emit_op(compiler, SLJIT_MOV_U16, flags | HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
-
-	case SLJIT_MOV_S16:
-		return emit_op(compiler, SLJIT_MOV_S16, flags | HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
-
-	case SLJIT_NOT:
-	case SLJIT_CLZ:
-		return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, src, srcw);
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w)
-{
-	sljit_u32 flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
-
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-	ADJUST_LOCAL_OFFSET(src1, src1w);
-	ADJUST_LOCAL_OFFSET(src2, src2w);
-
-	op = GET_OPCODE(op);
-	switch (op) {
-	case SLJIT_ADD:
-	case SLJIT_ADDC:
-	case SLJIT_MUL:
-	case SLJIT_AND:
-	case SLJIT_OR:
-	case SLJIT_XOR:
-		return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
-
-	case SLJIT_SUB:
-	case SLJIT_SUBC:
-		return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
-
-	case SLJIT_SHL:
-	case SLJIT_LSHR:
-	case SLJIT_ASHR:
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-		if (src2 & SLJIT_IMM)
-			src2w &= 0x1f;
-#else
-		SLJIT_UNREACHABLE();
-#endif
-		return emit_op(compiler, op, flags | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
-
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-	return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
-
-	switch (op) {
-	case SLJIT_FAST_RETURN:
-		if (FAST_IS_REG(src))
-			FAIL_IF(push_inst(compiler, OR | D(TMP_LINK) | S1(0) | S2(src), DR(TMP_LINK)));
-		else
-			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_LINK, src, srcw));
-
-		FAIL_IF(push_inst(compiler, JMPL | D(0) | S1(TMP_LINK) | IMM(8), UNMOVABLE_INS));
-		return push_inst(compiler, NOP, UNMOVABLE_INS);
-	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
-	case SLJIT_PREFETCH_L1:
-	case SLJIT_PREFETCH_L2:
-	case SLJIT_PREFETCH_L3:
-	case SLJIT_PREFETCH_ONCE:
-		return SLJIT_SUCCESS;
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
-{
-	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
-	return reg_map[reg];
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
-{
-	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
-	return freg_map[reg];
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
-	void *instruction, sljit_u32 size)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
-
-	return push_inst(compiler, *(sljit_ins*)instruction, UNMOVABLE_INS);
-}
-
-/* --------------------------------------------------------------------- */
-/*  Floating point operators                                             */
-/* --------------------------------------------------------------------- */
-
-#define FLOAT_DATA(op) ((sljit_ins)DOUBLE_DATA | (((sljit_ins)(op) & SLJIT_32) >> 7))
-#define SELECT_FOP(op, single, double) ((op & SLJIT_32) ? single : double)
-#define FLOAT_TMP_MEM_OFFSET (22 * sizeof(sljit_sw))
-
-static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src, sljit_sw srcw)
-{
-	if (src & SLJIT_MEM) {
-		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
-		src = TMP_FREG1;
-	}
-
-	FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOI, FDTOI) | FD(TMP_FREG1) | FS2(src), MOVABLE_INS));
-
-	if (FAST_IS_REG(dst)) {
-		FAIL_IF(emit_op_mem2(compiler, SINGLE_DATA, TMP_FREG1, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
-		return emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, dst, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET);
-	}
-
-	/* Store the integer value from a VFP register. */
-	return emit_op_mem2(compiler, SINGLE_DATA, TMP_FREG1, dst, dstw, 0, 0);
-}
-
-static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src, sljit_sw srcw)
-{
-	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
-
-	if (src & SLJIT_IMM) {
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
-			srcw = (sljit_s32)srcw;
-#endif
-		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
-		src = TMP_REG1;
-		srcw = 0;
-	}
-
-	if (FAST_IS_REG(src)) {
-		FAIL_IF(emit_op_mem2(compiler, WORD_DATA, src, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET, SLJIT_MEM1(SLJIT_SP), FLOAT_TMP_MEM_OFFSET));
-		src = SLJIT_MEM1(SLJIT_SP);
-		srcw = FLOAT_TMP_MEM_OFFSET;
-	}
-
-	FAIL_IF(emit_op_mem2(compiler, SINGLE_DATA | LOAD_DATA, TMP_FREG1, src, srcw, dst, dstw));
-	FAIL_IF(push_inst(compiler, SELECT_FOP(op, FITOS, FITOD) | FD(dst_r) | FS2(TMP_FREG1), MOVABLE_INS));
-
-	if (dst & SLJIT_MEM)
-		return emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG1, dst, dstw, 0, 0);
-	return SLJIT_SUCCESS;
-}
-
-static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w)
-{
-	if (src1 & SLJIT_MEM) {
-		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
-		src1 = TMP_FREG1;
-	}
-
-	if (src2 & SLJIT_MEM) {
-		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, 0, 0));
-		src2 = TMP_FREG2;
-	}
-
-	return push_inst(compiler, SELECT_FOP(op, FCMPS, FCMPD) | FS1(src1) | FS2(src2), FCC_IS_SET | MOVABLE_INS);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src, sljit_sw srcw)
-{
-	sljit_s32 dst_r;
-
-	CHECK_ERROR();
-	compiler->cache_arg = 0;
-	compiler->cache_argw = 0;
-
-	SLJIT_COMPILE_ASSERT((SLJIT_32 == 0x100) && !(DOUBLE_DATA & 0x2), float_transfer_bit_error);
-	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
-
-	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32)
-		op ^= SLJIT_32;
-
-	dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
-
-	if (src & SLJIT_MEM) {
-		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, dst_r, src, srcw, dst, dstw));
-		src = dst_r;
-	}
-
-	switch (GET_OPCODE(op)) {
-	case SLJIT_MOV_F64:
-		if (src != dst_r) {
-			if (dst_r != TMP_FREG1) {
-				FAIL_IF(push_inst(compiler, FMOVS | FD(dst_r) | FS2(src), MOVABLE_INS));
-				if (!(op & SLJIT_32))
-					FAIL_IF(push_inst(compiler, FMOVS | FDN(dst_r) | FS2N(src), MOVABLE_INS));
-			}
-			else
-				dst_r = src;
-		}
-		break;
-	case SLJIT_NEG_F64:
-		FAIL_IF(push_inst(compiler, FNEGS | FD(dst_r) | FS2(src), MOVABLE_INS));
-		if (dst_r != src && !(op & SLJIT_32))
-			FAIL_IF(push_inst(compiler, FMOVS | FDN(dst_r) | FS2N(src), MOVABLE_INS));
-		break;
-	case SLJIT_ABS_F64:
-		FAIL_IF(push_inst(compiler, FABSS | FD(dst_r) | FS2(src), MOVABLE_INS));
-		if (dst_r != src && !(op & SLJIT_32))
-			FAIL_IF(push_inst(compiler, FMOVS | FDN(dst_r) | FS2N(src), MOVABLE_INS));
-		break;
-	case SLJIT_CONV_F64_FROM_F32:
-		FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSTOD, FDTOS) | FD(dst_r) | FS2(src), MOVABLE_INS));
-		op ^= SLJIT_32;
-		break;
-	}
-
-	if (dst & SLJIT_MEM)
-		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), dst_r, dst, dstw, 0, 0));
-	return SLJIT_SUCCESS;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w)
-{
-	sljit_s32 dst_r, flags = 0;
-
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-	ADJUST_LOCAL_OFFSET(src1, src1w);
-	ADJUST_LOCAL_OFFSET(src2, src2w);
-
-	compiler->cache_arg = 0;
-	compiler->cache_argw = 0;
-
-	dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG2;
-
-	if (src1 & SLJIT_MEM) {
-		if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w)) {
-			FAIL_IF(compiler->error);
-			src1 = TMP_FREG1;
-		} else
-			flags |= SLOW_SRC1;
-	}
-
-	if (src2 & SLJIT_MEM) {
-		if (getput_arg_fast(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w)) {
-			FAIL_IF(compiler->error);
-			src2 = TMP_FREG2;
-		} else
-			flags |= SLOW_SRC2;
-	}
-
-	if ((flags & (SLOW_SRC1 | SLOW_SRC2)) == (SLOW_SRC1 | SLOW_SRC2)) {
-		if (!can_cache(src1, src1w, src2, src2w) && can_cache(src1, src1w, dst, dstw)) {
-			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, src1, src1w));
-			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
-		}
-		else {
-			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, src2, src2w));
-			FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
-		}
-	}
-	else if (flags & SLOW_SRC1)
-		FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src1, src1w, dst, dstw));
-	else if (flags & SLOW_SRC2)
-		FAIL_IF(getput_arg(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG2, src2, src2w, dst, dstw));
-
-	if (flags & SLOW_SRC1)
-		src1 = TMP_FREG1;
-	if (flags & SLOW_SRC2)
-		src2 = TMP_FREG2;
-
-	switch (GET_OPCODE(op)) {
-	case SLJIT_ADD_F64:
-		FAIL_IF(push_inst(compiler, SELECT_FOP(op, FADDS, FADDD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
-		break;
-
-	case SLJIT_SUB_F64:
-		FAIL_IF(push_inst(compiler, SELECT_FOP(op, FSUBS, FSUBD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
-		break;
-
-	case SLJIT_MUL_F64:
-		FAIL_IF(push_inst(compiler, SELECT_FOP(op, FMULS, FMULD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
-		break;
-
-	case SLJIT_DIV_F64:
-		FAIL_IF(push_inst(compiler, SELECT_FOP(op, FDIVS, FDIVD) | FD(dst_r) | FS1(src1) | FS2(src2), MOVABLE_INS));
-		break;
-	}
-
-	if (dst_r == TMP_FREG2)
-		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op), TMP_FREG2, dst, dstw, 0, 0));
-
-	return SLJIT_SUCCESS;
-}
-
-#undef FLOAT_DATA
-#undef SELECT_FOP
-
-/* --------------------------------------------------------------------- */
-/*  Other instructions                                                   */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_fast_enter(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-	if (FAST_IS_REG(dst))
-		return push_inst(compiler, OR | D(dst) | S1(0) | S2(TMP_LINK), UNMOVABLE_INS);
-
-	/* Memory. */
-	FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_LINK, dst, dstw));
-	compiler->delay_slot = UNMOVABLE_INS;
-	return SLJIT_SUCCESS;
-}
-
-/* --------------------------------------------------------------------- */
-/*  Conditional instructions                                             */
-/* --------------------------------------------------------------------- */
-
-SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
-{
-	struct sljit_label *label;
-
-	CHECK_ERROR_PTR();
-	CHECK_PTR(check_sljit_emit_label(compiler));
-
-	if (compiler->last_label && compiler->last_label->size == compiler->size)
-		return compiler->last_label;
-
-	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
-	PTR_FAIL_IF(!label);
-	set_label(label, compiler);
-	compiler->delay_slot = UNMOVABLE_INS;
-	return label;
-}
-
-static sljit_ins get_cc(struct sljit_compiler *compiler, sljit_s32 type)
-{
-	switch (type) {
-	case SLJIT_EQUAL:
-	case SLJIT_NOT_EQUAL_F64: /* Unordered. */
-		return DA(0x1);
-
-	case SLJIT_NOT_EQUAL:
-	case SLJIT_EQUAL_F64:
-		return DA(0x9);
-
-	case SLJIT_LESS:
-	case SLJIT_GREATER_F64: /* Unordered. */
-	case SLJIT_CARRY:
-		return DA(0x5);
-
-	case SLJIT_GREATER_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
-	case SLJIT_NOT_CARRY:
-		return DA(0xd);
-
-	case SLJIT_GREATER:
-	case SLJIT_GREATER_EQUAL_F64: /* Unordered. */
-		return DA(0xc);
-
-	case SLJIT_LESS_EQUAL:
-	case SLJIT_LESS_F64:
-		return DA(0x4);
-
-	case SLJIT_SIG_LESS:
-		return DA(0x3);
-
-	case SLJIT_SIG_GREATER_EQUAL:
-		return DA(0xb);
-
-	case SLJIT_SIG_GREATER:
-		return DA(0xa);
-
-	case SLJIT_SIG_LESS_EQUAL:
-		return DA(0x2);
-
-	case SLJIT_OVERFLOW:
-		if (!(compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)))
-			return DA(0x9);
-		/* fallthrough */
-
-	case SLJIT_UNORDERED_F64:
-		return DA(0x7);
-
-	case SLJIT_NOT_OVERFLOW:
-		if (!(compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)))
-			return DA(0x1);
-		/* fallthrough */
-
-	case SLJIT_ORDERED_F64:
-		return DA(0xf);
-
-	default:
-		SLJIT_UNREACHABLE();
-		return DA(0x8);
-	}
-}
-
-SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
-{
-	struct sljit_jump *jump;
-
-	CHECK_ERROR_PTR();
-	CHECK_PTR(check_sljit_emit_jump(compiler, type));
-
-	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
-	PTR_FAIL_IF(!jump);
-	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
-	type &= 0xff;
-
-	if (type < SLJIT_EQUAL_F64) {
-		jump->flags |= IS_COND;
-		if (((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS) && !(compiler->delay_slot & ICC_IS_SET))
-			jump->flags |= IS_MOVABLE;
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-		PTR_FAIL_IF(push_inst(compiler, BICC | get_cc(compiler, type ^ 1) | 5, UNMOVABLE_INS));
-#else
-#error "Implementation required"
-#endif
-	}
-	else if (type < SLJIT_JUMP) {
-		jump->flags |= IS_COND;
-		if (((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS) && !(compiler->delay_slot & FCC_IS_SET))
-			jump->flags |= IS_MOVABLE;
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-		PTR_FAIL_IF(push_inst(compiler, FBFCC | get_cc(compiler, type ^ 1) | 5, UNMOVABLE_INS));
-#else
-#error "Implementation required"
-#endif
-	}
-	else {
-		if ((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS)
-			jump->flags |= IS_MOVABLE;
-		if (type >= SLJIT_FAST_CALL)
-			jump->flags |= IS_CALL;
-	}
-
-	PTR_FAIL_IF(emit_const(compiler, TMP_REG1, 0));
-	PTR_FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(TMP_REG1) | IMM(0), UNMOVABLE_INS));
-	jump->addr = compiler->size;
-	PTR_FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-
-	return jump;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
-	sljit_s32 arg_types)
-{
-	CHECK_ERROR_PTR();
-	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
-
-	PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
-
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
-	return sljit_emit_jump(compiler, type);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
-{
-	struct sljit_jump *jump = NULL;
-	sljit_s32 src_r;
-
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
-	ADJUST_LOCAL_OFFSET(src, srcw);
-
-	if (FAST_IS_REG(src))
-		src_r = src;
-	else if (src & SLJIT_IMM) {
-		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
-		FAIL_IF(!jump);
-		set_jump(jump, compiler, JUMP_ADDR);
-		jump->u.target = (sljit_uw)srcw;
-
-		if ((compiler->delay_slot & DST_INS_MASK) != UNMOVABLE_INS)
-			jump->flags |= IS_MOVABLE;
-		if (type >= SLJIT_FAST_CALL)
-			jump->flags |= IS_CALL;
-
-		FAIL_IF(emit_const(compiler, TMP_REG1, 0));
-		src_r = TMP_REG1;
-	}
-	else {
-		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
-		src_r = TMP_REG1;
-	}
-
-	FAIL_IF(push_inst(compiler, JMPL | D(type >= SLJIT_FAST_CALL ? TMP_LINK : 0) | S1(src_r) | IMM(0), UNMOVABLE_INS));
-	if (jump)
-		jump->addr = compiler->size;
-	return push_inst(compiler, NOP, UNMOVABLE_INS);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type,
-	sljit_s32 arg_types,
-	sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
-
-	if (src & SLJIT_MEM) {
-		ADJUST_LOCAL_OFFSET(src, srcw);
-		FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
-		src = TMP_REG1;
-	}
-
-	FAIL_IF(call_with_args(compiler, arg_types, &src));
-
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
-	return sljit_emit_ijump(compiler, type, src, srcw);
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 type)
-{
-	sljit_s32 reg;
-	sljit_u32 flags = HAS_FLAGS(op) ? SET_FLAGS : 0;
-
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-	op = GET_OPCODE(op);
-	reg = (op < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG2;
-
-	compiler->cache_arg = 0;
-	compiler->cache_argw = 0;
-
-	if (op >= SLJIT_ADD && (dst & SLJIT_MEM))
-		FAIL_IF(emit_op_mem2(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, dst, dstw, dst, dstw));
-
-	type &= 0xff;
-	if (type < SLJIT_EQUAL_F64)
-		FAIL_IF(push_inst(compiler, BICC | get_cc(compiler, type) | 3, UNMOVABLE_INS));
-	else
-		FAIL_IF(push_inst(compiler, FBFCC | get_cc(compiler, type) | 3, UNMOVABLE_INS));
-
-	FAIL_IF(push_inst(compiler, OR | D(reg) | S1(0) | IMM(1), UNMOVABLE_INS));
-	FAIL_IF(push_inst(compiler, OR | D(reg) | S1(0) | IMM(0), UNMOVABLE_INS));
-
-	if (op >= SLJIT_ADD) {
-		flags |= CUMULATIVE_OP | IMM_OP | ALT_KEEP_CACHE;
-		if (dst & SLJIT_MEM)
-			return emit_op(compiler, op, flags, dst, dstw, TMP_REG1, 0, TMP_REG2, 0);
-		return emit_op(compiler, op, flags, dst, 0, dst, 0, TMP_REG2, 0);
-	}
-
-	if (!(dst & SLJIT_MEM))
-		return SLJIT_SUCCESS;
-
-	return emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw);
-#else
-#error "Implementation required"
-#endif
-}
-
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
-	sljit_s32 dst_reg,
-	sljit_s32 src, sljit_sw srcw)
-{
-	CHECK_ERROR();
-	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
-
-#if (defined SLJIT_CONFIG_SPARC_32 && SLJIT_CONFIG_SPARC_32)
-	return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);;
-#else
-#error "Implementation required"
-#endif
-}
-
-SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
-{
-	struct sljit_const *const_;
-	sljit_s32 dst_r;
-
-	CHECK_ERROR_PTR();
-	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
-	PTR_FAIL_IF(!const_);
-	set_const(const_, compiler);
-
-	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
-	PTR_FAIL_IF(emit_const(compiler, dst_r, init_value));
-
-	if (dst & SLJIT_MEM)
-		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw));
-	return const_;
-}
-
-SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
-{
-	struct sljit_put_label *put_label;
-	sljit_s32 dst_r;
-
-	CHECK_ERROR_PTR();
-	CHECK_PTR(check_sljit_emit_put_label(compiler, dst, dstw));
-	ADJUST_LOCAL_OFFSET(dst, dstw);
-
-	put_label = (struct sljit_put_label*)ensure_abuf(compiler, sizeof(struct sljit_put_label));
-	PTR_FAIL_IF(!put_label);
-	set_put_label(put_label, compiler, 0);
-
-	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
-	PTR_FAIL_IF(emit_const(compiler, dst_r, 0));
-
-	if (dst & SLJIT_MEM)
-		PTR_FAIL_IF(emit_op_mem(compiler, WORD_DATA, TMP_REG2, dst, dstw));
-	return put_label;
-}
diff --git a/src/sljit/sljitNativeX86_32.c b/src/sljit/sljitNativeX86_32.c
index b9a7b39..08da030 100644
--- a/src/sljit/sljitNativeX86_32.c
+++ b/src/sljit/sljitNativeX86_32.c
@@ -80,21 +80,28 @@
 	if (b & SLJIT_MEM) {
 		if (!(b & REG_MASK))
 			inst_size += sizeof(sljit_sw);
-		else if (immb != 0 && !(b & OFFS_REG_MASK)) {
-			/* Immediate operand. */
-			if (immb <= 127 && immb >= -128)
-				inst_size += sizeof(sljit_s8);
-			else
-				inst_size += sizeof(sljit_sw);
+		else {
+			if (immb != 0 && !(b & OFFS_REG_MASK)) {
+				/* Immediate operand. */
+				if (immb <= 127 && immb >= -128)
+					inst_size += sizeof(sljit_s8);
+				else
+					inst_size += sizeof(sljit_sw);
+			}
+			else if (reg_map[b & REG_MASK] == 5) {
+				/* Swap registers if possible. */
+				if ((b & OFFS_REG_MASK) && (immb & 0x3) == 0 && reg_map[OFFS_REG(b)] != 5)
+					b = SLJIT_MEM | OFFS_REG(b) | TO_OFFS_REG(b & REG_MASK);
+				else
+					inst_size += sizeof(sljit_s8);
+			}
+
+			if (reg_map[b & REG_MASK] == 4 && !(b & OFFS_REG_MASK))
+				b |= TO_OFFS_REG(SLJIT_SP);
+
+			if (b & OFFS_REG_MASK)
+				inst_size += 1; /* SIB byte. */
 		}
-		else if (reg_map[b & REG_MASK] == 5)
-			inst_size += sizeof(sljit_s8);
-
-		if ((b & REG_MASK) == SLJIT_SP && !(b & OFFS_REG_MASK))
-			b |= TO_OFFS_REG(SLJIT_SP);
-
-		if (b & OFFS_REG_MASK)
-			inst_size += 1; /* SIB byte. */
 	}
 
 	/* Calculate size of a. */
@@ -107,9 +114,9 @@
 				inst_size += 4;
 		}
 		else if (flags & EX86_SHIFT_INS) {
-			imma &= 0x1f;
+			SLJIT_ASSERT(imma <= 0x1f);
 			if (imma != 1) {
-				inst_size ++;
+				inst_size++;
 				flags |= EX86_BYTE_ARG;
 			}
 		} else if (flags & EX86_BYTE_ARG)
@@ -165,7 +172,7 @@
 	} else if (b & REG_MASK) {
 		reg_map_b = reg_map[b & REG_MASK];
 
-		if (!(b & OFFS_REG_MASK) || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SP) || reg_map_b == 5) {
+		if (!(b & OFFS_REG_MASK) || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SP)) {
 			if (immb != 0 || reg_map_b == 5) {
 				if (immb <= 127 && immb >= -128)
 					*buf_ptr |= 0x40;
@@ -190,8 +197,14 @@
 			}
 		}
 		else {
+			if (reg_map_b == 5)
+				*buf_ptr |= 0x40;
+
 			*buf_ptr++ |= 0x04;
 			*buf_ptr++ = U8(reg_map_b | (reg_map[OFFS_REG(b)] << 3) | (immb << 6));
+
+			if (reg_map_b == 5)
+				*buf_ptr++ = 0;
 		}
 	}
 	else {
@@ -243,19 +256,16 @@
 	return code_ptr;
 }
 
-#define ENTER_R2_USED	0x00001
-#define ENTER_R2_TO_S	0x00002
-#define ENTER_R2_TO_R0	0x00004
-#define ENTER_R1_TO_S	0x00008
-#define ENTER_TMP_TO_R4	0x00010
-#define ENTER_TMP_TO_S	0x00020
+#define ENTER_TMP_TO_R4		0x00001
+#define ENTER_TMP_TO_S		0x00002
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compiler,
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
 	sljit_s32 word_arg_count, saved_arg_count, float_arg_count;
-	sljit_s32 size, locals_offset, args_size, types, status;
+	sljit_s32 size, args_size, types, status;
+	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 	sljit_u8 *inst;
 #ifdef _WIN32
 	sljit_s32 r2_offset = -1;
@@ -271,108 +281,97 @@
 	SLJIT_COMPILE_ASSERT(SLJIT_FR0 == 1, float_register_index_start);
 
 	arg_types >>= SLJIT_ARG_SHIFT;
-	types = arg_types;
 	word_arg_count = 0;
-	saved_arg_count = 0;
-	float_arg_count = 0;
-	args_size = SSIZE_OF(sw);
 	status = 0;
-	while (types) {
-		switch (types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			float_arg_count++;
-			FAIL_IF(emit_sse2_load(compiler, 0, float_arg_count, SLJIT_MEM1(SLJIT_SP), args_size));
-			args_size += SSIZE_OF(f64);
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			float_arg_count++;
-			FAIL_IF(emit_sse2_load(compiler, 1, float_arg_count, SLJIT_MEM1(SLJIT_SP), args_size));
-			args_size += SSIZE_OF(f32);
-			break;
-		default:
-			word_arg_count++;
 
-			if (!(types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
-				saved_arg_count++;
-				if (saved_arg_count == 4)
-					status |= ENTER_TMP_TO_S;
-			} else {
-				if (word_arg_count == 4)
+	if (options & SLJIT_ENTER_REG_ARG) {
+		args_size = 3 * SSIZE_OF(sw);
+
+		while (arg_types) {
+			if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64) {
+				word_arg_count++;
+				if (word_arg_count >= 4)
 					status |= ENTER_TMP_TO_R4;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-				if (word_arg_count == 3)
-					status |= ENTER_R2_USED;
-#endif
 			}
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-			if (word_arg_count <= 2 && !(options & SLJIT_ENTER_CDECL))
-				break;
-#endif
-
-			args_size += SSIZE_OF(sw);
-			break;
+			arg_types >>= SLJIT_ARG_SHIFT;
 		}
-		types >>= SLJIT_ARG_SHIFT;
+
+		compiler->args_size = 0;
+	} else {
+		types = arg_types;
+		saved_arg_count = 0;
+		float_arg_count = 0;
+		args_size = SSIZE_OF(sw);
+		while (types) {
+			switch (types & SLJIT_ARG_MASK) {
+			case SLJIT_ARG_TYPE_F64:
+				float_arg_count++;
+				FAIL_IF(emit_sse2_load(compiler, 0, float_arg_count, SLJIT_MEM1(SLJIT_SP), args_size));
+				args_size += SSIZE_OF(f64);
+				break;
+			case SLJIT_ARG_TYPE_F32:
+				float_arg_count++;
+				FAIL_IF(emit_sse2_load(compiler, 1, float_arg_count, SLJIT_MEM1(SLJIT_SP), args_size));
+				args_size += SSIZE_OF(f32);
+				break;
+			default:
+				word_arg_count++;
+
+				if (!(types & SLJIT_ARG_TYPE_SCRATCH_REG))
+					saved_arg_count++;
+
+				if (word_arg_count == 4) {
+					if (types & SLJIT_ARG_TYPE_SCRATCH_REG) {
+						status |= ENTER_TMP_TO_R4;
+						arg_types &= ~(SLJIT_ARG_FULL_MASK << 3 * SLJIT_ARG_SHIFT);
+					} else if (saved_arg_count == 4) {
+						status |= ENTER_TMP_TO_S;
+						arg_types &= ~(SLJIT_ARG_FULL_MASK << 3 * SLJIT_ARG_SHIFT);
+					}
+				}
+
+				args_size += SSIZE_OF(sw);
+				break;
+			}
+			types >>= SLJIT_ARG_SHIFT;
+		}
+
+		args_size -= SSIZE_OF(sw);
+		compiler->args_size = args_size;
 	}
 
-	args_size -= SSIZE_OF(sw);
-	compiler->args_size = args_size;
+	size = (scratches > 9 ? (scratches - 9) : 0) + (saveds <= 3 ? saveds : 3) - kept_saveds_count;
+	if (!(options & SLJIT_ENTER_REG_ARG))
+		size++;
 
-	/* [esp+0] for saving temporaries and function calls. */
-	locals_offset = 2 * SSIZE_OF(sw);
+	if (size != 0) {
+		inst = (sljit_u8*)ensure_buf(compiler, (sljit_uw)(size + 1));
+		FAIL_IF(!inst);
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if ((options & SLJIT_ENTER_CDECL) && scratches >= 3)
-		locals_offset = 4 * SSIZE_OF(sw);
-#else
-	if (scratches >= 3)
-		locals_offset = 4 * SSIZE_OF(sw);
-#endif
+		INC_SIZE((sljit_uw)size);
 
-	compiler->scratches_offset = locals_offset;
+		if (!(options & SLJIT_ENTER_REG_ARG))
+			PUSH_REG(reg_map[TMP_REG1]);
 
-	if (scratches > 3)
-		locals_offset += ((scratches > (3 + 6)) ? 6 : (scratches - 3)) * SSIZE_OF(sw);
+		if ((saveds > 2 && kept_saveds_count <= 2) || scratches > 9)
+			PUSH_REG(reg_map[SLJIT_S2]);
+		if ((saveds > 1 && kept_saveds_count <= 1) || scratches > 10)
+			PUSH_REG(reg_map[SLJIT_S1]);
+		if ((saveds > 0 && kept_saveds_count == 0) || scratches > 11)
+			PUSH_REG(reg_map[SLJIT_S0]);
 
-	if (saveds > 3)
-		locals_offset += (saveds - 3) * SSIZE_OF(sw);
-
-	compiler->locals_offset = locals_offset;
-
-	size = 1 + (scratches > 9 ? (scratches - 9) : 0) + (saveds <= 3 ? saveds : 3);
-	inst = (sljit_u8*)ensure_buf(compiler, (sljit_uw)(size + 1));
-	FAIL_IF(!inst);
-
-	INC_SIZE((sljit_uw)size);
-	PUSH_REG(reg_map[TMP_REG1]);
-	if (saveds > 2 || scratches > 9)
-		PUSH_REG(reg_map[SLJIT_S2]);
-	if (saveds > 1 || scratches > 10)
-		PUSH_REG(reg_map[SLJIT_S1]);
-	if (saveds > 0 || scratches > 11)
-		PUSH_REG(reg_map[SLJIT_S0]);
-
-	size *= SSIZE_OF(sw);
+		size *= SSIZE_OF(sw);
+	}
 
 	if (status & (ENTER_TMP_TO_R4 | ENTER_TMP_TO_S))
 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), args_size + size);
 
 	size += SSIZE_OF(sw);
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (!(options & SLJIT_ENTER_CDECL))
-		size += args_size;
-#endif
-
-	local_size = ((locals_offset + local_size + size + 0xf) & ~0xf) - size;
+	local_size = ((SLJIT_LOCALS_OFFSET_BASE + local_size + size + 0xf) & ~0xf) - size;
 	compiler->local_size = local_size;
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (!(options & SLJIT_ENTER_CDECL))
-		size -= args_size;
-#endif
-
 	word_arg_count = 0;
 	saved_arg_count = 0;
 	args_size = size;
@@ -386,64 +385,27 @@
 			break;
 		default:
 			word_arg_count++;
+			SLJIT_ASSERT(word_arg_count <= 3 || (word_arg_count == 4 && !(status & (ENTER_TMP_TO_R4 | ENTER_TMP_TO_S))));
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-			if (!(options & SLJIT_ENTER_CDECL) && word_arg_count <= 2) {
-				if (word_arg_count == 1) {
-					if (status & ENTER_R2_USED) {
-						EMIT_MOV(compiler, (arg_types & SLJIT_ARG_TYPE_SCRATCH_REG) ? SLJIT_R0 : SLJIT_S0, 0, SLJIT_R2, 0);
-					} else if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
-						status |= ENTER_R2_TO_S;
-						saved_arg_count++;
-					} else
-						status |= ENTER_R2_TO_R0;
-				} else if (!(arg_types & SLJIT_ARG_TYPE_SCRATCH_REG)) {
-					status |= ENTER_R1_TO_S;
-					saved_arg_count++;
-				}
-				break;
-			}
-#endif
 			if (arg_types & SLJIT_ARG_TYPE_SCRATCH_REG) {
-				SLJIT_ASSERT(word_arg_count <= 3 || (status & ENTER_TMP_TO_R4));
-
-				if (word_arg_count <= 3) {
 #ifdef _WIN32
-					if (word_arg_count == 3 && local_size > 4 * 4096)
-						r2_offset = local_size + args_size;
-					else
+				if (word_arg_count == 3 && local_size > 4 * 4096)
+					r2_offset = local_size + args_size;
+				else
 #endif
-						EMIT_MOV(compiler, word_arg_count, 0, SLJIT_MEM1(SLJIT_SP), args_size);
-				}
-			} else {
-				SLJIT_ASSERT(saved_arg_count <= 3 || (status & ENTER_TMP_TO_S));
+					EMIT_MOV(compiler, word_arg_count, 0, SLJIT_MEM1(SLJIT_SP), args_size);
 
-				if (saved_arg_count <= 3)
-					EMIT_MOV(compiler, SLJIT_S0 - saved_arg_count, 0, SLJIT_MEM1(SLJIT_SP), args_size);
+			} else {
+				EMIT_MOV(compiler, SLJIT_S0 - saved_arg_count, 0, SLJIT_MEM1(SLJIT_SP), args_size);
 				saved_arg_count++;
 			}
+
 			args_size += SSIZE_OF(sw);
 			break;
 		}
 		arg_types >>= SLJIT_ARG_SHIFT;
 	}
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (!(options & SLJIT_ENTER_CDECL)) {
-		if (status & ENTER_R2_TO_R0)
-			EMIT_MOV(compiler, SLJIT_R0, 0, SLJIT_R2, 0);
-
-		saved_arg_count = 0;
-		if (status & ENTER_R2_TO_S) {
-			EMIT_MOV(compiler, SLJIT_S0, 0, SLJIT_R2, 0);
-			saved_arg_count++;
-		}
-
-		if (status & ENTER_R1_TO_S)
-			EMIT_MOV(compiler, SLJIT_S0 - saved_arg_count, 0, SLJIT_R1, 0);
-	}
-#endif
-
 	SLJIT_ASSERT(SLJIT_LOCALS_OFFSET > 0);
 
 #ifdef _WIN32
@@ -459,6 +421,18 @@
 				BINARY_IMM32(OR, 0, SLJIT_MEM1(SLJIT_SP), -4096 * 3);
 		}
 		else {
+			if (options & SLJIT_ENTER_REG_ARG) {
+				SLJIT_ASSERT(r2_offset == -1);
+
+				inst = (sljit_u8*)ensure_buf(compiler, (sljit_uw)(1 + 1));
+				FAIL_IF(!inst);
+				INC_SIZE(1);
+				PUSH_REG(reg_map[SLJIT_R2]);
+
+				local_size -= SSIZE_OF(sw);
+				r2_offset = local_size;
+			}
+
 			EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_IMM, local_size >> 12);
 
 			BINARY_IMM32(OR, 0, SLJIT_MEM1(SLJIT_SP), -4096);
@@ -490,8 +464,20 @@
 
 #endif /* _WIN32 */
 
+	size = SLJIT_LOCALS_OFFSET_BASE - SSIZE_OF(sw);
+	kept_saveds_count = SLJIT_R3 - kept_saveds_count;
+
+	while (saved_arg_count > 3) {
+		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), size, kept_saveds_count, 0);
+		kept_saveds_count++;
+		size -= SSIZE_OF(sw);
+		saved_arg_count--;
+	}
+
 	if (status & (ENTER_TMP_TO_R4 | ENTER_TMP_TO_S)) {
-		size = (status & ENTER_TMP_TO_R4) ? compiler->scratches_offset : compiler->locals_offset - SSIZE_OF(sw);
+		if (status & ENTER_TMP_TO_R4)
+			size = 2 * SSIZE_OF(sw);
+
 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), size, TMP_REG1, 0);
 	}
 
@@ -502,10 +488,7 @@
 	sljit_s32 options, sljit_s32 arg_types, sljit_s32 scratches, sljit_s32 saveds,
 	sljit_s32 fscratches, sljit_s32 fsaveds, sljit_s32 local_size)
 {
-	sljit_s32 args_size, locals_offset;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	sljit_s32 word_arg_count = 0;
-#endif
+	sljit_s32 args_size;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_set_context(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
@@ -513,87 +496,88 @@
 
 	arg_types >>= SLJIT_ARG_SHIFT;
 	args_size = 0;
-	while (arg_types) {
-		switch (arg_types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			args_size += SSIZE_OF(f64);
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			args_size += SSIZE_OF(f32);
-			break;
-		default:
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-			if (word_arg_count >= 2)
+
+	if (!(options & SLJIT_ENTER_REG_ARG)) {
+		while (arg_types) {
+			switch (arg_types & SLJIT_ARG_MASK) {
+			case SLJIT_ARG_TYPE_F64:
+				args_size += SSIZE_OF(f64);
+				break;
+			case SLJIT_ARG_TYPE_F32:
+				args_size += SSIZE_OF(f32);
+				break;
+			default:
 				args_size += SSIZE_OF(sw);
-			word_arg_count++;
-#else
-			args_size += SSIZE_OF(sw);
-#endif
-			break;
+				break;
+			}
+			arg_types >>= SLJIT_ARG_SHIFT;
 		}
-		arg_types >>= SLJIT_ARG_SHIFT;
 	}
 
 	compiler->args_size = args_size;
 
-	/* [esp+0] for saving temporaries and function calls. */
-	locals_offset = 2 * SSIZE_OF(sw);
+	/* [esp+0] for saving temporaries and for function calls. */
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if ((options & SLJIT_ENTER_CDECL) && scratches >= 3)
-		locals_offset = 4 * SSIZE_OF(sw);
-#else
-	if (scratches >= 3)
-		locals_offset = 4 * SSIZE_OF(sw);
-#endif
+	saveds = (1 + (scratches > 9 ? (scratches - 9) : 0) + (saveds <= 3 ? saveds : 3) - SLJIT_KEPT_SAVEDS_COUNT(options)) * SSIZE_OF(sw);
 
-	compiler->scratches_offset = locals_offset;
+	/* Saving ebp. */
+	if (!(options & SLJIT_ENTER_REG_ARG))
+		saveds += SSIZE_OF(sw);
 
-	if (scratches > 3)
-		locals_offset += ((scratches > (3 + 6)) ? 6 : (scratches - 3)) * SSIZE_OF(sw);
-
-	if (saveds > 3)
-		locals_offset += (saveds - 3) * SSIZE_OF(sw);
-
-	compiler->locals_offset = locals_offset;
-
-	saveds = (2 + (scratches > 9 ? (scratches - 9) : 0) + (saveds <= 3 ? saveds : 3)) * SSIZE_OF(sw);
-
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (!(options & SLJIT_ENTER_CDECL))
-		saveds += args_size;
-#endif
-
-	compiler->local_size = ((locals_offset + local_size + saveds + 0xf) & ~0xf) - saveds;
+	compiler->local_size = ((SLJIT_LOCALS_OFFSET_BASE + local_size + saveds + 0xf) & ~0xf) - saveds;
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
+	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	sljit_s32 local_size, saveds;
 	sljit_uw size;
 	sljit_u8 *inst;
 
-	size = (sljit_uw)(1 + (compiler->scratches > 9 ? (compiler->scratches - 9) : 0) +
-		(compiler->saveds <= 3 ? compiler->saveds : 3));
+	size = (sljit_uw)((compiler->scratches > 9 ? (compiler->scratches - 9) : 0) +
+		(compiler->saveds <= 3 ? compiler->saveds : 3) - kept_saveds_count);
+
+	local_size = compiler->local_size;
+
+	if (!(compiler->options & SLJIT_ENTER_REG_ARG))
+		size++;
+	else if (is_return_to && size == 0) {
+		local_size += SSIZE_OF(sw);
+		is_return_to = 0;
+	}
+
+	if (local_size > 0)
+		BINARY_IMM32(ADD, local_size, SLJIT_SP, 0);
+
+	if (size == 0)
+		return SLJIT_SUCCESS;
+
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 	FAIL_IF(!inst);
 
 	INC_SIZE(size);
 
-	if (compiler->saveds > 0 || compiler->scratches > 11)
+	saveds = compiler->saveds;
+
+	if ((saveds > 0 && kept_saveds_count == 0) || compiler->scratches > 11)
 		POP_REG(reg_map[SLJIT_S0]);
-	if (compiler->saveds > 1 || compiler->scratches > 10)
+	if ((saveds > 1 && kept_saveds_count <= 1) || compiler->scratches > 10)
 		POP_REG(reg_map[SLJIT_S1]);
-	if (compiler->saveds > 2 || compiler->scratches > 9)
+	if ((saveds > 2 && kept_saveds_count <= 2) || compiler->scratches > 9)
 		POP_REG(reg_map[SLJIT_S2]);
-	POP_REG(reg_map[TMP_REG1]);
+
+	if (!(compiler->options & SLJIT_ENTER_REG_ARG))
+		POP_REG(reg_map[TMP_REG1]);
+
+	if (is_return_to)
+		BINARY_IMM32(ADD, sizeof(sljit_sw), SLJIT_SP, 0);
 
 	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_void(struct sljit_compiler *compiler)
 {
-	sljit_uw size;
 	sljit_u8 *inst;
 
 	CHECK_ERROR();
@@ -602,143 +586,45 @@
 	SLJIT_ASSERT(compiler->args_size >= 0);
 	SLJIT_ASSERT(compiler->local_size > 0);
 
-	BINARY_IMM32(ADD, compiler->local_size, SLJIT_SP, 0);
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
-
-	size = 1;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (compiler->args_size > 0 && !(compiler->options & SLJIT_ENTER_CDECL))
-		size = 3;
-#endif
-	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
+	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
 	FAIL_IF(!inst);
-
-	INC_SIZE(size);
-
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (compiler->args_size > 0 && !(compiler->options & SLJIT_ENTER_CDECL)) {
-		RET_I16(U8(compiler->args_size));
-		return SLJIT_SUCCESS;
-	}
-#endif
-
+	INC_SIZE(1);
 	RET();
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 src_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	if ((src & SLJIT_MEM) || (src > SLJIT_R2 && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+		CHECK_EXTRA_REGS(src, srcw, (void)0);
+
+		src_r = (compiler->options & SLJIT_ENTER_REG_ARG) ? TMP_REG1 : SLJIT_R1;
+
+		EMIT_MOV(compiler, src_r, 0, src, srcw);
+		src = src_r;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-
-static sljit_sw c_fast_call_get_stack_size(sljit_s32 arg_types, sljit_s32 *word_arg_count_ptr)
-{
-	sljit_sw stack_size = 0;
-	sljit_s32 word_arg_count = 0;
-
-	arg_types >>= SLJIT_ARG_SHIFT;
-
-	while (arg_types) {
-		switch (arg_types & SLJIT_ARG_MASK) {
-		case SLJIT_ARG_TYPE_F64:
-			stack_size += SSIZE_OF(f64);
-			break;
-		case SLJIT_ARG_TYPE_F32:
-			stack_size += SSIZE_OF(f32);
-			break;
-		default:
-			word_arg_count++;
-			if (word_arg_count > 2)
-				stack_size += SSIZE_OF(sw);
-			break;
-		}
-
-		arg_types >>= SLJIT_ARG_SHIFT;
-	}
-
-	if (word_arg_count_ptr)
-		*word_arg_count_ptr = word_arg_count;
-
-	return stack_size;
-}
-
-static sljit_s32 c_fast_call_with_args(struct sljit_compiler *compiler,
-	sljit_s32 arg_types, sljit_sw stack_size, sljit_s32 word_arg_count, sljit_s32 swap_args)
-{
-	sljit_u8 *inst;
-	sljit_s32 float_arg_count;
-
-	if (stack_size == SSIZE_OF(sw) && word_arg_count == 3) {
-		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
-		FAIL_IF(!inst);
-		INC_SIZE(1);
-		PUSH_REG(reg_map[SLJIT_R2]);
-	}
-	else if (stack_size > 0) {
-		if (word_arg_count >= 4)
-			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), compiler->scratches_offset);
-
-		BINARY_IMM32(SUB, stack_size, SLJIT_SP, 0);
-
-		stack_size = 0;
-		arg_types >>= SLJIT_ARG_SHIFT;
-		word_arg_count = 0;
-		float_arg_count = 0;
-		while (arg_types) {
-			switch (arg_types & SLJIT_ARG_MASK) {
-			case SLJIT_ARG_TYPE_F64:
-				float_arg_count++;
-				FAIL_IF(emit_sse2_store(compiler, 0, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
-				stack_size += SSIZE_OF(f64);
-				break;
-			case SLJIT_ARG_TYPE_F32:
-				float_arg_count++;
-				FAIL_IF(emit_sse2_store(compiler, 1, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
-				stack_size += SSIZE_OF(f32);
-				break;
-			default:
-				word_arg_count++;
-				if (word_arg_count == 3) {
-					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size, SLJIT_R2, 0);
-					stack_size += SSIZE_OF(sw);
-				}
-				else if (word_arg_count == 4) {
-					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size, TMP_REG1, 0);
-					stack_size += SSIZE_OF(sw);
-				}
-				break;
-			}
-
-			arg_types >>= SLJIT_ARG_SHIFT;
-		}
-	}
-
-	if (word_arg_count > 0) {
-		if (swap_args) {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
-			FAIL_IF(!inst);
-			INC_SIZE(1);
-
-			*inst++ = U8(XCHG_EAX_r | reg_map[SLJIT_R2]);
-		}
-		else {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
-			FAIL_IF(!inst);
-			INC_SIZE(2);
-
-			*inst++ = MOV_r_rm;
-			*inst++ = U8(MOD_REG | (reg_map[SLJIT_R2] << 3) | reg_map[SLJIT_R0]);
-		}
-	}
-
-	return SLJIT_SUCCESS;
-}
-
-#endif
-
-static sljit_s32 cdecl_call_get_stack_size(struct sljit_compiler *compiler, sljit_s32 arg_types, sljit_s32 *word_arg_count_ptr)
+static sljit_s32 call_get_stack_size(sljit_s32 arg_types, sljit_s32 *word_arg_count_ptr)
 {
 	sljit_sw stack_size = 0;
 	sljit_s32 word_arg_count = 0;
@@ -765,25 +651,31 @@
 	if (word_arg_count_ptr)
 		*word_arg_count_ptr = word_arg_count;
 
-	if (stack_size <= compiler->scratches_offset)
+	if (stack_size <= 4 * SSIZE_OF(sw))
 		return 0;
 
-	return ((stack_size - compiler->scratches_offset + 0xf) & ~0xf);
+	return ((stack_size - (4 * SSIZE_OF(sw)) + 0xf) & ~0xf);
 }
 
-static sljit_s32 cdecl_call_with_args(struct sljit_compiler *compiler,
-	sljit_s32 arg_types, sljit_sw stack_size, sljit_s32 word_arg_count)
+static sljit_s32 call_with_args(struct sljit_compiler *compiler,
+	sljit_s32 arg_types, sljit_sw stack_size, sljit_s32 word_arg_count, sljit_s32 keep_tmp1)
 {
-	sljit_s32 float_arg_count = 0;
+	sljit_s32 float_arg_count = 0, arg4_reg = 0, arg_offset;
 	sljit_u8 *inst;
 
-	if (word_arg_count >= 4)
-		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), compiler->scratches_offset);
+	if (word_arg_count >= 4) {
+		arg4_reg = SLJIT_R0;
+
+		if (!keep_tmp1) {
+			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), 2 * SSIZE_OF(sw));
+			arg4_reg = TMP_REG1;
+		}
+	}
 
 	if (stack_size > 0)
 		BINARY_IMM32(SUB, stack_size, SLJIT_SP, 0);
 
-	stack_size = 0;
+	arg_offset = 0;
 	word_arg_count = 0;
 	arg_types >>= SLJIT_ARG_SHIFT;
 
@@ -791,18 +683,22 @@
 		switch (arg_types & SLJIT_ARG_MASK) {
 		case SLJIT_ARG_TYPE_F64:
 			float_arg_count++;
-			FAIL_IF(emit_sse2_store(compiler, 0, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
-			stack_size += SSIZE_OF(f64);
+			FAIL_IF(emit_sse2_store(compiler, 0, SLJIT_MEM1(SLJIT_SP), arg_offset, float_arg_count));
+			arg_offset += SSIZE_OF(f64);
 			break;
 		case SLJIT_ARG_TYPE_F32:
 			float_arg_count++;
-			FAIL_IF(emit_sse2_store(compiler, 1, SLJIT_MEM1(SLJIT_SP), stack_size, float_arg_count));
-			stack_size += SSIZE_OF(f32);
+			FAIL_IF(emit_sse2_store(compiler, 1, SLJIT_MEM1(SLJIT_SP), arg_offset, float_arg_count));
+			arg_offset += SSIZE_OF(f32);
 			break;
 		default:
 			word_arg_count++;
-			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size, (word_arg_count >= 4) ? TMP_REG1 : word_arg_count, 0);
-			stack_size += SSIZE_OF(sw);
+			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), arg_offset, (word_arg_count >= 4) ? arg4_reg : word_arg_count, 0);
+
+			if (word_arg_count == 1 && arg4_reg == SLJIT_R0)
+				EMIT_MOV(compiler, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), 2 * SSIZE_OF(sw) + stack_size);
+
+			arg_offset += SSIZE_OF(sw);
 			break;
 		}
 
@@ -840,21 +736,19 @@
 	sljit_s32 *extra_space, sljit_s32 arg_types,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_sw args_size, prev_args_size, saved_regs_size;
+	sljit_sw args_size, saved_regs_size;
 	sljit_sw types, word_arg_count, float_arg_count;
 	sljit_sw stack_size, prev_stack_size, min_size, offset;
 	sljit_sw word_arg4_offset;
 	sljit_u8 r2_offset = 0;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	sljit_u8 fast_call = (*extra_space & 0xff) == SLJIT_CALL;
-#endif
+	sljit_s32 kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
 	sljit_u8* inst;
 
 	ADJUST_LOCAL_OFFSET(src, srcw);
 	CHECK_EXTRA_REGS(src, srcw, (void)0);
 
 	saved_regs_size = (1 + (compiler->scratches > 9 ? (compiler->scratches - 9) : 0)
-		+ (compiler->saveds <= 3 ? compiler->saveds : 3)) * SSIZE_OF(sw);
+		+ (compiler->saveds <= 3 ? compiler->saveds : 3) - kept_saveds_count) * SSIZE_OF(sw);
 
 	word_arg_count = 0;
 	float_arg_count = 0;
@@ -876,30 +770,15 @@
 			break;
 		default:
 			word_arg_count++;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-			if (!fast_call || word_arg_count > 2)
-				args_size += SSIZE_OF(sw);
-#else
 			args_size += SSIZE_OF(sw);
-#endif
 			break;
 		}
 		arg_types >>= SLJIT_ARG_SHIFT;
 	}
 
-	if (args_size <= compiler->args_size
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-			&& (!(compiler->options & SLJIT_ENTER_CDECL) || args_size == 0 || !fast_call)
-#endif /* SLJIT_X86_32_FASTCALL */
-			&& 1) {
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-		*extra_space = fast_call ? 0 : args_size;
-		prev_args_size = compiler->args_size;
-		stack_size = prev_args_size + SSIZE_OF(sw) + saved_regs_size;
-#else /* !SLJIT_X86_32_FASTCALL */
+	if (args_size <= compiler->args_size) {
 		*extra_space = 0;
 		stack_size = args_size + SSIZE_OF(sw) + saved_regs_size;
-#endif /* SLJIT_X86_32_FASTCALL */
 
 		offset = stack_size + compiler->local_size;
 
@@ -911,37 +790,6 @@
 			EMIT_MOV(compiler, SLJIT_R0, 0, src, srcw);
 		}
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-		if (!(compiler->options & SLJIT_ENTER_CDECL)) {
-			if (!fast_call)
-				offset -= SSIZE_OF(sw);
-
-			if (word_arg_count >= 3) {
-				word_arg4_offset = SSIZE_OF(sw);
-
-				if (word_arg_count + float_arg_count >= 4) {
-					word_arg4_offset = SSIZE_OF(sw) + SSIZE_OF(sw);
-					if ((types & SLJIT_ARG_MASK) == SLJIT_ARG_TYPE_F64)
-						word_arg4_offset = SSIZE_OF(sw) + SSIZE_OF(f64);
-				}
-
-				/* In cdecl mode, at least one more word value must
-				 * be present on the stack before the return address. */
-				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset - word_arg4_offset, SLJIT_R2, 0);
-			}
-
-			if (fast_call) {
-				if (args_size < prev_args_size) {
-					EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), offset - prev_args_size - SSIZE_OF(sw));
-					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset - args_size - SSIZE_OF(sw), SLJIT_R2, 0);
-				}
-			} else if (prev_args_size > 0) {
-				EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), offset - prev_args_size);
-				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R2, 0);
-			}
-		}
-#endif /* SLJIT_X86_32_FASTCALL */
-
 		while (types != 0) {
 			switch (types & SLJIT_ARG_MASK) {
 			case SLJIT_ARG_TYPE_F64:
@@ -957,12 +805,6 @@
 			default:
 				switch (word_arg_count) {
 				case 1:
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-					if (fast_call) {
-						EMIT_MOV(compiler, SLJIT_R2, 0, r2_offset != 0 ? SLJIT_MEM1(SLJIT_SP) : SLJIT_R0, 0);
-						break;
-					}
-#endif
 					offset -= SSIZE_OF(sw);
 					if (r2_offset != 0) {
 						EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), 0);
@@ -971,10 +813,6 @@
 						EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R0, 0);
 					break;
 				case 2:
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-					if (fast_call)
-						break;
-#endif
 					offset -= SSIZE_OF(sw);
 					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R1, 0);
 					break;
@@ -983,7 +821,7 @@
 					break;
 				case 4:
 					offset -= SSIZE_OF(sw);
-					EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), compiler->scratches_offset);
+					EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), 2 * SSIZE_OF(sw));
 					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R2, 0);
 					break;
 				}
@@ -993,15 +831,7 @@
 			types >>= SLJIT_ARG_SHIFT;
 		}
 
-		BINARY_IMM32(ADD, compiler->local_size, SLJIT_SP, 0);
-		FAIL_IF(emit_stack_frame_release(compiler));
-
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-		if (args_size < prev_args_size)
-			BINARY_IMM32(ADD, prev_args_size - args_size, SLJIT_SP, 0);
-#endif
-
-		return SLJIT_SUCCESS;
+		return emit_stack_frame_release(compiler, 0);
 	}
 
 	stack_size = args_size + SSIZE_OF(sw);
@@ -1014,16 +844,10 @@
 	if (word_arg_count >= 3)
 		stack_size += SSIZE_OF(sw);
 
-	prev_args_size = 0;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (!(compiler->options & SLJIT_ENTER_CDECL))
-		prev_args_size = compiler->args_size;
-#endif
-
-	prev_stack_size = prev_args_size + SSIZE_OF(sw) + saved_regs_size;
+	prev_stack_size = SSIZE_OF(sw) + saved_regs_size;
 	min_size = prev_stack_size + compiler->local_size;
 
-	word_arg4_offset = compiler->scratches_offset;
+	word_arg4_offset = 2 * SSIZE_OF(sw);
 
 	if (stack_size > min_size) {
 		BINARY_IMM32(SUB, stack_size - min_size, SLJIT_SP, 0);
@@ -1050,75 +874,30 @@
 	}
 
 	/* Restore saved registers. */
-	offset = stack_size - prev_args_size - 2 * SSIZE_OF(sw);
+	offset = stack_size - 2 * SSIZE_OF(sw);
 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), offset);
 
 	if (compiler->saveds > 2 || compiler->scratches > 9) {
 		offset -= SSIZE_OF(sw);
 		EMIT_MOV(compiler, SLJIT_S2, 0, SLJIT_MEM1(SLJIT_SP), offset);
 	}
-	if (compiler->saveds > 1 || compiler->scratches > 10) {
+	if ((compiler->saveds > 1 && kept_saveds_count <= 1) || compiler->scratches > 10) {
 		offset -= SSIZE_OF(sw);
 		EMIT_MOV(compiler, SLJIT_S1, 0, SLJIT_MEM1(SLJIT_SP), offset);
 	}
-	if (compiler->saveds > 0 || compiler->scratches > 11) {
+	if ((compiler->saveds > 0 && kept_saveds_count == 0) || compiler->scratches > 11) {
 		offset -= SSIZE_OF(sw);
 		EMIT_MOV(compiler, SLJIT_S0, 0, SLJIT_MEM1(SLJIT_SP), offset);
 	}
 
 	/* Copy fourth argument and return address. */
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if (fast_call) {
-		offset = stack_size;
-		*extra_space = 0;
+	offset = stack_size - SSIZE_OF(sw);
+	*extra_space = args_size;
 
-		if (word_arg_count >= 4 && prev_args_size == 0) {
-			offset -= SSIZE_OF(sw);
-			inst = emit_x86_instruction(compiler, 1, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), offset);
-			FAIL_IF(!inst);
-			*inst = XCHG_r_rm;
-
-			SLJIT_ASSERT(args_size != prev_args_size);
-		} else {
-			if (word_arg_count >= 4) {
-				offset -= SSIZE_OF(sw);
-				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R2, 0);
-			}
-
-			if (args_size != prev_args_size)
-				EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), stack_size - prev_args_size - SSIZE_OF(sw));
-		}
-
-		if (args_size != prev_args_size)
-			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size - args_size - SSIZE_OF(sw), SLJIT_R2, 0);
-	} else {
-#endif /* SLJIT_X86_32_FASTCALL */
-		offset = stack_size - SSIZE_OF(sw);
-		*extra_space = args_size;
-
-		if (word_arg_count >= 4 && prev_args_size == SSIZE_OF(sw)) {
-			offset -= SSIZE_OF(sw);
-			inst = emit_x86_instruction(compiler, 1, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), offset);
-			FAIL_IF(!inst);
-			*inst = XCHG_r_rm;
-
-			SLJIT_ASSERT(prev_args_size > 0);
-		} else {
-			if (word_arg_count >= 4) {
-				offset -= SSIZE_OF(sw);
-				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R2, 0);
-			}
-
-			if (prev_args_size > 0)
-				EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), stack_size - prev_args_size - SSIZE_OF(sw));
-		}
-
-		/* Copy return address. */
-		if (prev_args_size > 0)
-			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), stack_size - SSIZE_OF(sw), SLJIT_R2, 0);
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
+	if (word_arg_count >= 4) {
+		offset -= SSIZE_OF(sw);
+		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R2, 0);
 	}
-#endif /* SLJIT_X86_32_FASTCALL */
 
 	while (types != 0) {
 		switch (types & SLJIT_ARG_MASK) {
@@ -1135,12 +914,6 @@
 		default:
 			switch (word_arg_count) {
 			case 1:
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-				if (fast_call) {
-					EMIT_MOV(compiler, SLJIT_R2, 0, r2_offset != 0 ? SLJIT_MEM1(SLJIT_SP) : SLJIT_R0, 0);
-					break;
-				}
-#endif
 				offset -= SSIZE_OF(sw);
 				if (r2_offset != 0) {
 					EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), 0);
@@ -1149,10 +922,6 @@
 					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R0, 0);
 				break;
 			case 2:
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-				if (fast_call)
-					break;
-#endif
 				offset -= SSIZE_OF(sw);
 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, SLJIT_R1, 0);
 				break;
@@ -1168,12 +937,6 @@
 		types >>= SLJIT_ARG_SHIFT;
 	}
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	/* Skip return address. */
-	if (fast_call)
-		offset -= SSIZE_OF(sw);
-#endif
-
 	SLJIT_ASSERT(offset >= 0);
 
 	if (offset == 0)
@@ -1198,6 +961,38 @@
 	return SLJIT_SUCCESS;
 }
 
+static sljit_s32 tail_call_reg_arg_with_args(struct sljit_compiler *compiler, sljit_s32 arg_types)
+{
+	sljit_s32 word_arg_count = 0;
+	sljit_s32 kept_saveds_count, offset;
+
+	arg_types >>= SLJIT_ARG_SHIFT;
+
+	while (arg_types) {
+		if ((arg_types & SLJIT_ARG_MASK) < SLJIT_ARG_TYPE_F64)
+			word_arg_count++;
+
+		arg_types >>= SLJIT_ARG_SHIFT;
+	}
+
+	if (word_arg_count < 4)
+		return SLJIT_SUCCESS;
+
+	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), 2 * SSIZE_OF(sw));
+
+	kept_saveds_count = SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	offset = compiler->local_size + 3 * SSIZE_OF(sw);
+
+	if ((compiler->saveds > 0 && kept_saveds_count == 0) || compiler->scratches > 11)
+		offset += SSIZE_OF(sw);
+	if ((compiler->saveds > 1 && kept_saveds_count <= 1) || compiler->scratches > 10)
+		offset += SSIZE_OF(sw);
+	if ((compiler->saveds > 2 && kept_saveds_count <= 2) || compiler->scratches > 9)
+		offset += SSIZE_OF(sw);
+
+	return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), offset, TMP_REG1, 0);
+}
+
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 arg_types)
 {
@@ -1209,18 +1004,21 @@
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
 	if (type & SLJIT_CALL_RETURN) {
+		if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
+			PTR_FAIL_IF(tail_call_reg_arg_with_args(compiler, arg_types));
+			PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
+
+			SLJIT_SKIP_CHECKS(compiler);
+			return sljit_emit_jump(compiler, SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP));
+		}
+
 		stack_size = type;
 		PTR_FAIL_IF(tail_call_with_args(compiler, &stack_size, arg_types, SLJIT_IMM, 0));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-			|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-		compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
 
-		if (stack_size == 0) {
-			type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
-			return sljit_emit_jump(compiler, type);
-		}
+		if (stack_size == 0)
+			return sljit_emit_jump(compiler, SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP));
 
 		jump = sljit_emit_jump(compiler, type);
 		PTR_FAIL_IF(jump == NULL);
@@ -1229,32 +1027,15 @@
 		return jump;
 	}
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	if ((type & 0xff) == SLJIT_CALL) {
-		stack_size = c_fast_call_get_stack_size(arg_types, &word_arg_count);
-		PTR_FAIL_IF(c_fast_call_with_args(compiler, arg_types, stack_size, word_arg_count, 0));
-
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-		compiler->skip_checks = 1;
-#endif
-
-		jump = sljit_emit_jump(compiler, type);
-		PTR_FAIL_IF(jump == NULL);
-
-		PTR_FAIL_IF(post_call_with_args(compiler, arg_types, 0));
-		return jump;
+	if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_jump(compiler, type);
 	}
-#endif
 
-	stack_size = cdecl_call_get_stack_size(compiler, arg_types, &word_arg_count);
-	PTR_FAIL_IF(cdecl_call_with_args(compiler, arg_types, stack_size, word_arg_count));
+	stack_size = call_get_stack_size(arg_types, &word_arg_count);
+	PTR_FAIL_IF(call_with_args(compiler, arg_types, stack_size, word_arg_count, 0));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
-
+	SLJIT_SKIP_CHECKS(compiler);
 	jump = sljit_emit_jump(compiler, type);
 	PTR_FAIL_IF(jump == NULL);
 
@@ -1268,14 +1049,29 @@
 {
 	sljit_sw stack_size = 0;
 	sljit_s32 word_arg_count;
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	sljit_s32 swap_args;
-#endif
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
 
 	if (type & SLJIT_CALL_RETURN) {
+		if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
+			FAIL_IF(tail_call_reg_arg_with_args(compiler, arg_types));
+
+			if ((src & SLJIT_MEM) || (src > SLJIT_R2 && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
+				ADJUST_LOCAL_OFFSET(src, srcw);
+				CHECK_EXTRA_REGS(src, srcw, (void)0);
+
+				EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+				src = TMP_REG1;
+				srcw = 0;
+			}
+
+			FAIL_IF(emit_stack_frame_release(compiler, 0));
+
+			SLJIT_SKIP_CHECKS(compiler);
+			return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+		}
+
 		stack_size = type;
 		FAIL_IF(tail_call_with_args(compiler, &stack_size, arg_types, src, srcw));
 
@@ -1284,10 +1080,7 @@
 			srcw = 0;
 		}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-			|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-		compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
 
 		if (stack_size == 0)
 			return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
@@ -1296,59 +1089,59 @@
 		return emit_tail_call_end(compiler, stack_size);
 	}
 
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	SLJIT_ASSERT(reg_map[SLJIT_R0] == 0 && reg_map[SLJIT_R2] == 1 && SLJIT_R0 == 1 && SLJIT_R2 == 3);
-
-	if ((type & 0xff) == SLJIT_CALL) {
-		stack_size = c_fast_call_get_stack_size(arg_types, &word_arg_count);
-		swap_args = 0;
-
-		if (word_arg_count > 0) {
-			if ((src & REG_MASK) == SLJIT_R2 || OFFS_REG(src) == SLJIT_R2) {
-				swap_args = 1;
-				if (((src & REG_MASK) | 0x2) == SLJIT_R2)
-					src ^= 0x2;
-				if ((OFFS_REG(src) | 0x2) == SLJIT_R2)
-					src ^= TO_OFFS_REG(0x2);
-			}
-		}
-
-		FAIL_IF(c_fast_call_with_args(compiler, arg_types, stack_size, word_arg_count, swap_args));
-
-		compiler->scratches_offset += stack_size;
-		compiler->locals_offset += stack_size;
-
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-		compiler->skip_checks = 1;
-#endif
-		FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
-
-		compiler->scratches_offset -= stack_size;
-		compiler->locals_offset -= stack_size;
-
-		return post_call_with_args(compiler, arg_types, 0);
+	if ((type & 0xff) == SLJIT_CALL_REG_ARG) {
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_ijump(compiler, type, src, srcw);
 	}
-#endif
 
-	stack_size = cdecl_call_get_stack_size(compiler, arg_types, &word_arg_count);
-	FAIL_IF(cdecl_call_with_args(compiler, arg_types, stack_size, word_arg_count));
+	ADJUST_LOCAL_OFFSET(src, srcw);
+	CHECK_EXTRA_REGS(src, srcw, (void)0);
 
-	compiler->scratches_offset += stack_size;
-	compiler->locals_offset += stack_size;
+	if (src & SLJIT_MEM) {
+		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
+		src = TMP_REG1;
+		srcw = 0;
+	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	stack_size = call_get_stack_size(arg_types, &word_arg_count);
+	FAIL_IF(call_with_args(compiler, arg_types, stack_size, word_arg_count, src == TMP_REG1));
+
+	if (stack_size > 0 && src == SLJIT_MEM1(SLJIT_SP))
+		srcw += stack_size;
+
+	SLJIT_SKIP_CHECKS(compiler);
 	FAIL_IF(sljit_emit_ijump(compiler, type, src, srcw));
 
-	compiler->scratches_offset -= stack_size;
-	compiler->locals_offset -= stack_size;
-
 	return post_call_with_args(compiler, arg_types, stack_size);
 }
 
+static SLJIT_INLINE sljit_s32 emit_fmov_before_return(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 src, sljit_sw srcw)
+{
+	sljit_u8* inst;
+
+	if (compiler->options & SLJIT_ENTER_REG_ARG) {
+		if (src == SLJIT_FR0)
+			return SLJIT_SUCCESS;
+
+		SLJIT_SKIP_CHECKS(compiler);
+		return sljit_emit_fop1(compiler, op, SLJIT_RETURN_FREG, 0, src, srcw);
+	}
+
+	if (FAST_IS_REG(src)) {
+		FAIL_IF(emit_sse2_store(compiler, op & SLJIT_32, SLJIT_MEM1(SLJIT_SP), 0, src));
+
+		src = SLJIT_MEM1(SLJIT_SP);
+		srcw = 0;
+	} else {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+	}
+
+	inst = emit_x86_instruction(compiler, 1 | EX86_SSE2_OP1, 0, 0, src, srcw);
+	*inst = (op & SLJIT_32) ? FLDS : FLDL;
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fast_enter(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
 {
 	sljit_u8 *inst;
@@ -1404,6 +1197,88 @@
 	return SLJIT_SUCCESS;
 }
 
+/* --------------------------------------------------------------------- */
+/*  Other operations                                                     */
+/* --------------------------------------------------------------------- */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_u8* inst;
+	sljit_s32 i, next, reg_idx, offset;
+	sljit_u8 regs[2];
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	regs[0] = U8(REG_PAIR_FIRST(reg));
+	regs[1] = U8(REG_PAIR_SECOND(reg));
+
+	next = SSIZE_OF(sw);
+
+	if (!(type & SLJIT_MEM_STORE) && (regs[0] == (mem & REG_MASK) || regs[0] == OFFS_REG(mem))) {
+		if (regs[1] == (mem & REG_MASK) || regs[1] == OFFS_REG(mem)) {
+			/* None of them are virtual register so TMP_REG1 will not be used. */
+			EMIT_MOV(compiler, TMP_REG1, 0, OFFS_REG(mem), 0);
+
+			if (regs[1] == OFFS_REG(mem))
+				next = -SSIZE_OF(sw);
+
+			mem = (mem & ~OFFS_REG_MASK) | TO_OFFS_REG(TMP_REG1);
+		} else {
+			next = -SSIZE_OF(sw);
+
+			if (!(mem & OFFS_REG_MASK))
+				memw += SSIZE_OF(sw);
+		}
+	}
+
+	for (i = 0; i < 2; i++) {
+		reg_idx = next > 0 ? i : (i ^ 0x1);
+		reg = regs[reg_idx];
+
+		offset = -1;
+
+		if (reg >= SLJIT_R3 && reg <= SLJIT_S3) {
+			offset = (2 * SSIZE_OF(sw)) + ((reg) - SLJIT_R3) * SSIZE_OF(sw);
+			reg = TMP_REG1;
+
+			if (type & SLJIT_MEM_STORE)
+				EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(SLJIT_SP), offset);
+		}
+
+		if ((mem & OFFS_REG_MASK) && (reg_idx == 1)) {
+			inst = (sljit_u8*)ensure_buf(compiler, (sljit_uw)(1 + 4));
+			FAIL_IF(!inst);
+
+			INC_SIZE(4);
+
+			inst[0] = (type & SLJIT_MEM_STORE) ? MOV_rm_r : MOV_r_rm;
+			inst[1] = 0x44 | U8(reg_map[reg] << 3);
+			inst[2] = U8(memw << 6) | U8(reg_map[OFFS_REG(mem)] << 3) | reg_map[mem & REG_MASK];
+			inst[3] = sizeof(sljit_sw);
+		} else if (type & SLJIT_MEM_STORE) {
+			EMIT_MOV(compiler, mem, memw, reg, 0);
+		} else {
+			EMIT_MOV(compiler, reg, 0, mem, memw);
+		}
+
+		if (!(mem & OFFS_REG_MASK))
+			memw += next;
+
+		if (!(type & SLJIT_MEM_STORE) && offset != -1)
+			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), offset, TMP_REG1, 0);
+	}
+
+	return SLJIT_SUCCESS;
+}
+
 static sljit_s32 skip_frames_before_return(struct sljit_compiler *compiler)
 {
 	sljit_sw size;
diff --git a/src/sljit/sljitNativeX86_64.c b/src/sljit/sljitNativeX86_64.c
index f37df6e..4e938ff 100644
--- a/src/sljit/sljitNativeX86_64.c
+++ b/src/sljit/sljitNativeX86_64.c
@@ -101,34 +101,38 @@
 	/* Calculate size of b. */
 	inst_size += 1; /* mod r/m byte. */
 	if (b & SLJIT_MEM) {
-		if (!(b & OFFS_REG_MASK)) {
-			if (NOT_HALFWORD(immb)) {
-				PTR_FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immb));
-				immb = 0;
-				if (b & REG_MASK)
-					b |= TO_OFFS_REG(TMP_REG2);
-				else
-					b |= TMP_REG2;
-			}
-			else if (reg_lmap[b & REG_MASK] == 4)
-				b |= TO_OFFS_REG(SLJIT_SP);
+		if (!(b & OFFS_REG_MASK) && NOT_HALFWORD(immb)) {
+			PTR_FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immb));
+			immb = 0;
+			if (b & REG_MASK)
+				b |= TO_OFFS_REG(TMP_REG2);
+			else
+				b |= TMP_REG2;
 		}
 
 		if (!(b & REG_MASK))
 			inst_size += 1 + sizeof(sljit_s32); /* SIB byte required to avoid RIP based addressing. */
 		else {
-			if (reg_map[b & REG_MASK] >= 8)
-				rex |= REX_B;
-
-			if (immb != 0 && (!(b & OFFS_REG_MASK) || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SP))) {
+			if (immb != 0 && !(b & OFFS_REG_MASK)) {
 				/* Immediate operand. */
 				if (immb <= 127 && immb >= -128)
 					inst_size += sizeof(sljit_s8);
 				else
 					inst_size += sizeof(sljit_s32);
 			}
-			else if (reg_lmap[b & REG_MASK] == 5)
-				inst_size += sizeof(sljit_s8);
+			else if (reg_lmap[b & REG_MASK] == 5) {
+				/* Swap registers if possible. */
+				if ((b & OFFS_REG_MASK) && (immb & 0x3) == 0 && reg_lmap[OFFS_REG(b)] != 5)
+					b = SLJIT_MEM | OFFS_REG(b) | TO_OFFS_REG(b & REG_MASK);
+				else
+					inst_size += sizeof(sljit_s8);
+			}
+
+			if (reg_map[b & REG_MASK] >= 8)
+				rex |= REX_B;
+
+			if (reg_lmap[b & REG_MASK] == 4 && !(b & OFFS_REG_MASK))
+				b |= TO_OFFS_REG(SLJIT_SP);
 
 			if (b & OFFS_REG_MASK) {
 				inst_size += 1; /* SIB byte. */
@@ -153,9 +157,9 @@
 				inst_size += 4;
 		}
 		else if (flags & EX86_SHIFT_INS) {
-			imma &= compiler->mode32 ? 0x1f : 0x3f;
+			SLJIT_ASSERT(imma <= (compiler->mode32 ? 0x1f : 0x3f));
 			if (imma != 1) {
-				inst_size ++;
+				inst_size++;
 				flags |= EX86_BYTE_ARG;
 			}
 		} else if (flags & EX86_BYTE_ARG)
@@ -223,7 +227,7 @@
 	} else if (b & REG_MASK) {
 		reg_lmap_b = reg_lmap[b & REG_MASK];
 
-		if (!(b & OFFS_REG_MASK) || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SP) || reg_lmap_b == 5) {
+		if (!(b & OFFS_REG_MASK) || (b & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SP)) {
 			if (immb != 0 || reg_lmap_b == 5) {
 				if (immb <= 127 && immb >= -128)
 					*buf_ptr |= 0x40;
@@ -248,8 +252,14 @@
 			}
 		}
 		else {
+			if (reg_lmap_b == 5)
+				*buf_ptr |= 0x40;
+
 			*buf_ptr++ |= 0x04;
 			*buf_ptr++ = U8(reg_lmap_b | (reg_lmap[OFFS_REG(b)] << 3) | (immb << 6));
+
+			if (reg_lmap_b == 5)
+				*buf_ptr++ = 0;
 		}
 	}
 	else {
@@ -366,7 +376,7 @@
 {
 	sljit_uw size;
 	sljit_s32 word_arg_count = 0;
-	sljit_s32 saved_arg_count = 0;
+	sljit_s32 saved_arg_count = SLJIT_KEPT_SAVEDS_COUNT(options);
 	sljit_s32 saved_regs_size, tmp, i;
 #ifdef _WIN64
 	sljit_s32 saved_float_regs_size;
@@ -379,16 +389,19 @@
 	CHECK(check_sljit_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size));
 	set_emit_enter(compiler, options, arg_types, scratches, saveds, fscratches, fsaveds, local_size);
 
+	if (options & SLJIT_ENTER_REG_ARG)
+		arg_types = 0;
+
 	/* Emit ENDBR64 at function entry if needed.  */
 	FAIL_IF(emit_endbranch(compiler));
 
 	compiler->mode32 = 0;
 
 	/* Including the return address saved by the call instruction. */
-	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - saved_arg_count, 1);
 
 	tmp = SLJIT_S0 - saveds;
-	for (i = SLJIT_S0; i > tmp; i--) {
+	for (i = SLJIT_S0 - saved_arg_count; i > tmp; i--) {
 		size = reg_map[i] >= 8 ? 2 : 1;
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 		FAIL_IF(!inst);
@@ -561,15 +574,15 @@
 #endif /* _WIN64 */
 
 	/* Including the return address saved by the call instruction. */
-	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds, 1);
+	saved_regs_size = GET_SAVED_REGISTERS_SIZE(scratches, saveds - SLJIT_KEPT_SAVEDS_COUNT(options), 1);
 	compiler->local_size = ((local_size + saved_regs_size + 0xf) & ~0xf) - saved_regs_size;
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler)
+static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 is_return_to)
 {
 	sljit_uw size;
-	sljit_s32 i, tmp;
+	sljit_s32 local_size, i, tmp;
 	sljit_u8 *inst;
 #ifdef _WIN64
 	sljit_s32 saved_float_regs_offset;
@@ -598,30 +611,21 @@
 			*inst = MOVAPS_x_xm;
 			saved_float_regs_offset += 16;
 		}
+
+		compiler->mode32 = 0;
 	}
 #endif /* _WIN64 */
 
-	if (compiler->local_size > 0) {
-		if (compiler->local_size <= 127) {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
-			FAIL_IF(!inst);
-			INC_SIZE(4);
-			*inst++ = REX_W;
-			*inst++ = GROUP_BINARY_83;
-			*inst++ = MOD_REG | ADD | 4;
-			*inst = U8(compiler->local_size);
-		}
-		else {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 7);
-			FAIL_IF(!inst);
-			INC_SIZE(7);
-			*inst++ = REX_W;
-			*inst++ = GROUP_BINARY_81;
-			*inst++ = MOD_REG | ADD | 4;
-			sljit_unaligned_store_s32(inst, compiler->local_size);
-		}
+	local_size = compiler->local_size;
+
+	if (is_return_to && compiler->scratches < SLJIT_FIRST_SAVED_REG && (compiler->saveds == SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
+		local_size += SSIZE_OF(sw);
+		is_return_to = 0;
 	}
 
+	if (local_size > 0)
+		BINARY_IMM32(ADD, local_size, SLJIT_SP, 0);
+
 	tmp = compiler->scratches;
 	for (i = SLJIT_FIRST_SAVED_REG; i <= tmp; i++) {
 		size = reg_map[i] >= 8 ? 2 : 1;
@@ -633,8 +637,8 @@
 		POP_REG(reg_lmap[i]);
 	}
 
-	tmp = compiler->saveds < SLJIT_NUMBER_OF_SAVED_REGISTERS ? (SLJIT_S0 + 1 - compiler->saveds) : SLJIT_FIRST_SAVED_REG;
-	for (i = tmp; i <= SLJIT_S0; i++) {
+	tmp = SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options);
+	for (i = SLJIT_S0 + 1 - compiler->saveds; i <= tmp; i++) {
 		size = reg_map[i] >= 8 ? 2 : 1;
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 		FAIL_IF(!inst);
@@ -644,6 +648,9 @@
 		POP_REG(reg_lmap[i]);
 	}
 
+	if (is_return_to)
+		BINARY_IMM32(ADD, sizeof(sljit_sw), SLJIT_SP, 0);
+
 	return SLJIT_SUCCESS;
 }
 
@@ -654,7 +661,9 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_return_void(compiler));
 
-	FAIL_IF(emit_stack_frame_release(compiler));
+	compiler->mode32 = 0;
+
+	FAIL_IF(emit_stack_frame_release(compiler, 0));
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
 	FAIL_IF(!inst);
@@ -663,6 +672,28 @@
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_return_to(struct sljit_compiler *compiler,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_return_to(compiler, src, srcw));
+
+	compiler->mode32 = 0;
+
+	if ((src & SLJIT_MEM) || (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options)))) {
+		ADJUST_LOCAL_OFFSET(src, srcw);
+
+		EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
+		src = TMP_REG2;
+		srcw = 0;
+	}
+
+	FAIL_IF(emit_stack_frame_release(compiler, 1));
+
+	SLJIT_SKIP_CHECKS(compiler);
+	return sljit_emit_ijump(compiler, SLJIT_JUMP, src, srcw);
+}
+
 /* --------------------------------------------------------------------- */
 /*  Call / return instructions                                           */
 /* --------------------------------------------------------------------- */
@@ -786,17 +817,15 @@
 
 	compiler->mode32 = 0;
 
-	PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		PTR_FAIL_IF(call_with_args(compiler, arg_types, NULL));
 
 	if (type & SLJIT_CALL_RETURN) {
-		PTR_FAIL_IF(emit_stack_frame_release(compiler));
+		PTR_FAIL_IF(emit_stack_frame_release(compiler, 0));
 		type = SLJIT_JUMP | (type & SLJIT_REWRITABLE_JUMP);
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_jump(compiler, type);
 }
 
@@ -816,22 +845,21 @@
 	}
 
 	if (type & SLJIT_CALL_RETURN) {
-		if (src >= SLJIT_FIRST_SAVED_REG && src <= SLJIT_S0) {
+		if (src >= SLJIT_FIRST_SAVED_REG && src <= (SLJIT_S0 - SLJIT_KEPT_SAVEDS_COUNT(compiler->options))) {
 			EMIT_MOV(compiler, TMP_REG2, 0, src, srcw);
 			src = TMP_REG2;
 		}
 
-		FAIL_IF(emit_stack_frame_release(compiler));
-		type = SLJIT_JUMP;
+		FAIL_IF(emit_stack_frame_release(compiler, 0));
 	}
 
-	FAIL_IF(call_with_args(compiler, arg_types, &src));
+	if ((type & 0xff) != SLJIT_CALL_REG_ARG)
+		FAIL_IF(call_with_args(compiler, arg_types, &src));
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	if (type & SLJIT_CALL_RETURN)
+		type = SLJIT_JUMP;
 
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_ijump(compiler, type, src, srcw);
 }
 
@@ -907,9 +935,89 @@
 }
 
 /* --------------------------------------------------------------------- */
-/*  Extend input                                                         */
+/*  Other operations                                                     */
 /* --------------------------------------------------------------------- */
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 reg,
+	sljit_s32 mem, sljit_sw memw)
+{
+	sljit_u8* inst;
+	sljit_s32 i, next, reg_idx;
+	sljit_u8 regs[2];
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_mem(compiler, type, reg, mem, memw));
+
+	if (!(reg & REG_PAIR_MASK))
+		return sljit_emit_mem_unaligned(compiler, type, reg, mem, memw);
+
+	ADJUST_LOCAL_OFFSET(mem, memw);
+
+	compiler->mode32 = 0;
+
+	if ((mem & REG_MASK) == 0) {
+		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, memw);
+
+		mem = SLJIT_MEM1(TMP_REG1);
+		memw = 0;
+	} else if (!(mem & OFFS_REG_MASK) && ((memw < HALFWORD_MIN) || (memw > HALFWORD_MAX - SSIZE_OF(sw)))) {
+		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, memw);
+
+		mem = SLJIT_MEM2(mem & REG_MASK, TMP_REG1);
+		memw = 0;
+	}
+
+	regs[0] = U8(REG_PAIR_FIRST(reg));
+	regs[1] = U8(REG_PAIR_SECOND(reg));
+
+	next = SSIZE_OF(sw);
+
+	if (!(type & SLJIT_MEM_STORE) && (regs[0] == (mem & REG_MASK) || regs[0] == OFFS_REG(mem))) {
+		if (regs[1] == (mem & REG_MASK) || regs[1] == OFFS_REG(mem)) {
+			/* Base and offset cannot be TMP_REG1. */
+			EMIT_MOV(compiler, TMP_REG1, 0, OFFS_REG(mem), 0);
+
+			if (regs[1] == OFFS_REG(mem))
+				next = -SSIZE_OF(sw);
+
+			mem = (mem & ~OFFS_REG_MASK) | TO_OFFS_REG(TMP_REG1);
+		} else {
+			next = -SSIZE_OF(sw);
+
+			if (!(mem & OFFS_REG_MASK))
+				memw += SSIZE_OF(sw);
+		}
+	}
+
+	for (i = 0; i < 2; i++) {
+		reg_idx = next > 0 ? i : (i ^ 0x1);
+		reg = regs[reg_idx];
+
+		if ((mem & OFFS_REG_MASK) && (reg_idx == 1)) {
+			inst = (sljit_u8*)ensure_buf(compiler, (sljit_uw)(1 + 5));
+			FAIL_IF(!inst);
+
+			INC_SIZE(5);
+
+			inst[0] = U8(REX_W | ((reg_map[reg] >= 8) ? REX_R : 0) | ((reg_map[mem & REG_MASK] >= 8) ? REX_B : 0) | ((reg_map[OFFS_REG(mem)] >= 8) ? REX_X : 0));
+			inst[1] = (type & SLJIT_MEM_STORE) ? MOV_rm_r : MOV_r_rm;
+			inst[2] = 0x44 | U8(reg_lmap[reg] << 3);
+			inst[3] = U8(memw << 6) | U8(reg_lmap[OFFS_REG(mem)] << 3) | reg_lmap[mem & REG_MASK];
+			inst[4] = sizeof(sljit_sw);
+		} else if (type & SLJIT_MEM_STORE) {
+			EMIT_MOV(compiler, mem, memw, reg, 0);
+		} else {
+			EMIT_MOV(compiler, reg, 0, mem, memw);
+		}
+
+		if (!(mem & OFFS_REG_MASK))
+			memw += next;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
 static sljit_s32 emit_mov_int(struct sljit_compiler *compiler, sljit_s32 sign,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
diff --git a/src/sljit/sljitNativeX86_common.c b/src/sljit/sljitNativeX86_common.c
index c7dd9be..651942b 100644
--- a/src/sljit/sljitNativeX86_common.c
+++ b/src/sljit/sljitNativeX86_common.c
@@ -26,11 +26,7 @@
 
 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 {
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	return "x86" SLJIT_CPUINFO " ABI:fastcall";
-#else
 	return "x86" SLJIT_CPUINFO;
-#endif
 }
 
 /*
@@ -78,10 +74,7 @@
 
 #define CHECK_EXTRA_REGS(p, w, do) \
 	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
-		if (p <= compiler->scratches) \
-			w = compiler->scratches_offset + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
-		else \
-			w = compiler->locals_offset + ((p) - SLJIT_S2) * SSIZE_OF(sw); \
+		w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
 		p = SLJIT_MEM1(SLJIT_SP); \
 		do; \
 	}
@@ -181,6 +174,7 @@
 #define AND_rm_r	0x21
 #define ANDPD_x_xm	0x54
 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
+#define BSF_r_rm	(/* GROUP_0F */ 0xbc)
 #define CALL_i32	0xe8
 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
 #define CDQ		0x99
@@ -194,6 +188,8 @@
 #define CVTTSD2SI_r_xm	0x2c
 #define DIV		(/* GROUP_F7 */ 6 << 3)
 #define DIVSD_x_xm	0x5e
+#define FLDS		0xd9
+#define FLDL		0xdd
 #define FSTPS		0xd9
 #define FSTPD		0xdd
 #define INT3		0xcc
@@ -209,6 +205,7 @@
 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
 #define LEA_r_m		0x8d
 #define LOOP_i8		0xe2
+#define LZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
 #define MOV_r_rm	0x8b
 #define MOV_r_i32	0xb8
 #define MOV_rm_r	0x89
@@ -242,6 +239,8 @@
 #define PUSH_r		0x50
 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
 #define PUSHF		0x9c
+#define ROL		(/* SHIFT */ 0 << 3)
+#define ROR		(/* SHIFT */ 1 << 3)
 #define RET_near	0xc3
 #define RET_i16		0xc2
 #define SBB		(/* BINARY */ 3 << 3)
@@ -250,6 +249,8 @@
 #define SBB_rm_r	0x19
 #define SAR		(/* SHIFT */ 7 << 3)
 #define SHL		(/* SHIFT */ 4 << 3)
+#define SHLD		(/* GROUP_0F */ 0xa5)
+#define SHRD		(/* GROUP_0F */ 0xad)
 #define SHR		(/* SHIFT */ 5 << 3)
 #define SUB		(/* BINARY */ 5 << 3)
 #define SUB_EAX_i32	0x2d
@@ -258,6 +259,7 @@
 #define SUBSD_x_xm	0x5c
 #define TEST_EAX_i32	0xa9
 #define TEST_rm_r	0x85
+#define TZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
 #define UCOMISD_x_xm	0x2e
 #define UNPCKLPD_x_xm	0x14
 #define XCHG_EAX_r	0x90
@@ -269,6 +271,7 @@
 #define XORPD_x_xm	0x57
 
 #define GROUP_0F	0x0f
+#define GROUP_F3	0xf3
 #define GROUP_F7	0xf7
 #define GROUP_FF	0xff
 #define GROUP_BINARY_81	0x81
@@ -290,10 +293,15 @@
 /* Multithreading does not affect these static variables, since they store
    built-in CPU features. Therefore they can be overwritten by different threads
    if they detect the CPU features in the same time. */
+#define CPU_FEATURE_DETECTED		0x001
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-static sljit_s32 cpu_has_sse2 = -1;
+#define CPU_FEATURE_SSE2		0x002
 #endif
-static sljit_s32 cpu_has_cmov = -1;
+#define CPU_FEATURE_LZCNT		0x004
+#define CPU_FEATURE_TZCNT		0x008
+#define CPU_FEATURE_CMOV		0x010
+
+static sljit_u32 cpu_feature_list = 0;
 
 #ifdef _WIN32_WCE
 #include <cmnintrin.h>
@@ -326,18 +334,65 @@
 
 static void get_cpu_features(void)
 {
-	sljit_u32 features;
+	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
+	sljit_u32 value;
 
 #if defined(_MSC_VER) && _MSC_VER >= 1400
 
 	int CPUInfo[4];
+
+	__cpuid(CPUInfo, 0);
+	if (CPUInfo[0] >= 7) {
+		__cpuidex(CPUInfo, 7, 0);
+		if (CPUInfo[1] & 0x8)
+			feature_list |= CPU_FEATURE_TZCNT;
+	}
+
+	__cpuid(CPUInfo, (int)0x80000001);
+	if (CPUInfo[2] & 0x20)
+		feature_list |= CPU_FEATURE_LZCNT;
+
 	__cpuid(CPUInfo, 1);
-	features = (sljit_u32)CPUInfo[3];
+	value = (sljit_u32)CPUInfo[3];
 
 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
 
 	/* AT&T syntax. */
 	__asm__ (
+		"movl $0x0, %%eax\n"
+		"lzcnt %%eax, %%eax\n"
+		"setnz %%al\n"
+		"movl %%eax, %0\n"
+		: "=g" (value)
+		:
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		: "eax"
+#else
+		: "rax"
+#endif
+	);
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_LZCNT;
+
+	__asm__ (
+		"movl $0x0, %%eax\n"
+		"tzcnt %%eax, %%eax\n"
+		"setnz %%al\n"
+		"movl %%eax, %0\n"
+		: "=g" (value)
+		:
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		: "eax"
+#else
+		: "rax"
+#endif
+	);
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_TZCNT;
+
+	__asm__ (
 		"movl $0x1, %%eax\n"
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		/* On x86-32, there is no red zone, so this
@@ -349,7 +404,7 @@
 		"pop %%ebx\n"
 #endif
 		"movl %%edx, %0\n"
-		: "=g" (features)
+		: "=g" (value)
 		:
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		: "%eax", "%ecx", "%edx"
@@ -362,46 +417,82 @@
 
 	/* Intel syntax. */
 	__asm {
+		mov eax, 0
+		lzcnt eax, eax
+		setnz al
+		mov value, eax
+	}
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_LZCNT;
+
+	__asm {
+		mov eax, 0
+		tzcnt eax, eax
+		setnz al
+		mov value, eax
+	}
+
+	if (value & 0x1)
+		feature_list |= CPU_FEATURE_TZCNT;
+
+	__asm {
 		mov eax, 1
 		cpuid
-		mov features, edx
+		mov value, edx
 	}
 
 #endif /* _MSC_VER && _MSC_VER >= 1400 */
 
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-	cpu_has_sse2 = (features >> 26) & 0x1;
+	if (value & 0x4000000)
+		feature_list |= CPU_FEATURE_SSE2;
 #endif
-	cpu_has_cmov = (features >> 15) & 0x1;
+	if (value & 0x8000)
+		feature_list |= CPU_FEATURE_CMOV;
+
+	cpu_feature_list = feature_list;
 }
 
 static sljit_u8 get_jump_code(sljit_uw type)
 {
 	switch (type) {
 	case SLJIT_EQUAL:
-	case SLJIT_EQUAL_F64:
+	case SLJIT_F_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
+	case SLJIT_ORDERED_EQUAL: /* Not supported. */
 		return 0x84 /* je */;
 
 	case SLJIT_NOT_EQUAL:
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL: /* Not supported. */
 		return 0x85 /* jne */;
 
 	case SLJIT_LESS:
 	case SLJIT_CARRY:
-	case SLJIT_LESS_F64:
+	case SLJIT_F_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_UNORDERED_OR_GREATER:
 		return 0x82 /* jc */;
 
 	case SLJIT_GREATER_EQUAL:
 	case SLJIT_NOT_CARRY:
-	case SLJIT_GREATER_EQUAL_F64:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
 		return 0x83 /* jae */;
 
 	case SLJIT_GREATER:
-	case SLJIT_GREATER_F64:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_ORDERED_GREATER:
 		return 0x87 /* jnbe */;
 
 	case SLJIT_LESS_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		return 0x86 /* jbe */;
 
 	case SLJIT_SIG_LESS:
@@ -422,10 +513,10 @@
 	case SLJIT_NOT_OVERFLOW:
 		return 0x81 /* jno */;
 
-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
 		return 0x8a /* jp */;
 
-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
 		return 0x8b /* jpo */;
 	}
 	return 0;
@@ -449,13 +540,13 @@
 	else
 		label_addr = jump->u.target - (sljit_uw)executable_offset;
 
-	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
-
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
 		return generate_far_jump_code(jump, code_ptr);
 #endif
 
+	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
+
 	if (type == SLJIT_JUMP) {
 		if (short_jump)
 			*code_ptr++ = JMP_i8;
@@ -581,32 +672,33 @@
 
 	jump = compiler->jumps;
 	while (jump) {
-		jump_addr = jump->addr + (sljit_uw)executable_offset;
+		if (jump->flags & (PATCH_MB | PATCH_MW)) {
+			if (jump->flags & JUMP_LABEL)
+				jump_addr = jump->u.label->addr;
+			else
+				jump_addr = jump->u.target;
 
-		if (jump->flags & PATCH_MB) {
-			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
-			*(sljit_u8*)jump->addr = U8(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
-		} else if (jump->flags & PATCH_MW) {
-			if (jump->flags & JUMP_LABEL) {
+			jump_addr -= jump->addr + (sljit_uw)executable_offset;
+
+			if (jump->flags & PATCH_MB) {
+				jump_addr -= sizeof(sljit_s8);
+				SLJIT_ASSERT((sljit_sw)jump_addr >= -128 && (sljit_sw)jump_addr <= 127);
+				*(sljit_u8*)jump->addr = U8(jump_addr);
+			} else {
+				jump_addr -= sizeof(sljit_s32);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
+				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump_addr);
 #else
-				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
-				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
-#endif
-			}
-			else {
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
-#else
-				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
-				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
+				SLJIT_ASSERT((sljit_sw)jump_addr >= HALFWORD_MIN && (sljit_sw)jump_addr <= HALFWORD_MAX);
+				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)jump_addr);
 #endif
 			}
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		else if (jump->flags & PATCH_MD)
-			sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump->u.label->addr);
+		else if (jump->flags & PATCH_MD) {
+				SLJIT_ASSERT(jump->flags & JUMP_LABEL);
+				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump->u.label->addr);
+		}
 #endif
 
 		jump = jump->next;
@@ -647,9 +739,9 @@
 #ifdef SLJIT_IS_FPU_AVAILABLE
 		return SLJIT_IS_FPU_AVAILABLE;
 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-		if (cpu_has_sse2 == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_sse2;
+		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
 #else /* SLJIT_DETECT_SSE2 */
 		return 1;
 #endif /* SLJIT_DETECT_SSE2 */
@@ -657,31 +749,57 @@
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	case SLJIT_HAS_VIRTUAL_REGISTERS:
 		return 1;
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 
 	case SLJIT_HAS_CLZ:
-	case SLJIT_HAS_CMOV:
-		if (cpu_has_cmov == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_cmov;
 
+		return (cpu_feature_list & CPU_FEATURE_LZCNT) ? 1 : 2;
+
+	case SLJIT_HAS_CTZ:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+
+		return (cpu_feature_list & CPU_FEATURE_TZCNT) ? 1 : 2;
+
+	case SLJIT_HAS_CMOV:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+		return (cpu_feature_list & CPU_FEATURE_CMOV) != 0;
+
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_PREFETCH:
 		return 1;
 
 	case SLJIT_HAS_SSE2:
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-		if (cpu_has_sse2 == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_sse2;
-#else
+		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
+#else /* !SLJIT_DETECT_SSE2 */
 		return 1;
-#endif
+#endif /* SLJIT_DETECT_SSE2 */
 
 	default:
 		return 0;
 	}
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	if (type < SLJIT_UNORDERED || type > SLJIT_ORDERED_LESS_EQUAL)
+		return 0;
+
+	switch (type) {
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+		return 0;
+	}
+
+	return 1;
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */
@@ -1385,47 +1503,75 @@
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 static const sljit_sw emit_clz_arg = 32 + 31;
+static const sljit_sw emit_ctz_arg = 32;
 #endif
 
-static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_u8* inst;
 	sljit_s32 dst_r;
+	sljit_sw max;
 
-	SLJIT_UNUSED_ARG(op_flags);
-
-	if (cpu_has_cmov == -1)
+	if (cpu_feature_list == 0)
 		get_cpu_features();
 
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
+	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
+		/* Group prefix added separately. */
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
+		FAIL_IF(!inst);
+		INC_SIZE(1);
+		*inst++ = GROUP_F3;
+
+		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
+		FAIL_IF(!inst);
+		*inst++ = GROUP_0F;
+		*inst = is_clz ? LZCNT_r_rm : TZCNT_r_rm;
+
+		if (dst & SLJIT_MEM)
+			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
+		return SLJIT_SUCCESS;
+	}
+
 	inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
 	FAIL_IF(!inst);
 	*inst++ = GROUP_0F;
-	*inst = BSR_r_rm;
+	*inst = is_clz ? BSR_r_rm : BSF_r_rm;
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	if (cpu_has_cmov) {
+	max = is_clz ? (32 + 31) : 32;
+
+	if (cpu_feature_list & CPU_FEATURE_CMOV) {
 		if (dst_r != TMP_REG1) {
-			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
+			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, max);
 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
 		}
 		else
-			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);
+			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), is_clz ? (sljit_sw)&emit_clz_arg : (sljit_sw)&emit_ctz_arg);
 
 		FAIL_IF(!inst);
 		*inst++ = GROUP_0F;
 		*inst = CMOVE_r_rm;
 	}
 	else
-		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));
+		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
 
-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
+	if (is_clz) {
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
+		FAIL_IF(!inst);
+		*(inst + 1) |= XOR;
+	}
 #else
-	if (cpu_has_cmov) {
-		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_32) ? (64 + 63) : (32 + 31));
+	if (is_clz)
+		max = compiler->mode32 ? (32 + 31) : (64 + 63);
+	else
+		max = compiler->mode32 ? 32 : 64;
+
+	if (cpu_feature_list & CPU_FEATURE_CMOV) {
+		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
 
 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
 		FAIL_IF(!inst);
@@ -1433,14 +1579,15 @@
 		*inst = CMOVE_r_rm;
 	}
 	else
-		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_32) ? (64 + 63) : (32 + 31)));
+		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
 
-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_32) ? 63 : 31, dst_r, 0);
+	if (is_clz) {
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, max >> 1, dst_r, 0);
+		FAIL_IF(!inst);
+		*(inst + 1) |= XOR;
+	}
 #endif
 
-	FAIL_IF(!inst);
-	*(inst + 1) |= XOR;
-
 	if (dst & SLJIT_MEM)
 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 	return SLJIT_SUCCESS;
@@ -1578,7 +1725,8 @@
 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
 
 	case SLJIT_CLZ:
-		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
+	case SLJIT_CTZ:
+		return emit_clz_ctz(compiler, (op == SLJIT_CLZ), dst, dstw, src, srcw);
 	}
 
 	return SLJIT_SUCCESS;
@@ -2116,6 +2264,9 @@
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w)
 {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	sljit_s32 mode32;
+#endif
 	sljit_u8* inst;
 
 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
@@ -2155,41 +2306,62 @@
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 		FAIL_IF(!inst);
 		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 	}
-	else if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
+
+	if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
 		if (src1 != dst)
 			EMIT_MOV(compiler, dst, 0, src1, src1w);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		mode32 = compiler->mode32;
+		compiler->mode32 = 0;
+#endif
 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = mode32;
+#endif
 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
 		FAIL_IF(!inst);
 		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
-	}
-	else {
-		/* This case is complex since ecx itself may be used for
-		   addressing, and this case must be supported as well. */
-		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
-		FAIL_IF(!inst);
-		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
-#else
-		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
-		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
-		FAIL_IF(!inst);
-		*inst |= mode;
-		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = 0;
 #endif
-		if (dst != TMP_REG1)
-			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = mode32;
+#endif
+		return SLJIT_SUCCESS;
 	}
 
+	/* This case is complex since ecx itself may be used for
+	   addressing, and this case must be supported as well. */
+	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
+#else /* !SLJIT_CONFIG_X86_32 */
+	mode32 = compiler->mode32;
+	compiler->mode32 = 0;
+	EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
+	compiler->mode32 = mode32;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
+	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+	FAIL_IF(!inst);
+	*inst |= mode;
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
+#else
+	compiler->mode32 = 0;
+	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
+	compiler->mode32 = mode32;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	if (dst != TMP_REG1)
+		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+
 	return SLJIT_SUCCESS;
 }
 
@@ -2202,12 +2374,13 @@
 	/* The CPU does not set flags if the shift count is 0. */
 	if (src2 & SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
+		src2w &= compiler->mode32 ? 0x1f : 0x3f;
+#else /* !SLJIT_CONFIG_X86_64 */
+		src2w &= 0x1f;
+#endif /* SLJIT_CONFIG_X86_64 */
+		if (src2w != 0)
 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
-#else
-		if ((src2w & 0x1f) != 0)
-			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
-#endif
+
 		if (!set_flags)
 			return emit_mov(compiler, dst, dstw, src1, src1w);
 		/* OR dst, src, 0 */
@@ -2289,14 +2462,23 @@
 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_SHL:
+	case SLJIT_MSHL:
 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_LSHR:
+	case SLJIT_MLSHR:
 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_ASHR:
+	case SLJIT_MASHR:
 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
 			dst, dstw, src1, src1w, src2, src2w);
+	case SLJIT_ROTL:
+		return emit_shift_with_flags(compiler, ROL, 0,
+			dst, dstw, src1, src1w, src2, src2w);
+	case SLJIT_ROTR:
+		return emit_shift_with_flags(compiler, ROR, 0,
+			dst, dstw, src1, src1w, src2, src2w);
 	}
 
 	return SLJIT_SUCCESS;
@@ -2312,10 +2494,7 @@
 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
 
 	if (opcode != SLJIT_SUB && opcode != SLJIT_AND) {
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-			|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-		compiler->skip_checks = 1;
-#endif
+		SLJIT_SKIP_CHECKS(compiler);
 		return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
 	}
 
@@ -2334,6 +2513,122 @@
 	return emit_test_binary(compiler, src1, src1w, src2, src2w);
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 src_dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 restore_ecx = 0;
+	sljit_s32 is_rotate, is_left;
+	sljit_u8* inst;
+	sljit_sw dstw = 0;
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	sljit_s32 tmp2 = SLJIT_MEM1(SLJIT_SP);
+#else /* !SLJIT_CONFIG_X86_32 */
+	sljit_s32 tmp2 = TMP_REG2;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_shift_into(compiler, op, src_dst, src1, src1w, src2, src2w));
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	CHECK_EXTRA_REGS(src1, src1w, (void)0);
+	CHECK_EXTRA_REGS(src2, src2w, (void)0);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	compiler->mode32 = op & SLJIT_32;
+#endif
+
+	if (src2 & SLJIT_IMM) {
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		src2w &= 0x1f;
+#else /* !SLJIT_CONFIG_X86_32 */
+		src2w &= (op & SLJIT_32) ? 0x1f : 0x3f;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+		if (src2w == 0)
+			return SLJIT_SUCCESS;
+	}
+
+	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
+
+	is_rotate = (src_dst == src1);
+	CHECK_EXTRA_REGS(src_dst, dstw, (void)0);
+
+	if (is_rotate)
+		return emit_shift(compiler, is_left ? ROL : ROR, src_dst, dstw, src1, src1w, src2, src2w);
+
+	if ((src2 & SLJIT_IMM) || src2 == SLJIT_PREF_SHIFT_REG) {
+		if (!FAST_IS_REG(src1)) {
+			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+			src1 = TMP_REG1;
+		}
+	} else if (FAST_IS_REG(src1)) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = 0;
+#endif
+		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = op & SLJIT_32;
+#endif
+		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
+
+		if (src1 == SLJIT_PREF_SHIFT_REG)
+			src1 = TMP_REG1;
+
+		if (src_dst == SLJIT_PREF_SHIFT_REG)
+			src_dst = TMP_REG1;
+
+		restore_ecx = 1;
+	} else {
+		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = 0;
+#endif
+		EMIT_MOV(compiler, tmp2, 0, SLJIT_PREF_SHIFT_REG, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = op & SLJIT_32;
+#endif
+		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
+
+		src1 = TMP_REG1;
+
+		if (src_dst == SLJIT_PREF_SHIFT_REG) {
+			src_dst = tmp2;
+			SLJIT_ASSERT(dstw == 0);
+		}
+
+		restore_ecx = 2;
+	}
+
+	inst = emit_x86_instruction(compiler, 2, src1, 0, src_dst, dstw);
+	FAIL_IF(!inst);
+	inst[0] = GROUP_0F;
+
+	if (src2 & SLJIT_IMM) {
+		inst[1] = U8((is_left ? SHLD : SHRD) - 1);
+
+		/* Immedate argument is added separately. */
+		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
+		FAIL_IF(!inst);
+		INC_SIZE(1);
+		*inst = U8(src2w);
+	} else
+		inst[1] = U8(is_left ? SHLD : SHRD);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	compiler->mode32 = 0;
+#endif
+
+	if (restore_ecx == 1)
+		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
+	if (restore_ecx == 2)
+		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, tmp2, 0);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 src, sljit_sw srcw)
 {
@@ -2516,6 +2811,19 @@
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w)
 {
+	switch (GET_FLAG_TYPE(op)) {
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER:
+	case SLJIT_ORDERED_LESS_EQUAL:
+		if (!FAST_IS_REG(src2)) {
+			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
+			src2 = TMP_FREG;
+		}
+
+		return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_32), src2, src1, src1w);
+	}
+
 	if (!FAST_IS_REG(src1)) {
 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
 		src1 = TMP_FREG;
@@ -2769,7 +3077,6 @@
 	ADJUST_LOCAL_OFFSET(dst, dstw);
 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
 
-	type &= 0xff;
 	/* setcc = jcc + 0x10. */
 	cond_set = U8(get_jump_code((sljit_uw)type) + 0x10);
 
@@ -2813,10 +3120,7 @@
 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
 	}
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
 
 #else
@@ -2839,10 +3143,10 @@
 		}
 
 		/* Low byte is not accessible. */
-		if (cpu_has_cmov == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
 
-		if (cpu_has_cmov) {
+		if (cpu_feature_list & CPU_FEATURE_CMOV) {
 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
 			/* a xor reg, reg operation would overwrite the flags. */
 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
@@ -2927,10 +3231,7 @@
 	if (GET_OPCODE(op) < SLJIT_ADD)
 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
 
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
-		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	compiler->skip_checks = 1;
-#endif
+	SLJIT_SKIP_CHECKS(compiler);
 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
 #endif /* SLJIT_CONFIG_X86_64 */
 }
@@ -2945,7 +3246,7 @@
 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	dst_reg &= ~SLJIT_32;
+	type &= ~SLJIT_32;
 
 	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV) || (dst_reg >= SLJIT_R3 && dst_reg <= SLJIT_S3))
 		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
@@ -2958,8 +3259,8 @@
 	CHECK_EXTRA_REGS(src, srcw, (void)0);
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	compiler->mode32 = dst_reg & SLJIT_32;
-	dst_reg &= ~SLJIT_32;
+	compiler->mode32 = type & SLJIT_32;
+	type &= ~SLJIT_32;
 #endif
 
 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
@@ -2971,7 +3272,7 @@
 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
 	FAIL_IF(!inst);
 	*inst++ = GROUP_0F;
-	*inst = U8(get_jump_code(type & 0xff) - 0x40);
+	*inst = U8(get_jump_code((sljit_uw)type) - 0x40);
 	return SLJIT_SUCCESS;
 }
 
diff --git a/src/sljit/sljitWXExecAllocator.c b/src/sljit/sljitWXExecAllocator.c
index 72d5b8d..6893813 100644
--- a/src/sljit/sljitWXExecAllocator.c
+++ b/src/sljit/sljitWXExecAllocator.c
@@ -59,38 +59,15 @@
 #include <sys/mman.h>
 
 #ifdef __NetBSD__
-#if defined(PROT_MPROTECT)
-#define check_se_protected(ptr, size) (0)
 #define SLJIT_PROT_WX PROT_MPROTECT(PROT_EXEC)
-#else /* !PROT_MPROTECT */
-#ifdef _NETBSD_SOURCE
-#include <sys/param.h>
-#else /* !_NETBSD_SOURCE */
-typedef unsigned int	u_int;
-#define devmajor_t sljit_s32
-#endif /* _NETBSD_SOURCE */
-#include <sys/sysctl.h>
-#include <unistd.h>
-
-#define check_se_protected(ptr, size) netbsd_se_protected()
-
-static SLJIT_INLINE int netbsd_se_protected(void)
-{
-	int mib[3];
-	int paxflags;
-	size_t len = sizeof(paxflags);
-
-	mib[0] = CTL_PROC;
-	mib[1] = getpid();
-	mib[2] = PROC_PID_PAXFLAGS;
-
-	if (SLJIT_UNLIKELY(sysctl(mib, 3, &paxflags, &len, NULL, 0) < 0))
-		return -1;
-
-	return (paxflags & CTL_PROC_PAXFLAGS_MPROTECT) ? -1 : 0;
-}
-#endif /* PROT_MPROTECT */
+#define check_se_protected(ptr, size) (0)
 #else /* POSIX */
+#if !(defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED)
+#include <pthread.h>
+#define SLJIT_SE_LOCK()		pthread_mutex_lock(&se_lock)
+#define SLJIT_SE_UNLOCK()	pthread_mutex_unlock(&se_lock)
+#endif /* !SLJIT_SINGLE_THREADED */
+
 #define check_se_protected(ptr, size) generic_se_protected(ptr, size)
 
 static SLJIT_INLINE int generic_se_protected(void *ptr, sljit_uw size)
@@ -102,22 +79,20 @@
 }
 #endif /* NetBSD */
 
-#if defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED
+#ifndef SLJIT_SE_LOCK
 #define SLJIT_SE_LOCK()
+#endif
+#ifndef SLJIT_SE_UNLOCK
 #define SLJIT_SE_UNLOCK()
-#else /* !SLJIT_SINGLE_THREADED */
-#include <pthread.h>
-#define SLJIT_SE_LOCK()	pthread_mutex_lock(&se_lock)
-#define SLJIT_SE_UNLOCK()	pthread_mutex_unlock(&se_lock)
-#endif /* SLJIT_SINGLE_THREADED */
-
+#endif
 #ifndef SLJIT_PROT_WX
 #define SLJIT_PROT_WX 0
-#endif /* !SLJIT_PROT_WX */
+#endif
 
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 {
-#if !(defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED)
+#if !(defined SLJIT_SINGLE_THREADED && SLJIT_SINGLE_THREADED) \
+	&& !defined(__NetBSD__)
 	static pthread_mutex_t se_lock = PTHREAD_MUTEX_INITIALIZER;
 #endif
 	static int se_protected = !SLJIT_PROT_WX;
diff --git a/testdata/grepinputC.bz2 b/testdata/grepinputC.bz2
new file mode 100644
index 0000000..457047a
--- /dev/null
+++ b/testdata/grepinputC.bz2
Binary files differ
diff --git a/testdata/grepinputC.gz b/testdata/grepinputC.gz
new file mode 100644
index 0000000..c7ff390
--- /dev/null
+++ b/testdata/grepinputC.gz
Binary files differ
diff --git a/testdata/grepnot.bz2 b/testdata/grepnot.bz2
new file mode 100644
index 0000000..730cc8a
--- /dev/null
+++ b/testdata/grepnot.bz2
@@ -0,0 +1,43 @@
+This is a second file of input for the pcregrep tests.
+
+Here is the pattern again.
+
+Pattern
+That time it was on a line by itself.
+
+To pat or not to pat, that is the question.
+
+complete pair
+of lines
+
+That was a complete pair
+of lines all by themselves.
+
+complete pair
+of lines
+
+And there they were again, to check line numbers.
+
+one
+two
+three
+four
+five
+six
+seven
+eight
+nine
+ten
+eleven
+twelve
+thirteen
+fourteen
+fifteen
+sixteen
+seventeen
+eighteen
+nineteen
+twenty
+
+This line contains pattern not on a line by itself.
+This is the last line of this file.
diff --git a/testdata/grepoutput b/testdata/grepoutput
index 66af4cf..aa53aab 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
Binary files differ
diff --git a/testdata/grepoutputC b/testdata/grepoutputC
index 87897f0..56bd283 100644
--- a/testdata/grepoutputC
+++ b/testdata/grepoutputC
@@ -1,26 +1,26 @@
 Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
-Arg1: [T] [his] [s] Arg2: |T| () () (0)
-Arg1: [T] [his] [s] Arg2: |T| () () (0)
-Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
-Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
-Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
 The quick brown
+Arg1: [T] [his] [s] Arg2: |T| () () (0)
 This time it jumps and jumps and jumps.
+Arg1: [T] [his] [s] Arg2: |T| () () (0)
 This line contains \E and (regex) *meta* [characters].
+Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
 The word is cat in this line
+Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
 The caterpillar sat on the mat
+Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
 The snowcat is not an animal
 Arg1: [qu] [qu]
-Arg1: [ t] [ t]
-Arg1: [ l] [ l]
-Arg1: [wo] [wo]
-Arg1: [ca] [ca]
-Arg1: [sn] [sn]
 The quick brown
+Arg1: [ t] [ t]
 This time it jumps and jumps and jumps.
+Arg1: [ l] [ l]
 This line contains \E and (regex) *meta* [characters].
+Arg1: [wo] [wo]
 The word is cat in this line
+Arg1: [ca] [ca]
 The caterpillar sat on the mat
+Arg1: [sn] [sn]
 The snowcat is not an animal
 0:T
 The quick brown
@@ -34,6 +34,24 @@
 The caterpillar sat on the mat
 0:T
 The snowcat is not an animal
+0:T
+
+The quick brown
+0:T
+
+This time it jumps and jumps and jumps.
+0:T
+
+This line contains \E and (regex) *meta* [characters].
+0:T
+
+The word is cat in this line
+0:T
+
+The caterpillar sat on the mat
+0:T
+
+The snowcat is not an animal
 T
 T
 T
diff --git a/testdata/grepoutputCN b/testdata/grepoutputCN
index 838bee6..aef1a3d 100644
--- a/testdata/grepoutputCN
+++ b/testdata/grepoutputCN
@@ -22,6 +22,12 @@
 The caterpillar sat on the mat
 0:T
 The snowcat is not an animal
+The quick brown
+This time it jumps and jumps and jumps.
+This line contains \E and (regex) *meta* [characters].
+The word is cat in this line
+The caterpillar sat on the mat
+The snowcat is not an animal
 T
 T
 T
diff --git a/testdata/grepoutputCNU b/testdata/grepoutputCNU
new file mode 100644
index 0000000..2fbfba0
--- /dev/null
+++ b/testdata/grepoutputCNU
@@ -0,0 +1,18 @@
+0:¦
+The quick brown
+0:¦
+This time it jumps and jumps and jumps.
+0:¦
+This line contains \E and (regex) *meta* [characters].
+0:¦
+The word is cat in this line
+0:¦
+The caterpillar sat on the mat
+0:¦
+The snowcat is not an animal
+The quick brown
+This time it jumps and jumps and jumps.
+This line contains \E and (regex) *meta* [characters].
+The word is cat in this line
+The caterpillar sat on the mat
+The snowcat is not an animal
diff --git a/testdata/grepoutputCU b/testdata/grepoutputCU
new file mode 100644
index 0000000..ed8f491
--- /dev/null
+++ b/testdata/grepoutputCU
@@ -0,0 +1,30 @@
+0:¦
+The quick brown
+0:¦
+This time it jumps and jumps and jumps.
+0:¦
+This line contains \E and (regex) *meta* [characters].
+0:¦
+The word is cat in this line
+0:¦
+The caterpillar sat on the mat
+0:¦
+The snowcat is not an animal
+0:¦
+
+The quick brown
+0:¦
+
+This time it jumps and jumps and jumps.
+0:¦
+
+This line contains \E and (regex) *meta* [characters].
+0:¦
+
+The word is cat in this line
+0:¦
+
+The caterpillar sat on the mat
+0:¦
+
+The snowcat is not an animal
diff --git a/testdata/grepoutputCbz2 b/testdata/grepoutputCbz2
new file mode 100644
index 0000000..6eebfab
--- /dev/null
+++ b/testdata/grepoutputCbz2
@@ -0,0 +1,6 @@
+one
+two
+RC=0
+one
+two
+RC=0
diff --git a/testdata/grepoutputCgz b/testdata/grepoutputCgz
new file mode 100644
index 0000000..f9b65a3
--- /dev/null
+++ b/testdata/grepoutputCgz
@@ -0,0 +1,3 @@
+one
+two
+RC=0
diff --git a/testdata/grepoutputN b/testdata/grepoutputN
index 811c52d..b39654f 100644
--- a/testdata/grepoutputN
+++ b/testdata/grepoutputN
Binary files differ
diff --git a/testdata/grepoutputUN b/testdata/grepoutputUN
new file mode 100644
index 0000000..ae5eb7a
--- /dev/null
+++ b/testdata/grepoutputUN
@@ -0,0 +1,3 @@
+---------------------------- Test UN1 ------------------------------

+1:abcሴdef
+
diff --git a/testdata/testinput15 b/testdata/testinput15
index 5dd6897..22d739b 100644
--- a/testdata/testinput15
+++ b/testdata/testinput15
@@ -6,38 +6,44 @@
 
 # (2) Other tests that must not be run with JIT.
 
+# This test is first so that it doesn't inherit a large enough heap frame 
+# vector from a previous test.
+
+/(*LIMIT_HEAP=21)\[(a)]{60}/expand
+    \[a]{60}
+
 /(a+)*zz/I
-  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
-  aaaaaaaaaaaaaz\=find_limits
+  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits_noheap
+  aaaaaaaaaaaaaz\=find_limits_noheap
 
 !((?:\s|//.*\\n|/[*](?:\\n|.)*?[*]/)*)!I
-   /* this is a C style comment */\=find_limits
+   /* this is a C style comment */\=find_limits_noheap
 
 /^(?>a)++/
-    aa\=find_limits
-    aaaaaaaaa\=find_limits
+    aa\=find_limits_noheap
+    aaaaaaaaa\=find_limits_noheap
 
 /(a)(?1)++/
-    aa\=find_limits
-    aaaaaaaaa\=find_limits
+    aa\=find_limits_noheap
+    aaaaaaaaa\=find_limits_noheap
 
 /a(?:.)*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits_noheap
 
 /a(?:.(*THEN))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits_noheap
 
 /a(?:.(*THEN:ABC))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits_noheap
 
 /^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
-     aabbccddee\=find_limits
+     aabbccddee\=find_limits_noheap
 
 /^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
-     aabbccddee\=find_limits
+     aabbccddee\=find_limits_noheap
 
 /^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
-     aabbccddee\=find_limits
+     aabbccddee\=find_limits_noheap
 
 /(*LIMIT_MATCH=12bc)abc/
 
@@ -228,9 +234,6 @@
 /(|]+){2,2452}/
     (|]+){2,2452}
 
-/(*LIMIT_HEAP=21)\[(a)]{60}/expand
-    \[a]{60}
-
 /b(?<!ax)(?!cx)/allusedtext
     abc
     abcz
diff --git a/testdata/testinput18 b/testdata/testinput18
index c1f4c22..de3645d 100644
--- a/testdata/testinput18
+++ b/testdata/testinput18
@@ -139,5 +139,9 @@
 \= Expect errors
     \=null_subject
     abc\=null_subject
+    
+/(*LIMIT_HEAP=0)xx/posix
+\= Expect error
+    xxxx 
 
 # End of testdata/testinput18
diff --git a/testdata/testinput19 b/testdata/testinput19
index 3bf1720..7b2ab91 100644
--- a/testdata/testinput19
+++ b/testdata/testinput19
@@ -2,7 +2,7 @@
 # interface with UTF/UCP support, which is supported only with the 8-bit
 # library. This test should not be run with JIT (which is not available for the
 # POSIX interface).
-    
+
 #pattern posix
 
 /a\x{1234}b/utf
@@ -14,8 +14,12 @@
 
 /\w/ucp
     +++\x{c2}
-    
+
 /"^AB" 00 "\x{1234}$"/hex,utf
-    AB\x{00}\x{1234}\=posix_startend=0:6 
-    
+    AB\x{00}\x{1234}\=posix_startend=0:6
+
+/\w/utf
+\= Expect UTF error
+    A\xabB
+
 # End of testdata/testinput19
diff --git a/testdata/testinput2 b/testdata/testinput2
index d37d8f3..c63921c 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5932,4 +5932,12 @@
 /[Aa]{2,3}/BI
     aabcd
 
+--
+    \[X]{-10}
+    
+# Check imposition of maximum by match_data_create().
+
+/abcd/
+    abcd\=ovector=65536
+
 # End of testinput2
diff --git a/testdata/testinputheap b/testdata/testinputheap
new file mode 100644
index 0000000..ee165e1
--- /dev/null
+++ b/testdata/testinputheap
@@ -0,0 +1,13 @@
+#pattern framesize, memory
+
+/abcd/
+    abcd\=memory
+    abcd\=find_limits
+
+/(((((((((((((((((((((((((((((( (^abc|xyz){1,20}$  ))))))))))))))))))))))))))))))/x
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=memory
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=find_limits
+
+/ab(cd)/
+    abcd\=memory
+    abcd\=memory,ovector=0 
diff --git a/testdata/testoutput15 b/testdata/testoutput15
index 9154e5f..af0d7c2 100644
--- a/testdata/testoutput15
+++ b/testdata/testoutput15
@@ -6,19 +6,24 @@
 
 # (2) Other tests that must not be run with JIT.
 
+# This test is first so that it doesn't inherit a large enough heap frame 
+# vector from a previous test.
+
+/(*LIMIT_HEAP=21)\[(a)]{60}/expand
+    \[a]{60}
+Failed: error -63: heap limit exceeded
+
 /(a+)*zz/I
 Capture group count = 1
 Starting code units: a z 
 Last code unit = 'z'
 Subject length lower bound = 2
-  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
-Minimum heap limit = 0
+  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits_noheap
 Minimum match limit = 7
 Minimum depth limit = 7
  0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
  1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-  aaaaaaaaaaaaaz\=find_limits
-Minimum heap limit = 0
+  aaaaaaaaaaaaaz\=find_limits_noheap
 Minimum match limit = 20481
 Minimum depth limit = 30
 No match
@@ -27,70 +32,60 @@
 Capture group count = 1
 May match empty string
 Subject length lower bound = 0
-   /* this is a C style comment */\=find_limits
-Minimum heap limit = 0
+   /* this is a C style comment */\=find_limits_noheap
 Minimum match limit = 64
 Minimum depth limit = 7
  0: /* this is a C style comment */
  1: /* this is a C style comment */
 
 /^(?>a)++/
-    aa\=find_limits
-Minimum heap limit = 0
+    aa\=find_limits_noheap
 Minimum match limit = 5
 Minimum depth limit = 3
  0: aa
-    aaaaaaaaa\=find_limits
-Minimum heap limit = 0
+    aaaaaaaaa\=find_limits_noheap
 Minimum match limit = 12
 Minimum depth limit = 3
  0: aaaaaaaaa
 
 /(a)(?1)++/
-    aa\=find_limits
-Minimum heap limit = 0
+    aa\=find_limits_noheap
 Minimum match limit = 7
 Minimum depth limit = 5
  0: aa
  1: a
-    aaaaaaaaa\=find_limits
-Minimum heap limit = 0
+    aaaaaaaaa\=find_limits_noheap
 Minimum match limit = 21
 Minimum depth limit = 5
  0: aaaaaaaaa
  1: a
 
 /a(?:.)*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum heap limit = 0
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits_noheap
 Minimum match limit = 24
 Minimum depth limit = 3
  0: abbbbbbbbbbbbbbbbbbbbba
 
 /a(?:.(*THEN))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum heap limit = 0
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits_noheap
 Minimum match limit = 66
 Minimum depth limit = 45
  0: abbbbbbbbbbbbbbbbbbbbba
 
 /a(?:.(*THEN:ABC))*?a/ims
-    abbbbbbbbbbbbbbbbbbbbba\=find_limits
-Minimum heap limit = 0
+    abbbbbbbbbbbbbbbbbbbbba\=find_limits_noheap
 Minimum match limit = 66
 Minimum depth limit = 45
  0: abbbbbbbbbbbbbbbbbbbbba
 
 /^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
-     aabbccddee\=find_limits
-Minimum heap limit = 0
+     aabbccddee\=find_limits_noheap
 Minimum match limit = 7
 Minimum depth limit = 7
  0: aabbccddee
 
 /^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
-     aabbccddee\=find_limits
-Minimum heap limit = 0
+     aabbccddee\=find_limits_noheap
 Minimum match limit = 12
 Minimum depth limit = 12
  0: aabbccddee
@@ -101,8 +96,7 @@
  5: ee
 
 /^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
-     aabbccddee\=find_limits
-Minimum heap limit = 0
+     aabbccddee\=find_limits_noheap
 Minimum match limit = 10
 Minimum depth limit = 10
  0: aabbccddee
@@ -521,10 +515,6 @@
  0: 
  1: 
 
-/(*LIMIT_HEAP=21)\[(a)]{60}/expand
-    \[a]{60}
-Failed: error -63: heap limit exceeded
-
 /b(?<!ax)(?!cx)/allusedtext
     abc
  0: abc
diff --git a/testdata/testoutput18 b/testdata/testoutput18
index 55cd0cc..6d64cdf 100644
--- a/testdata/testoutput18
+++ b/testdata/testoutput18
@@ -221,5 +221,10 @@
 No match: POSIX code 16: bad argument
     abc\=null_subject
 No match: POSIX code 16: bad argument
+    
+/(*LIMIT_HEAP=0)xx/posix
+\= Expect error
+    xxxx 
+No match: POSIX code 14: failed to get memory
 
 # End of testdata/testinput18
diff --git a/testdata/testoutput19 b/testdata/testoutput19
index a4a8b1a..25aa67d 100644
--- a/testdata/testoutput19
+++ b/testdata/testoutput19
@@ -2,7 +2,7 @@
 # interface with UTF/UCP support, which is supported only with the 8-bit
 # library. This test should not be run with JIT (which is not available for the
 # POSIX interface).
-    
+
 #pattern posix
 
 /a\x{1234}b/utf
@@ -17,9 +17,14 @@
 /\w/ucp
     +++\x{c2}
  0: \xc2
-    
+
 /"^AB" 00 "\x{1234}$"/hex,utf
-    AB\x{00}\x{1234}\=posix_startend=0:6 
+    AB\x{00}\x{1234}\=posix_startend=0:6
  0: AB\x{00}\x{1234}
-    
+
+/\w/utf
+\= Expect UTF error
+    A\xabB
+No match: POSIX code 16: bad argument
+
 # End of testdata/testinput19
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index ce090f8..7069b65 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -17746,6 +17746,16 @@
     aabcd
  0: aa
 
+--
+    \[X]{-10}
+** Zero or negative repeat not allowed
+    
+# Check imposition of maximum by match_data_create().
+
+/abcd/
+    abcd\=ovector=65536
+ 0: abcd
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
diff --git a/testdata/testoutputheap-16 b/testdata/testoutputheap-16
new file mode 100644
index 0000000..41855e4
--- /dev/null
+++ b/testdata/testoutputheap-16
@@ -0,0 +1,40 @@
+#pattern framesize, memory
+
+/abcd/
+Memory allocation (code space): 26
+Frame size for pcre2_match(): 128
+    abcd\=memory
+malloc  20480
+ 0: abcd
+    abcd\=find_limits
+Minimum heap limit = 1
+Minimum match limit = 2
+Minimum depth limit = 2
+ 0: abcd
+
+/(((((((((((((((((((((((((((((( (^abc|xyz){1,20}$  ))))))))))))))))))))))))))))))/x
+Memory allocation (code space): 1294
+Frame size for pcre2_match(): 624
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=memory
+malloc  40960
+free unremembered block
+No match
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=find_limits
+Minimum heap limit = 22
+Minimum match limit = 37
+Minimum depth limit = 35
+No match
+
+/ab(cd)/
+Memory allocation (code space): 36
+Frame size for pcre2_match(): 144
+    abcd\=memory
+ 0: abcd
+ 1: cd
+    abcd\=memory,ovector=0 
+free    40960
+free unremembered block
+malloc    128
+malloc  20480
+ 0: abcd
+ 1: cd
diff --git a/testdata/testoutputheap-32 b/testdata/testoutputheap-32
new file mode 100644
index 0000000..36fa1e3
--- /dev/null
+++ b/testdata/testoutputheap-32
@@ -0,0 +1,40 @@
+#pattern framesize, memory
+
+/abcd/
+Memory allocation (code space): 52
+Frame size for pcre2_match(): 128
+    abcd\=memory
+malloc  20480
+ 0: abcd
+    abcd\=find_limits
+Minimum heap limit = 1
+Minimum match limit = 2
+Minimum depth limit = 2
+ 0: abcd
+
+/(((((((((((((((((((((((((((((( (^abc|xyz){1,20}$  ))))))))))))))))))))))))))))))/x
+Memory allocation (code space): 2588
+Frame size for pcre2_match(): 624
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=memory
+malloc  40960
+free unremembered block
+No match
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=find_limits
+Minimum heap limit = 22
+Minimum match limit = 37
+Minimum depth limit = 35
+No match
+
+/ab(cd)/
+Memory allocation (code space): 72
+Frame size for pcre2_match(): 144
+    abcd\=memory
+ 0: abcd
+ 1: cd
+    abcd\=memory,ovector=0 
+free    40960
+free unremembered block
+malloc    128
+malloc  20480
+ 0: abcd
+ 1: cd
diff --git a/testdata/testoutputheap-8 b/testdata/testoutputheap-8
new file mode 100644
index 0000000..05a6b0c
--- /dev/null
+++ b/testdata/testoutputheap-8
@@ -0,0 +1,40 @@
+#pattern framesize, memory
+
+/abcd/
+Memory allocation (code space): 15
+Frame size for pcre2_match(): 128
+    abcd\=memory
+malloc  20480
+ 0: abcd
+    abcd\=find_limits
+Minimum heap limit = 1
+Minimum match limit = 2
+Minimum depth limit = 2
+ 0: abcd
+
+/(((((((((((((((((((((((((((((( (^abc|xyz){1,20}$  ))))))))))))))))))))))))))))))/x
+Memory allocation (code space): 855
+Frame size for pcre2_match(): 624
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=memory
+malloc  40960
+free unremembered block
+No match
+    abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcX\=find_limits
+Minimum heap limit = 22
+Minimum match limit = 37
+Minimum depth limit = 35
+No match
+
+/ab(cd)/
+Memory allocation (code space): 23
+Frame size for pcre2_match(): 144
+    abcd\=memory
+ 0: abcd
+ 1: cd
+    abcd\=memory,ovector=0 
+free    40960
+free unremembered block
+malloc    128
+malloc  20480
+ 0: abcd
+ 1: cd