# Copyright 2022 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib

import jaxlib.mlir.ir as ir
import jaxlib.mlir.dialects.stablehlo as hlo

import numpy as np

from jaxlib import xla_client
from .gpu_common_utils import GpuLibNotLinkedError

# Try each candidate location of the CUDA `_rnn` extension in order and keep
# the first one that imports. `_rnn` is left as None when none is importable.
for cuda_module_name in [".cuda", "jax_cuda12_plugin"]:
  try:
    _rnn = importlib.import_module(f"{cuda_module_name}._rnn", package="jaxlib")
  except ImportError:
    _rnn = None
  else:
    break

if _rnn:
  for _name, _value in _rnn.registrations().items():
    xla_client.register_custom_call_target(_name, _value, platform='CUDA')
  # NOTE: this name only exists when the extension was imported; the lowering
  # functions below check `_rnn` before touching it.
  compute_rnn_workspace_reserve_space_sizes = _rnn.compute_rnn_workspace_reserve_space_sizes


def cudnn_rnn_lowering(ctx, input, h_0, c_0, weights, seq_lengths, *,
                       input_size: int, hidden_size: int, num_layers: int,
                       dropout: bool, bidirectional: bool,
                       cudnn_allow_tf32: bool):
  """Lowers the forward RNN pass to a `cudnn_rnn` custom call.

  Args:
    ctx: lowering context; `ctx.avals_in[0]` supplies the batch size
      (dimension 0) and maximum sequence length (dimension 1), and
      `ctx.avals_out` supplies the output dtype/shape (index 0) and the
      reserve-space shape (index 3).
    input, h_0, c_0, weights, seq_lengths: operands passed, in this order, to
      the custom call.
    input_size, hidden_size, num_layers, dropout, bidirectional,
    cudnn_allow_tf32: configuration forwarded to the workspace-size
      computation and to the RNN descriptor.

  Returns:
    The custom call results with the workspace result removed, i.e. the first
    three results followed by the reserve-space result.

  Raises:
    GpuLibNotLinkedError: if the CUDA `_rnn` extension could not be imported.
    ValueError: if the output dtype is not float32, float64, complex64 or
      complex128.
  """
  # Check up front: `compute_rnn_workspace_reserve_space_sizes` is only bound
  # when `_rnn` was imported, so checking later would surface as a NameError.
  if not _rnn:
    raise GpuLibNotLinkedError()

  out_dtype = ctx.avals_out[0].dtype
  if out_dtype == np.float32:
    out_type = ir.F32Type.get()
  elif out_dtype == np.float64:
    out_type = ir.F64Type.get()
  elif out_dtype == np.complex64:
    out_type = ir.ComplexType.get(ir.F32Type.get())
  elif out_dtype == np.complex128:
    out_type = ir.ComplexType.get(ir.F64Type.get())
  else:
    raise ValueError(f'Unknown output type {out_dtype}')

  output_type = ir.RankedTensorType.get(ctx.avals_out[0].shape, out_type)
  batch_size = ctx.avals_in[0].shape[0]
  max_seq_length = ctx.avals_in[0].shape[1]

  # The workspace is not read from `ctx.avals_out` (index 3 there is the
  # reserve space); its size is computed here and the corresponding result is
  # dropped before returning.
  workspace_size, _ = compute_rnn_workspace_reserve_space_sizes(
      input_size, hidden_size, num_layers, batch_size, max_seq_length,
      dropout, bidirectional, cudnn_allow_tf32)
  workspace_shape = (workspace_size,)
  workspace_type = ir.RankedTensorType.get(workspace_shape, ir.F32Type.get())
  reserve_space_shape = ctx.avals_out[3].shape
  reserve_space_type = ir.RankedTensorType.get(reserve_space_shape,
                                               ir.F32Type.get())

  opaque = _rnn.build_rnn_descriptor(input_size, hidden_size, num_layers,
                                     batch_size, max_seq_length, dropout,
                                     bidirectional, cudnn_allow_tf32,
                                     workspace_shape[0],
                                     reserve_space_shape[0])

  i32_type = ir.IntegerType.get_signless(32)

  out = hlo.CustomCallOp(
      [output_type, h_0.type, c_0.type, workspace_type, reserve_space_type],
      [input, h_0, c_0, weights, seq_lengths],
      call_target_name=ir.StringAttr.get('cudnn_rnn'),
      has_side_effect=ir.BoolAttr.get(False),
      backend_config=ir.StringAttr.get(opaque),
      api_version=ir.IntegerAttr.get(i32_type, 2),
      called_computations=ir.ArrayAttr.get([]),
  )
  return out.results[:-2] + out.results[-1:]  # drop workspace output


def _hlo_zeros_f32(shape):
  """Returns an HLO constant of the given shape filled with float32 zeros."""
  return hlo.constant(
      ir.DenseElementsAttr.get(
          np.zeros(shape, dtype=np.float32), type=ir.F32Type.get()))


def cudnn_rnn_bwd_lowering(ctx, dy, dhn, dcn, x, h0, c0, w, y,
                           reserve_space, seq_lengths, *, input_size: int,
                           hidden_size: int, num_layers: int, dropout: bool,
                           bidirectional: bool, cudnn_allow_tf32: bool):
  """Lowers the backward RNN pass to a `cudnn_rnn_bwd` custom call.

  Args:
    ctx: lowering context; `ctx.avals_in[3]` supplies the batch size
      (dimension 0) and maximum sequence length (dimension 1),
      `ctx.avals_in[8]` supplies the reserve-space shape, and
      `ctx.avals_out[3]` supplies the shape of the zero-initialised buffer
      handed to the custom call.
      NOTE(review): indices 3 and 8 presumably correspond to `x` and
      `reserve_space` -- confirm that `avals_in` follows the operand order.
    dy, dhn, dcn, x, h0, c0, w, y, reserve_space, seq_lengths: operands passed
      to the custom call (a zero buffer is inserted before `seq_lengths`).
    input_size, hidden_size, num_layers, dropout, bidirectional,
    cudnn_allow_tf32: configuration forwarded to the workspace-size
      computation and to the RNN descriptor.

  Returns:
    The custom call results with the trailing workspace result removed; the
    remaining four results are typed like `x`, `h0`, `c0` and `w`.

  Raises:
    GpuLibNotLinkedError: if the CUDA `_rnn` extension could not be imported.
  """
  # Same guard (and same exception type) as the forward lowering, placed
  # before the first use of anything that only exists when `_rnn` imported.
  if not _rnn:
    raise GpuLibNotLinkedError()

  batch_size = ctx.avals_in[3].shape[0]
  max_seq_length = ctx.avals_in[3].shape[1]
  workspace_size, _ = compute_rnn_workspace_reserve_space_sizes(
      input_size, hidden_size, num_layers, batch_size, max_seq_length,
      dropout, bidirectional, cudnn_allow_tf32)
  workspace_shape = (workspace_size,)
  workspace_type = ir.RankedTensorType.get(workspace_shape, ir.F32Type.get())
  reserve_space_shape = ctx.avals_in[8].shape

  opaque = _rnn.build_rnn_descriptor(input_size, hidden_size, num_layers,
                                     batch_size, max_seq_length, dropout,
                                     bidirectional, cudnn_allow_tf32,
                                     workspace_shape[0],
                                     reserve_space_shape[0])

  i32_type = ir.IntegerType.get_signless(32)
  # Zero-filled buffer that is passed as operand 9 and aliased to result 3
  # (the result typed like `w`) via `output_operand_aliases` below.
  zeroed_dw = _hlo_zeros_f32(ctx.avals_out[3].shape)
  out = hlo.CustomCallOp(
      [x.type, h0.type, c0.type, w.type, workspace_type],
      [
          dy, dhn, dcn, x, h0, c0, w, y, reserve_space, zeroed_dw,
          seq_lengths
      ],
      call_target_name=ir.StringAttr.get('cudnn_rnn_bwd'),
      has_side_effect=ir.BoolAttr.get(False),
      backend_config=ir.StringAttr.get(opaque),
      api_version=ir.IntegerAttr.get(i32_type, 2),
      called_computations=ir.ArrayAttr.get([]),
      output_operand_aliases=ir.ArrayAttr.get([
          hlo.OutputOperandAlias.get(
              output_tuple_indices=[3],
              operand_index=9,
              operand_tuple_indices=[])
      ]))
  return out.results[:-1]  # drop workspace output