Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this computes L2 norms (expensive) just to check finiteness (a cheap side effect). Introducing a dedicated check_finite_batch kernel would be more direct. I think we could do that in a follow-up; please correct me if I'm wrong. Thanks!

cc @rich7420 @ryankert01

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree : )

Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,164 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}

/// Amplitude-encodes a single sample whose data is referenced by a raw GPU pointer.
///
/// `input_d` is reinterpreted as a pointer to `input_len` `f64` values. An inverse
/// norm is computed from that data on `stream`, a new `Float64` state vector is
/// allocated for `num_qubits`, the amplitude-encode kernel is launched on `stream`,
/// and `stream` is synchronized before the state vector is returned.
///
/// # Errors
/// - `MahoutError::InvalidInput` if `2^num_qubits` does not fit in `usize`, if
///   `input_len` is zero or larger than `2^num_qubits`, or if the allocated state
///   vector does not expose a float64 buffer.
/// - `MahoutError::KernelLaunch` if the encode kernel returns a non-zero code.
/// - Whatever allocation, inverse-norm computation, or stream synchronization
///   propagate on failure.
///
/// # Safety
/// The caller must guarantee that `input_d` is readable by the GPU kernels as at
/// least `input_len` contiguous `f64` values for the duration of the call, and that
/// `stream` is a stream handle accepted by the kernel launchers used here.
#[cfg(target_os = "linux")]
unsafe fn encode_from_gpu_ptr(
    &self,
    device: &Arc<CudaDevice>,
    input_d: *const c_void,
    input_len: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    // A plain `1 << num_qubits` panics (overflow checks on) or wraps (checks off)
    // once `num_qubits >= usize::BITS`; reject such values as invalid input instead.
    let state_len = u32::try_from(num_qubits)
        .ok()
        .and_then(|bits| 1usize.checked_shl(bits))
        .ok_or_else(|| {
            MahoutError::InvalidInput(format!(
                "Number of qubits {} is too large: state vector size 2^{} does not fit in usize",
                num_qubits, num_qubits
            ))
        })?;
    if input_len == 0 {
        return Err(MahoutError::InvalidInput(
            "Input data cannot be empty".into(),
        ));
    }
    if input_len > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Input size {} exceeds state vector size {} (2^{} qubits)",
            input_len, state_len, num_qubits
        )));
    }
    let input_d = input_d as *const f64;
    let state_vector = {
        crate::profile_scope!("GPU::Alloc");
        GpuStateVector::new(device, num_qubits, Precision::Float64)?
    };
    let inv_norm = {
        crate::profile_scope!("GPU::NormFromPtr");
        // SAFETY: the caller guarantees `input_d` covers `input_len` f64 values
        // and that `stream` is usable (see `# Safety`).
        unsafe { Self::calculate_inv_norm_gpu_with_stream(device, input_d, input_len, stream)? }
    };
    let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
        MahoutError::InvalidInput(
            "State vector precision mismatch (expected float64 buffer)".to_string(),
        )
    })?;
    {
        crate::profile_scope!("GPU::KernelLaunch");
        // SAFETY: `input_d` is valid per the caller contract; `state_ptr` comes from
        // the state vector allocated above for `num_qubits`, and `input_len` was
        // checked to be at most `state_len`.
        let ret = unsafe {
            launch_amplitude_encode(
                input_d,
                state_ptr as *mut c_void,
                input_len,
                state_len,
                inv_norm,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Amplitude encode kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    {
        // Wait for the kernel so the returned state vector is fully written.
        crate::profile_scope!("GPU::Synchronize");
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(state_vector)
}

/// Amplitude-encodes a batch of samples referenced by a raw GPU pointer.
///
/// `input_batch_d` is reinterpreted as a pointer to `f64` data and handed to the
/// batch kernels together with `num_samples` and `sample_size`. The steps are:
/// allocate the batch state vector, run the batch norm kernel on `stream` into a
/// per-sample buffer, copy that buffer to the host and reject zero or non-finite
/// entries, launch the batch encode kernel on `stream`, and synchronize `stream`.
///
/// # Errors
/// - `MahoutError::InvalidInput` if `2^num_qubits` does not fit in `usize`, if
///   `num_samples` or `sample_size` is zero, if `sample_size` exceeds
///   `2^num_qubits`, if any per-sample norm value is zero or non-finite, or if the
///   batch state vector does not expose a float64 buffer.
/// - `MahoutError::MemoryAllocation` if the norm buffer cannot be allocated.
/// - `MahoutError::KernelLaunch` if either kernel returns a non-zero code.
/// - `MahoutError::Cuda` if the norm buffer cannot be copied to the host.
/// - Whatever batch allocation or stream synchronization propagate on failure.
///
/// # Safety
/// The caller must guarantee that `input_batch_d` is readable by the GPU kernels as
/// `f64` data covering all `num_samples` samples of `sample_size` values each
/// (presumably `num_samples * sample_size` contiguous values — confirm against the
/// kernel's layout), and that `stream` is a stream handle accepted by the kernel
/// launchers used here.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr(
    &self,
    device: &Arc<CudaDevice>,
    input_batch_d: *const c_void,
    num_samples: usize,
    sample_size: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    // A plain `1 << num_qubits` panics (overflow checks on) or wraps (checks off)
    // once `num_qubits >= usize::BITS`; reject such values as invalid input instead.
    let state_len = u32::try_from(num_qubits)
        .ok()
        .and_then(|bits| 1usize.checked_shl(bits))
        .ok_or_else(|| {
            MahoutError::InvalidInput(format!(
                "Number of qubits {} is too large: state vector size 2^{} does not fit in usize",
                num_qubits, num_qubits
            ))
        })?;
    // An empty batch would otherwise reach zero-sized allocations and launches.
    if num_samples == 0 {
        return Err(MahoutError::InvalidInput(
            "Number of samples cannot be zero".into(),
        ));
    }
    if sample_size == 0 {
        return Err(MahoutError::InvalidInput(
            "Sample size cannot be zero".into(),
        ));
    }
    if sample_size > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} exceeds state vector size {} (2^{} qubits)",
            sample_size, state_len, num_qubits
        )));
    }
    let input_batch_d = input_batch_d as *const f64;
    let batch_state_vector = {
        crate::profile_scope!("GPU::AllocBatch");
        GpuStateVector::new_batch(device, num_samples, num_qubits)?
    };
    let inv_norms_gpu = {
        crate::profile_scope!("GPU::BatchNormKernel");
        use cudarc::driver::DevicePtrMut;
        // One f64 slot per sample for the norm kernel's output.
        let mut buffer = device.alloc_zeros::<f64>(num_samples).map_err(|e| {
            MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
        })?;
        // SAFETY: `input_batch_d` is valid per the caller contract and `buffer`
        // holds `num_samples` f64 values, one per sample.
        let ret = unsafe {
            launch_l2_norm_batch(
                input_batch_d,
                num_samples,
                sample_size,
                *buffer.device_ptr_mut() as *mut f64,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Norm reduction kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
        buffer
    };
    {
        crate::profile_scope!("GPU::NormValidation");
        // The norm kernel was enqueued on the caller's `stream`, while the host copy
        // below goes through `device`. NOTE(review): presumably that copy is not
        // ordered against `stream`, so wait for the kernel explicitly before reading.
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
        let host_inv_norms = device
            .dtoh_sync_copy(&inv_norms_gpu)
            .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
        // Fail early instead of encoding samples that cannot be normalized.
        if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
            return Err(MahoutError::InvalidInput(
                "One or more samples have zero or invalid norm".to_string(),
            ));
        }
    }
    {
        crate::profile_scope!("GPU::BatchKernelLaunch");
        use cudarc::driver::DevicePtr;
        let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
            MahoutError::InvalidInput(
                "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
            )
        })?;
        // SAFETY: `input_batch_d` is valid per the caller contract; `state_ptr` and
        // `inv_norms_gpu` are live allocations created above for `num_samples`.
        let ret = unsafe {
            launch_amplitude_encode_batch(
                input_batch_d,
                state_ptr as *mut c_void,
                *inv_norms_gpu.device_ptr() as *const f64,
                num_samples,
                sample_size,
                state_len,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Batch kernel launch failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    {
        // Wait for the encode kernel: it reads `inv_norms_gpu`, which is dropped
        // when this function returns.
        crate::profile_scope!("GPU::Synchronize");
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(batch_state_vector)
}

/// Returns the static identifier of this encoder: `"amplitude"`.
fn name(&self) -> &'static str {
    const ENCODER_NAME: &str = "amplitude";
    ENCODER_NAME
}
Expand Down
154 changes: 154 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/angle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,160 @@ impl QuantumEncoder for AngleEncoder {
Ok(batch_state_vector)
}

/// Angle-encodes a single sample whose angles are referenced by a raw GPU pointer.
///
/// `input_d` is reinterpreted as a pointer to `f64` angles, one per qubit. A new
/// `Float64` state vector is allocated for `num_qubits`, the angle-encode kernel is
/// launched on `stream`, and `stream` is synchronized before returning.
///
/// # Errors
/// - `MahoutError::InvalidInput` if `input_len != num_qubits`, if `2^num_qubits`
///   does not fit in `usize`, or if the allocated state vector does not expose a
///   float64 buffer.
/// - `MahoutError::KernelLaunch` if the kernel returns a non-zero code.
/// - Whatever allocation or stream synchronization propagate on failure.
///
/// # Safety
/// The caller must guarantee that `input_d` is readable by the GPU kernel as at
/// least `input_len` contiguous `f64` values for the duration of the call, and that
/// `stream` is a stream handle accepted by the kernel launcher used here.
#[cfg(target_os = "linux")]
unsafe fn encode_from_gpu_ptr(
    &self,
    device: &Arc<CudaDevice>,
    input_d: *const c_void,
    input_len: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    if input_len != num_qubits {
        return Err(MahoutError::InvalidInput(format!(
            "Angle encoding expects {} values (one per qubit), got {}",
            num_qubits, input_len
        )));
    }
    // Convert the qubit count once, with checks: `num_qubits as u32` would silently
    // truncate and `1 << num_qubits` would panic or wrap for counts >= usize::BITS.
    let (qubit_count, state_len) = u32::try_from(num_qubits)
        .ok()
        .and_then(|bits| 1usize.checked_shl(bits).map(|len| (bits, len)))
        .ok_or_else(|| {
            MahoutError::InvalidInput(format!(
                "Number of qubits {} is too large: state vector size 2^{} does not fit in usize",
                num_qubits, num_qubits
            ))
        })?;
    let angles_d = input_d as *const f64;
    let state_vector = {
        crate::profile_scope!("GPU::Alloc");
        GpuStateVector::new(device, num_qubits, Precision::Float64)?
    };
    let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
        MahoutError::InvalidInput(
            "State vector precision mismatch (expected float64 buffer)".to_string(),
        )
    })?;
    {
        crate::profile_scope!("GPU::KernelLaunch");
        // SAFETY: `angles_d` is valid per the caller contract (`input_len` equals
        // `num_qubits`, checked above); `state_ptr` comes from the state vector
        // allocated above for `num_qubits`.
        let ret = unsafe {
            qdp_kernels::launch_angle_encode(
                angles_d,
                state_ptr as *mut c_void,
                state_len,
                qubit_count,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Angle encoding kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    {
        // Wait for the kernel so the returned state vector is fully written.
        crate::profile_scope!("GPU::Synchronize");
        crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(state_vector)
}

/// Angle-encodes a batch of samples referenced by a raw GPU pointer.
///
/// `input_batch_d` is reinterpreted as a pointer to `f64` angles and handed to the
/// batch kernels together with `num_samples` and `sample_size`. Before encoding,
/// the batch L2-norm kernel is run on `stream` and its per-sample output is copied
/// to the host; any non-finite entry rejects the whole batch. The batch state
/// vector is then allocated, the batch angle-encode kernel is launched on `stream`,
/// and `stream` is synchronized before returning.
///
/// # Errors
/// - `MahoutError::InvalidInput` if `num_samples` or `sample_size` is zero, if
///   `sample_size != num_qubits`, if `2^num_qubits` does not fit in `usize`, if the
///   validation pass yields a non-finite value, or if the batch state vector does
///   not expose a float64 buffer.
/// - `MahoutError::MemoryAllocation` if the validation buffer cannot be allocated.
/// - `MahoutError::KernelLaunch` if either kernel returns a non-zero code.
/// - `MahoutError::Cuda` if the validation buffer cannot be copied to the host.
/// - Whatever batch allocation or stream synchronization propagate on failure.
///
/// # Safety
/// The caller must guarantee that `input_batch_d` is readable by the GPU kernels as
/// `f64` data covering all `num_samples` samples of `sample_size` values each
/// (presumably `num_samples * sample_size` contiguous values — confirm against the
/// kernel's layout), and that `stream` is a stream handle accepted by the kernel
/// launchers used here.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr(
    &self,
    device: &Arc<CudaDevice>,
    input_batch_d: *const c_void,
    num_samples: usize,
    sample_size: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    // An empty batch would otherwise reach zero-sized allocations and launches.
    if num_samples == 0 {
        return Err(MahoutError::InvalidInput(
            "Number of samples cannot be zero".into(),
        ));
    }
    if sample_size == 0 {
        return Err(MahoutError::InvalidInput(
            "Sample size cannot be zero".into(),
        ));
    }
    if sample_size != num_qubits {
        return Err(MahoutError::InvalidInput(format!(
            "Angle encoding expects sample_size={} (one angle per qubit), got {}",
            num_qubits, sample_size
        )));
    }
    // Convert the qubit count once, with checks: `num_qubits as u32` would silently
    // truncate and `1 << num_qubits` would panic or wrap for counts >= usize::BITS.
    let (qubit_count, state_len) = u32::try_from(num_qubits)
        .ok()
        .and_then(|bits| 1usize.checked_shl(bits).map(|len| (bits, len)))
        .ok_or_else(|| {
            MahoutError::InvalidInput(format!(
                "Number of qubits {} is too large: state vector size 2^{} does not fit in usize",
                num_qubits, num_qubits
            ))
        })?;
    let input_batch_d = input_batch_d as *const f64;
    let angle_validation_buffer = {
        crate::profile_scope!("GPU::AngleFiniteCheckBatch");
        use cudarc::driver::DevicePtrMut;
        // One f64 slot per sample; the norm kernel's output is used only as a
        // finiteness probe for that sample's angles.
        // NOTE(review): the amplitude encoder treats this kernel's output as inverse
        // norms. If that is what it writes, confirm that NaN/Inf angles (and an
        // all-zero row) map to the values the `is_finite` check below expects.
        let mut buffer = device.alloc_zeros::<f64>(num_samples).map_err(|e| {
            MahoutError::MemoryAllocation(format!(
                "Failed to allocate angle validation buffer: {:?}",
                e
            ))
        })?;
        // SAFETY: `input_batch_d` is valid per the caller contract and `buffer`
        // holds `num_samples` f64 values, one per sample.
        let ret = unsafe {
            qdp_kernels::launch_l2_norm_batch(
                input_batch_d,
                num_samples,
                sample_size,
                *buffer.device_ptr_mut() as *mut f64,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Angle validation norm kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
        buffer
    };
    {
        crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
        // The validation kernel was enqueued on the caller's `stream`, while the host
        // copy below goes through `device`. NOTE(review): presumably that copy is not
        // ordered against `stream`, so wait for the kernel explicitly before reading.
        crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
        let host_norms = device
            .dtoh_sync_copy(&angle_validation_buffer)
            .map_err(|e| {
                MahoutError::Cuda(format!(
                    "Failed to copy angle validation norms to host: {:?}",
                    e
                ))
            })?;
        if host_norms.iter().any(|v| !v.is_finite()) {
            return Err(MahoutError::InvalidInput(
                "Angle encoding batch contains non-finite values (NaN or Inf)".to_string(),
            ));
        }
    }
    // Allocate the output only after validation passed.
    let batch_state_vector = {
        crate::profile_scope!("GPU::AllocBatch");
        GpuStateVector::new_batch(device, num_samples, num_qubits)?
    };
    let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
        MahoutError::InvalidInput(
            "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
        )
    })?;
    {
        crate::profile_scope!("GPU::BatchKernelLaunch");
        // SAFETY: `input_batch_d` is valid per the caller contract; `state_ptr` comes
        // from the batch state vector allocated above for `num_samples` samples.
        let ret = unsafe {
            qdp_kernels::launch_angle_encode_batch(
                input_batch_d,
                state_ptr as *mut c_void,
                num_samples,
                state_len,
                qubit_count,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Batch angle encoding kernel failed: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    {
        // Wait for the kernel so the returned batch is fully written.
        crate::profile_scope!("GPU::Synchronize");
        crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(batch_state_vector)
}

fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
validate_qubit_count(num_qubits)?;
if data.len() != num_qubits {
Expand Down
Loading