Skip to content

Commit

Permalink
Parallelize sampling measure (#2049)
Browse files Browse the repository at this point in the history
* parallelize sampling measure

* replace BitVector with SampleVector, a special class for sampling measure

* format

---------

Co-authored-by: Hiroshi Horii <hhorii@users.noreply.github.com>
  • Loading branch information
doichanj and hhorii authored Mar 19, 2024
1 parent 76b1042 commit cb3a4ff
Show file tree
Hide file tree
Showing 18 changed files with 552 additions and 230 deletions.
5 changes: 4 additions & 1 deletion qiskit_aer/backends/aerbackend.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,10 @@ def target(self):
if self._target is not None:
return self._target

return convert_to_target(self.configuration(), self.properties(), None, NAME_MAPPING)
tgt = convert_to_target(self.configuration(), self.properties(), None, NAME_MAPPING)
if self._coupling_map is not None:
tgt._coupling_graph = self._coupling_map.graph.copy()
return tgt

def clear_options(self):
"""Reset the simulator options to default values."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
features:
- |
Added SampleVector class to store classical bits instead of using reg_t,
reducing memory usage and memory bandwidth.
upgrade:
- |
Parallelized the previously serial loops in sampling measure to speed up
simulations with a large number of shots.
17 changes: 6 additions & 11 deletions src/controllers/state_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1458,10 +1458,10 @@ std::vector<std::string> AerState::sample_memory(const reg_t &qubits,

std::vector<std::string> ret;
ret.reserve(shots);
std::vector<reg_t> samples = state_->sample_measure(qubits, shots, rng_);
std::vector<SampleVector> samples =
state_->sample_measure(qubits, shots, rng_);
for (auto &sample : samples) {
ret.push_back(
Utils::int2string(Utils::reg2int(sample, 2), 2, qubits.size()));
ret.push_back(sample.to_string());
}
return ret;
}
Expand All @@ -1472,16 +1472,11 @@ std::unordered_map<uint_t, uint_t> AerState::sample_counts(const reg_t &qubits,

flush_ops();

std::vector<reg_t> samples = state_->sample_measure(qubits, shots, rng_);
std::vector<SampleVector> samples =
state_->sample_measure(qubits, shots, rng_);
std::unordered_map<uint_t, uint_t> ret;
for (const auto &sample : samples) {
uint_t sample_u = 0ULL;
uint_t mask = 1ULL;
for (const auto b : sample) {
if (b)
sample_u |= mask;
mask <<= 1;
}
uint_t sample_u = sample(0); // only the first 64bits is used
if (ret.find(sample_u) == ret.end())
ret[sample_u] = 1ULL;
else
Expand Down
101 changes: 70 additions & 31 deletions src/simulators/circuit_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class Executor : public Base {
template <typename InputIterator>
void measure_sampler(InputIterator first_meas, InputIterator last_meas,
uint_t shots, state_t &state, ExperimentResult &result,
RngEngine &rng, bool save_creg_to_state = false) const;
RngEngine &rng) const;

#ifdef AER_MPI
void gather_creg_memory(std::vector<ClassicalRegister> &cregs,
Expand All @@ -219,14 +219,14 @@ class Executor : public Base {

// Sample n-measurement outcomes without applying the measure operation
// to the system state
virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const {
std::vector<reg_t> ret;
// Default sampler: ignores its arguments and hands back an empty list of
// samples. Declared virtual so that derived executors can supply a real one.
virtual std::vector<SampleVector>
sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) const {
  return std::vector<SampleVector>();
}
virtual std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
uint_t shots,
std::vector<RngEngine> &rng) const {
// Single-generator fallback: forwards to the state's own sampler using only
// the first engine in rng. Sub-classes are expected to implement the
// multi-shots case.
virtual std::vector<SampleVector>
sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
               std::vector<RngEngine> &rng) const {
  RngEngine &first_rng = rng.front();
  return state.sample_measure(qubits, shots, first_rng);
}
Expand Down Expand Up @@ -1033,8 +1033,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
InputIterator last_meas, uint_t shots,
state_t &state,
ExperimentResult &result,
RngEngine &rng,
bool save_creg_to_state) const {
RngEngine &rng) const {
// Check if meas_circ is empty, and if so return initial creg
if (first_meas == last_meas) {
while (shots-- > 0) {
Expand Down Expand Up @@ -1065,7 +1064,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,

// Generate the samples
auto timer_start = myclock_t::now();
std::vector<reg_t> all_samples;
std::vector<SampleVector> all_samples;
all_samples = state.sample_measure(meas_qubits, shots, rng);
auto time_taken =
std::chrono::duration<double>(myclock_t::now() - timer_start).count();
Expand Down Expand Up @@ -1095,30 +1094,70 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
(memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
uint_t num_registers =
(register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
ClassicalRegister creg;
for (int_t i = all_samples.size() - 1; i >= 0; i--) {
creg.initialize(num_memory, num_registers);

// process memory bit measurements
for (const auto &pair : memory_map) {
creg.store_measure(reg_t({all_samples[i][pair.second]}),
reg_t({pair.first}), reg_t());
}
// process register bit measurements
for (const auto &pair : register_map) {
creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
reg_t({pair.first}));
}

// process read out errors for memory and registers
for (const Operations::Op &roerror : roerror_ops)
creg.apply_roerror(roerror, rng);
if (roerror_ops.size() > 0) {
// can not parallelize for read out error because of rng
ClassicalRegister creg;
for (uint_t is = 0; is < all_samples.size(); is++) {
uint_t i = all_samples.size() - is - 1;
creg.initialize(num_memory, num_registers);

// process memory bit measurements
for (const auto &pair : memory_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t({pair.first}), reg_t());
}
// process register bit measurements
for (const auto &pair : register_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t(), reg_t({pair.first}));
}

// process read out errors for memory and registers
for (const Operations::Op &roerror : roerror_ops)
creg.apply_roerror(roerror, rng);

// Save count data
if (save_creg_to_state)
state.creg() = creg;
else
// Save count data
result.save_count_data(creg, save_creg_memory_);
}
} else {
uint_t npar = parallel_state_update_;
if (npar > all_samples.size())
npar = all_samples.size();

std::vector<ExperimentResult> par_results(npar);
auto copy_samples_lambda = [this, &par_results, num_memory, num_registers,
memory_map, register_map, npar,
&all_samples](int_t ip) {
ClassicalRegister creg;
uint_t is, ie;
is = all_samples.size() * ip / npar;
ie = all_samples.size() * (ip + 1) / npar;
for (; is < ie; is++) {
uint_t i = all_samples.size() - is - 1;
creg.initialize(num_memory, num_registers);

// process memory bit measurements
for (const auto &pair : memory_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t({pair.first}), reg_t());
}
// process register bit measurements
for (const auto &pair : register_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t(), reg_t({pair.first}));
}

// Save count data
par_results[ip].save_count_data(creg, save_creg_memory_);
}
};
Utils::apply_omp_parallel_for((npar > 1), 0, npar, copy_samples_lambda,
npar);

for (int_t i = 0; i < npar; i++) {
result.combine(std::move(par_results[i]));
}
}
}

Expand Down
66 changes: 34 additions & 32 deletions src/simulators/density_matrix/densitymatrix_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,

// Sample n-measurement outcomes without applying the measure operation
// to the system state
std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const override;
std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const override;

rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
const reg_t &qubits);
Expand All @@ -180,9 +180,9 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
const reg_t &cmemory, const reg_t &cregister);

std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
uint_t shots,
std::vector<RngEngine> &rng) const override;
std::vector<SampleVector>
sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
std::vector<RngEngine> &rng) const override;

//-----------------------------------------------------------------------
// Functions for multi-chunk distribution
Expand Down Expand Up @@ -1215,15 +1215,14 @@ void Executor<densmat_t>::measure_reset_update(const reg_t &qubits,
}

template <class densmat_t>
std::vector<reg_t> Executor<densmat_t>::sample_measure(const reg_t &qubits,
uint_t shots,
RngEngine &rng) const {
std::vector<SampleVector>
Executor<densmat_t>::sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const {
// Generate flat register for storing
std::vector<double> rnds;
rnds.reserve(shots);
for (uint_t i = 0; i < shots; ++i)
rnds.push_back(rng.rand(0, 1));
reg_t allbit_samples(shots, 0);

uint_t i, j;
std::vector<double> chunkSum(Base::states_.size() + 1, 0);
Expand Down Expand Up @@ -1322,20 +1321,27 @@ std::vector<reg_t> Executor<densmat_t>::sample_measure(const reg_t &qubits,
#ifdef AER_MPI
BasePar::reduce_sum(local_samples);
#endif
allbit_samples = local_samples;

// Convert to reg_t format
std::vector<reg_t> all_samples;
all_samples.reserve(shots);
for (int_t val : allbit_samples) {
reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
reg_t sample;
sample.reserve(qubits.size());
for (uint_t qubit : qubits) {
sample.push_back(allbit_sample[qubit]);
// Convert to SampleVector format
int_t npar = Base::parallel_state_update_;
if (npar > local_samples.size())
npar = local_samples.size();
std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));

auto convert_to_bit_lambda = [this, &local_samples, &all_samples, shots,
qubits, npar](int_t i) {
uint_t ishot, iend;
ishot = local_samples.size() * i / npar;
iend = local_samples.size() * (i + 1) / npar;
for (; ishot < iend; ishot++) {
SampleVector allbit_sample;
allbit_sample.from_uint(local_samples[ishot], qubits.size());
all_samples[ishot].map(allbit_sample, qubits);
}
all_samples.push_back(sample);
}
};
Utils::apply_omp_parallel_for(
(npar > 1 && BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
npar, convert_to_bit_lambda, npar);
return all_samples;
}

Expand Down Expand Up @@ -1439,7 +1445,7 @@ void Executor<state_t>::apply_measure(CircuitExecutor::Branch &root,
}

template <class state_t>
std::vector<reg_t>
std::vector<SampleVector>
Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
uint_t shots,
std::vector<RngEngine> &rng) const {
Expand All @@ -1454,17 +1460,13 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
auto allbit_samples = state.qreg().sample_measure(rnds);
state.qreg().enable_batch(flg);

// Convert to reg_t format
std::vector<reg_t> all_samples;
all_samples.reserve(shots);
// Convert to bit format
std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
i = 0;
for (int_t val : allbit_samples) {
reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
reg_t sample;
sample.reserve(qubits.size());
for (uint_t qubit : qubits) {
sample.push_back(allbit_sample[qubit]);
}
all_samples.push_back(sample);
SampleVector allbit_sample;
allbit_sample.from_uint(val, qubits.size());
all_samples[i++].map(allbit_sample, qubits);
}
return all_samples;
}
Expand Down
40 changes: 24 additions & 16 deletions src/simulators/density_matrix/densitymatrix_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ class State : public QuantumState::State<densmat_t> {

// Sample n-measurement outcomes without applying the measure operation
// to the system state
std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) override;
std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) override;

// Helper function for computing expectation value
double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
Expand Down Expand Up @@ -987,9 +987,9 @@ void State<densmat_t>::measure_reset_update(const reg_t &qubits,
}

template <class densmat_t>
std::vector<reg_t> State<densmat_t>::sample_measure(const reg_t &qubits,
uint_t shots,
RngEngine &rng) {
std::vector<SampleVector> State<densmat_t>::sample_measure(const reg_t &qubits,
uint_t shots,
RngEngine &rng) {
// Generate flat register for storing
std::vector<double> rnds;
rnds.reserve(shots);
Expand All @@ -999,18 +999,26 @@ std::vector<reg_t> State<densmat_t>::sample_measure(const reg_t &qubits,

allbit_samples = BaseState::qreg_.sample_measure(rnds);

// Convert to reg_t format
std::vector<reg_t> all_samples;
all_samples.reserve(shots);
for (int_t val : allbit_samples) {
reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits());
reg_t sample;
sample.reserve(qubits.size());
for (uint_t qubit : qubits) {
sample.push_back(allbit_sample[qubit]);
// Convert to bit format
int_t npar = BaseState::threads_;
if (npar > shots)
npar = shots;
std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));

auto convert_to_bit_lambda = [this, &allbit_samples, &all_samples, shots,
qubits, npar](int_t i) {
uint_t ishot, iend;
ishot = shots * i / npar;
iend = shots * (i + 1) / npar;
for (; ishot < iend; ishot++) {
SampleVector allbit_sample;
allbit_sample.from_uint(allbit_samples[ishot], qubits.size());
all_samples[ishot].map(allbit_sample, qubits);
}
all_samples.push_back(sample);
}
};
Utils::apply_omp_parallel_for((npar > 1), 0, npar, convert_to_bit_lambda,
npar);

return all_samples;
}

Expand Down
Loading

0 comments on commit cb3a4ff

Please sign in to comment.