Optimizing QAOA with Bayesian Optimization (BO)#
Overview#
In this tutorial, we show how to optimize QAOA with Bayesian optimization (BO). For an introduction to QAOA, please refer to the previous tutorial. The Bayesian optimization used here is based on \(\text{ODBO}\); please refer to Cheng, Yang, Hsieh, Liao and Zhang for details, and to this repository for the source code and installation instructions. Below, the updated BO, TuRBO and DARBO modules from that package are wrapped into optimizer classes with a common update interface.
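As a refresher, every BO variant below follows the same loop: fit a probabilistic surrogate model to the observed (parameters, cost) pairs, maximize an acquisition function such as the upper confidence bound (UCB) to propose the next query point, evaluate the true objective there, and repeat. The following self-contained sketch is our own illustration with a plain NumPy Gaussian-process surrogate and a random candidate pool; it is not part of odbo, and names such as gp_posterior and bo_minimize are made up for this example. It only shows the skeleton that odbo implements in a far more sophisticated form.

[ ]:
import numpy as np

def rbf(a, b, ls=0.3):
    # squared-exponential kernel between two point sets of shapes (n, d) and (m, d)
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / ls**2)

def gp_posterior(X, Y, Xq, noise=1e-6):
    # GP posterior mean and variance at the query points Xq
    Kinv = np.linalg.inv(rbf(X, X) + noise * np.eye(len(X)))
    Ks = rbf(Xq, X)
    mu = Ks @ Kinv @ Y
    var = 1.0 - np.einsum("ij,jk,ik->i", Ks, Kinv, Ks)
    return mu, np.clip(var, 1e-12, None)

def bo_minimize(f, dim, n_init=4, n_iter=20, beta=2.0, seed=0):
    rng = np.random.default_rng(seed)
    X = rng.uniform(size=(n_init, dim))  # initial random designs in [0, 1]^dim
    Y = np.array([f(x) for x in X])
    for _ in range(n_iter):
        Xq = rng.uniform(size=(256, dim))  # random candidate pool
        mu, var = gp_posterior(X, (Y - Y.mean()) / (Y.std() + 1e-12), Xq)
        ucb = -mu + beta * np.sqrt(var)  # minimizing f <=> maximizing -f
        X = np.vstack([X, Xq[np.argmax(ucb)]])
        Y = np.append(Y, f(X[-1]))
    return X[np.argmin(Y)], Y.min()

# toy usage: minimize a shifted quadratic on [0, 1]^2
x_best, y_best = bo_minimize(lambda x: ((x - 0.3) ** 2).sum(), dim=2)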
Setup#
[ ]:
import tensorcircuit as tc
from jax import numpy as jnp
import optax
import torch
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import cotengra as ctg
from typing import Union
import time
import odbo
from IPython.display import clear_output
K = tc.set_backend("jax")
tc.set_dtype("complex128")
dtype = torch.float64
# cotengra package to speed up the calculation
opt_ctg = ctg.ReusableHyperOptimizer(
methods=["greedy", "kahypar"],
parallel=True,
minimize="combo",
max_time=20,
max_repeats=128,
progbar=True,
)
tc.set_contractor("custom", optimizer=opt_ctg, preprocessing=True)
nlayers = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
acqfn = "ucb"
MAX-CUT Hamiltonian#
Define the Graph#
[3]:
# a graph instance
graph_dict = {
0: {1: {"weight": 1.0}, 7: {"weight": 1.0}, 3: {"weight": 1.0}},
1: {0: {"weight": 1.0}, 2: {"weight": 1.0}, 3: {"weight": 1.0}},
2: {1: {"weight": 1.0}, 3: {"weight": 1.0}, 5: {"weight": 1.0}},
3: {1: {"weight": 1.0}, 2: {"weight": 1.0}, 0: {"weight": 1.0}},
4: {7: {"weight": 1.0}, 6: {"weight": 1.0}, 5: {"weight": 1.0}},
5: {6: {"weight": 1.0}, 4: {"weight": 1.0}, 2: {"weight": 1.0}},
6: {7: {"weight": 1.0}, 4: {"weight": 1.0}, 5: {"weight": 1.0}},
7: {4: {"weight": 1.0}, 6: {"weight": 1.0}, 0: {"weight": 1.0}},
}
graph = nx.to_networkx_graph(graph_dict)
pos = nx.spring_layout(graph)
nx.draw_networkx(graph, with_labels=True, pos=pos)
ax = plt.gca()
ax.set_facecolor("w")
Brute-Force Result#
[4]:
def classical_solver(graph):
num_nodes = len(graph)
max_cut = [0]
    best_case = [0]  # bit strings achieving the max cut
for i in range(2**num_nodes):
case = f"{bin(i)[2:]:0>{num_nodes}}"
cat1, cat2 = [], []
for j in range(num_nodes):
if str(case)[j] == "0":
cat1.append(j)
else:
cat2.append(j)
# calculate the cost function
cost = 0
for node1 in cat1:
for node2 in cat2:
if graph[node1].get(node2):
cost += graph[node1][node2]["weight"]
        cost = round(cost, 4)  # eliminate minor floating-point error
if max_cut[-1] <= cost:
max_cut.append(cost)
best_case.append(case)
    # there may be more than one optimal case, but they all sit at the end of the list
index = max_cut.index(max_cut[-1])
return max_cut[-1], best_case[index:]
max_cut, best_case = classical_solver(graph_dict)
print("bit string:", best_case, "\nmax cut:", max_cut)
colors = ["r" if best_case[0][i] == "0" else "c" for i in graph.nodes]
weighted_graph = nx.to_networkx_graph(graph_dict)
nx.draw_networkx(weighted_graph, with_labels=True, node_color=colors, pos=pos)
ax = plt.gca()
ax.set_facecolor("w")
bit string: ['01010101', '10101010']
max cut: 10.0
QAOA Ansatz#
[46]:
def QAOAansatz(params, each=1, return_circuit=False):
n = graph.number_of_nodes() # the number of nodes
# PQC loop
def pqc_loop(s_, params_):
c_ = tc.Circuit(n, inputs=s_)
for j in range(each):
# driving layer
for a, b in graph.edges:
c_.RZZ(a, b, theta=graph[a][b]["weight"] * params_[2 * j])
# mixing layer
for i in range(n):
c_.RX(i, theta=params_[2 * j + 1])
s_ = c_.state()
return s_
c0 = tc.Circuit(n)
for i in range(n):
c0.H(i)
s0 = c0.state()
s = K.scan(pqc_loop, K.reshape(params, [nlayers // each, 2 * each]), s0)
c = tc.Circuit(n, inputs=s)
# whether to return the circuit
if return_circuit is True:
return c
# calculate the loss function
loss = 0.0
for a, b in graph.edges:
loss += c.expectation_ps(z=[a, b]) * graph[a][b]["weight"]
return K.real(loss)
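The loss above is \(\sum_{(a,b)\in E} w_{ab}\langle Z_a Z_b\rangle\). Since a cut edge contributes \(w_{ab}(1 - \langle Z_a Z_b\rangle)/2\) to the cut, a loss value \(L\) translates into an expected cut of \((W - L)/2\), where \(W\) is the total edge weight. A small helper (our own, for illustration) makes the conversion explicit; for example, a loss of \(-7.0\) on this graph of total weight \(12\) corresponds to an expected cut of \(9.5\), to be compared with the brute-force optimum of \(10\).

[ ]:
def cut_from_loss(loss):
    # expected cut value implied by the QAOA loss \sum w_ab <Z_a Z_b>
    total_weight = sum(graph[a][b]["weight"] for a, b in graph.edges)
    return (total_weight - loss) / 2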
[47]:
QAOA_vag = K.jit(
tc.backend.value_and_grad(QAOAansatz, argnums=0), static_argnums=(1, 2)
)
QAOA_nograd = K.jit(QAOAansatz, static_argnums=(1, 2))
def eval_objective(x):
"""This is a helper function we use to unnormalize and evalaute a point"""
return -torch.from_numpy(np.asarray(QAOA_nograd(jnp.asarray(x.ravel()))))
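Since odbo (built on BoTorch) maximizes its objective, eval_objective returns the negated QAOA cost: a larger value means a lower cost. A quick check on a random point (our own illustrative snippet):

[ ]:
# larger eval_objective value <=> lower QAOA cost
x0 = torch.rand(2 * nlayers, dtype=dtype)
print(eval_objective(x0))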
Adam Optimizer#
[48]:
opt = K.optimizer(optax.adam(1e-2))
BO Optimizer#
[49]:
class BO_optimizer:
def __init__(self, eval_func, batch_size: int = 1, device="cpu"):
self.eval_func = eval_func
self.device = device
self.batch_size = batch_size
self.best_dict = {"X": None, "Y": torch.tensor(-float("inf"))}
def compute_Y(self, X):
return torch.tensor(
[self.eval_func(x) for x in X], dtype=X.dtype, device=self.device
).unsqueeze(-1)
def update_best(self, X_next, Y_next):
new_Y, new_idx = torch.max(Y_next, dim=0)
new_Y = new_Y.squeeze()
if new_Y > self.best_dict["Y"]:
self.best_dict["Y"] = new_Y
self.best_dict["X"] = X_next[new_idx]
def update(
self,
X,
Y=None,
acqfn: str = "ucb",
normalize: bool = False,
verbose: bool = False,
):
if Y is None:
Y = self.compute_Y(X)
X_next = odbo.run_exp.bo_design(
X=X,
Y=Y,
batch_size=self.batch_size,
acqfn=acqfn,
normalize=normalize,
verbose=verbose,
)[0].reshape(self.batch_size, X.shape[-1])
Y_next = self.compute_Y(X_next)
self.update_best(X_next, Y_next)
# Update training set
X = torch.cat((X, X_next), dim=0)
Y = torch.cat((Y, Y_next), dim=0)
return X, Y, Y_next.mean()
[50]:
batch_size = 1
bo_opt = BO_optimizer(eval_objective, batch_size, device)
TuRBO Optimizer#
[51]:
class TuRBO_optimizer(BO_optimizer):
def __init__(
self, eval_func, num_params, tr_length, failure_tolerance, device="cpu"
):
super(TuRBO_optimizer, self).__init__(eval_func, device=device)
        self.batch_size = 1  # due to a bug in odbo.run_exp.turbo_design, batch_size can only be 1 right now
self.tr_length = tr_length
self.state = odbo.turbo.TurboState(
dim=num_params,
            batch_size=self.batch_size,
length=tr_length,
n_trust_regions=len(tr_length),
failure_tolerance=failure_tolerance,
)
def inverse_transform(self, X):
"""
Note TuRBO is working on only [0,1] parameter range
We need to transform parameters from [0,1] to [-pi,pi] before using eval_func
"""
return X * 2 * np.pi - np.pi
def transform(self, X):
"""
Note TuRBO is working on only [0,1] parameter range
We need to transform parameters from [-pi,pi] to [0,1] before using odbo.run_exp.turbo_design
"""
return X / 2 / np.pi + 0.5
def compute_Y(self, X, transformed_input: bool = True):
if transformed_input:
X = self.inverse_transform(X)
return torch.tensor(
[self.eval_func(x) for x in X], dtype=X.dtype, device=self.device
).unsqueeze(-1)
def get_next(self, X, Y, acqfn, normalize, verbose):
X_next = odbo.run_exp.turbo_design(
state=self.state,
X=X,
Y=Y,
n_trust_regions=len(self.tr_length),
batch_size=self.batch_size,
acqfn=acqfn,
normalize=normalize,
verbose=verbose,
)[0].reshape(len(self.tr_length) * self.batch_size, X.shape[-1])
Y_next = self.compute_Y(X_next)
return X_next, Y_next
def update_state(self, Y_next):
self.state = odbo.turbo.update_state(
state=self.state,
Y_next=Y_next.reshape(len(self.tr_length), self.batch_size, 1),
)
def preprocess(self, X, Y, transformed_input):
if not transformed_input:
X = self.transform(X)
if Y is None:
Y = self.compute_Y(X)
best_Y, best_idx = torch.max(Y, dim=0)
best_Y = best_Y.squeeze()
if best_Y > self.best_dict["Y"]:
self.state.best_value = best_Y.item()
self.best_dict["Y"] = best_Y
self.best_dict["X"] = X[best_idx]
return X, Y
def postprocess(self, X, Y, X_next, Y_next, transformed_output):
X = torch.cat((X, X_next), dim=0)
Y = torch.cat((Y, Y_next), dim=0)
if not transformed_output:
X = self.inverse_transform(X)
return X, Y
def update(
self,
X,
Y=None,
acqfn: str = "ucb",
normalize: bool = False,
verbose: bool = False,
transformed_input: bool = False,
transformed_output: bool = False,
):
X, Y = self.preprocess(X, Y, transformed_input)
X_next, Y_next = self.get_next(X, Y, acqfn, normalize, verbose)
self.update_best(X_next, Y_next)
self.update_state(Y_next)
X, Y = self.postprocess(X, Y, X_next, Y_next, transformed_output)
return X, Y, Y_next.mean()
[52]:
failure_tolerance = 10
tr_length = [1.6]
turbo_opt = TuRBO_optimizer(
eval_objective, 2 * nlayers, tr_length, failure_tolerance, device
)
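TuRBO searches the unit hypercube, so transform and inverse_transform map between the internal \([0, 1]\) coordinates and the physical parameter range \([-\pi, \pi]\). A round-trip sanity check (our own illustrative snippet):

[ ]:
# transform should exactly invert inverse_transform
x = torch.rand(2 * nlayers, dtype=dtype)
assert torch.allclose(turbo_opt.transform(turbo_opt.inverse_transform(x)), x)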
DARBO Optimizer#
Please refer to Cheng, Chen, Zhang and Zhang for details.
[53]:
class DARBO_optimizer(TuRBO_optimizer):
def __init__(
self,
eval_func,
num_params,
tr_length,
failure_tolerance,
mode: Union[bool, str] = True,
device="cpu",
):
super(DARBO_optimizer, self).__init__(
eval_func, num_params, tr_length, failure_tolerance, device
)
self.switch_counter = 0
        self.mode = mode in (True, "large")
self.best_dict = {
"X": None,
"Y": torch.tensor(self.state.best_value),
"mode": self.mode,
}
def get_mode(self):
return "large" if self.mode else "small"
def compute_Y(self, X, transformed_input: bool = True, mode=None):
if transformed_input:
X = self.inverse_transform(X, mode)
return torch.tensor(
[self.eval_func(x) for x in X], dtype=X.dtype, device=self.device
).unsqueeze(-1)
def inverse_transform(self, X, mode=None):
"""
Note TuRBO is working on only [0,1] parameter range
We need to transform parameters from [0, 1] to [-pi, pi] or [-pi / 2, pi / 2] before using eval_func
"""
if mode is None:
mode = self.mode
if mode: # [0, 1] to [-pi, pi]
return X * 2 * np.pi - np.pi
else: # [0, 1] to [-pi / 2, pi / 2]
return X * np.pi - np.pi / 2
def transform(self, X, mode=None):
"""
Note TuRBO is working on only [0,1] parameter range
We need to transform parameters from [-pi,pi] or [-pi / 2, pi / 2] to [0,1] before using odbo.run_exp.turbo_design
"""
if mode is None:
mode = self.mode
if mode: # [-pi, pi] to [0, 1]
return X / 2 / np.pi + 0.5
else: # [-pi / 2, pi / 2] to [0, 1]
return X / np.pi + 0.5
def update_best(self, X_next, Y_next):
new_Y, new_idx = torch.max(Y_next, dim=0)
new_Y = new_Y.squeeze()
if new_Y > self.best_dict["Y"]:
self.best_dict["Y"] = new_Y
self.best_dict["X"] = X_next[new_idx]
self.best_dict["mode"] = self.mode
else:
self.switch_counter += 1
def update(
self,
X,
Y=None,
acqfn: str = "ucb",
normalize: bool = False,
verbose: bool = False,
transformed_input: bool = False,
transformed_output: bool = False,
frequency: int = 4,
):
X, Y = self.preprocess(X, Y, transformed_input)
        # check whether we need to switch the search parameter range
if self.switch_counter >= frequency:
if self.mode:
X = X * 2 - 0.5
self.mode = False # small
else:
X = X / 2 + 0.25
self.mode = True # large
self.switch_counter = 0
X_next, Y_next = self.get_next(X, Y, acqfn, normalize, verbose)
self.update_best(X_next, Y_next)
self.update_state(Y_next)
X, Y = self.postprocess(X, Y, X_next, Y_next, transformed_output)
return X, Y, Y_next.mean()
[54]:
mode = "small"
darbo_opt = DARBO_optimizer(
eval_objective, 2 * nlayers, tr_length, failure_tolerance, mode, device
)
print(f"initial mode: {darbo_opt.get_mode()}")
initial mode: small
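DARBO keeps the same internal \([0, 1]\) coordinates but adaptively switches the physical range they represent: \([-\pi, \pi]\) in "large" mode and \([-\pi/2, \pi/2]\) in "small" mode, switching once frequency non-improving proposals have accumulated since the last switch. The two mappings can be inspected directly (our own illustrative snippet):

[ ]:
ends = torch.tensor([0.0, 0.5, 1.0], dtype=dtype)
print(darbo_opt.inverse_transform(ends, mode=True))  # large mode: -pi, 0, pi
print(darbo_opt.inverse_transform(ends, mode=False))  # small mode: -pi/2, 0, pi/2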
Optimization#
[55]:
# initial parameters
params = K.implicit_randu(shape=(2 * nlayers,))
initial_X = torch.from_numpy(np.asarray(params)).type(dtype)
# The first point proposed by BO is effectively a random selection; to cover the search space better, we also include the point most distant from the initial one
X_new = []
for i in initial_X:
if i <= 0.5:
X_new.append(i + 0.5)
else:
X_new.append(i - 0.5)
X_new = torch.tensor(X_new)
X_bo = torch.stack((initial_X, X_new), dim=0)
Y_bo = None
X_turbo = X_bo.clone()
Y_turbo = None
X_darbo = X_bo.clone()
Y_darbo = None
losses, losses_bo, losses_turbo, losses_darbo = [], [], [], []
t0 = ts = time.time()
for i in range(300):
loss, grads = QAOA_vag(params)
params = opt.update(grads, params) # gradient descent
losses.append(loss)
X_bo, Y_bo, _ = bo_opt.update(X_bo, Y_bo, acqfn)
losses_bo.append(-bo_opt.best_dict["Y"].item())
X_turbo, Y_turbo, _ = turbo_opt.update(
X_turbo,
Y_turbo,
acqfn,
        transformed_input=(i != 0),
transformed_output=True,
)
losses_turbo.append(-turbo_opt.best_dict["Y"].item())
X_darbo, Y_darbo, _ = darbo_opt.update(
X_darbo,
Y_darbo,
acqfn,
        transformed_input=(i != 0),
transformed_output=True,
)
losses_darbo.append(-darbo_opt.best_dict["Y"].item())
# visualise the progress
clear_output(wait=True)
plt.figure()
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.plot(range(i + 1), losses, c="r", label="Adam")
plt.plot(range(i + 1), losses_bo, c="b", label="BO")
plt.plot(range(i + 1), losses_turbo, c="g", label="TuRBO")
plt.plot(range(i + 1), losses_darbo, c="y", label="DARBO")
plt.legend()
plt.show()
print(
f"Epoch {i} min loss: {min(losses)}\t{losses_bo[-1]}\t{losses_turbo[-1]}\t{losses_darbo[-1]}"
)
te = time.time()
print(f"Epoch {i} time: {te - ts}\tTotal time: {te - t0}")
ts = te
Epoch 299 min loss: -5.837215998677346 -6.87628698348999 -6.8708058821631575 -7.000515157123274
Epoch 299 time: 13.83302903175354 Total time: 2123.8649847507477
Results#
After feeding the optimized parameters back into the ansatz circuit, we can perform projective measurements on the output quantum state to obtain the solution. Here we directly take the bit string with the maximum probability as the solution, since in simulation we have access to the full probability distribution of the output state; this shortcut is not feasible in a real experiment.
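In a real experiment, the distribution would instead be estimated from a finite number of measurement shots. The following sketch (our own; sample_bitstrings is a made-up helper) emulates that by sampling bit strings from the exact simulated probabilities:

[ ]:
def sample_bitstrings(params, shots=1024, seed=42):
    # emulate finite-shot projective measurement in the computational basis
    c = QAOAansatz(params, return_circuit=True)
    probs = np.real(K.numpy(c.probability()))
    probs = probs / probs.sum()  # guard against numerical drift
    rng = np.random.default_rng(seed)
    counts = np.bincount(
        rng.choice(len(probs), size=shots, p=probs), minlength=len(probs)
    )
    best = int(np.argmax(counts))
    return f"{bin(best)[2:]:0>{graph.number_of_nodes()}}", counts[best] / shots

# e.g. sample_bitstrings(params) returns the most frequent bit string and its frequency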
[56]:
params_bo = jnp.asarray(bo_opt.best_dict["X"])
params_turbo = jnp.asarray(turbo_opt.inverse_transform(turbo_opt.best_dict["X"]))
params_darbo = jnp.asarray(
darbo_opt.inverse_transform(
darbo_opt.best_dict["X"], mode=darbo_opt.best_dict["mode"]
)
)
# find the states with max probabilities
def find_max(params):
loss = QAOA_nograd(params)
c = QAOAansatz(params, return_circuit=True)
probs = K.numpy(c.probability())
max_prob = max(probs)
index = np.where(probs == max_prob)[0]
states = []
for i in index:
states.append(f"{bin(i)[2:]:0>{graph.number_of_nodes()}}")
return loss, max_prob, states
loss, prob, states = find_max(params)
loss_bo, prob_bo, states_bo = find_max(params_bo)
loss_turbo, prob_turbo, states_turbo = find_max(params_turbo)
loss_darbo, prob_darbo, states_darbo = find_max(params_darbo)
print(f"Adam\nloss: {loss}\tprob: {prob}\tbit strings: {states}\n")
print(f"BO\nloss: {loss_bo}\tprob: {prob_bo}\tbit strings: {states_bo}\n")
print(f"TuRBO\nloss: {loss_turbo}\tprob: {prob_turbo}\tbit strings: {states_turbo}\n")
print(f"DARBO\nloss: {loss_darbo}\tprob: {prob_darbo}\tbit strings: {states_darbo}")
Adam
loss: -5.837558827215271 prob: 0.318607790021735 bit strings: ['10101010']
BO
loss: -6.876287191047742 prob: 0.3434109359024678 bit strings: ['10101010']
TuRBO
loss: -6.8708058821631575 prob: 0.3339171040448526 bit strings: ['10101010']
DARBO
loss: -7.000515157123274 prob: 0.36778546095898973 bit strings: ['01010101']
[57]:
tc.about()
OS info: Linux-5.4.119-1-tlinux4-0010.2-x86_64-with-glibc2.28
Python version: 3.10.11
Numpy version: 1.23.5
Scipy version: 1.11.0
Pandas version: 2.0.2
TensorNetwork version: 0.4.6
Cotengra version: 0.2.1.dev15+g120379e
TensorFlow version: 2.12.0
TensorFlow GPU: []
TensorFlow CUDA infos: {'cpu_compiler': '/dt9/usr/bin/gcc', 'cuda_compute_capabilities': ['sm_35', 'sm_50', 'sm_60', 'sm_70', 'sm_75', 'compute_80'], 'cuda_version': '11.8', 'cudnn_version': '8', 'is_cuda_build': True, 'is_rocm_build': False, 'is_tensorrt_build': True}
Jax version: 0.4.13
Jax installation doesn't support GPU
JaxLib version: 0.4.13
PyTorch version: 2.0.1
PyTorch GPU support: False
PyTorch GPUs: []
Cupy is not installed
Qiskit version: 0.24.1
Cirq version: 1.1.0
TensorCircuit version 0.10.0