Advertisement
Guest User

Purchased

a guest
Oct 22nd, 2019
255
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Latex 317.77 KB | None | 0 0
  1. \documentclass[nonblindrev]{informs3}
  2. %\documentclass[nonblindrev]msom,
  3. %\documentclass[mnsc,blindrev]{informs3}
  4. %\documentclass[opre,nonblindrev]{informs3} % current default for manuscript submission
  5. %\documentclass[nonblindrev]{informs3}
  6.  
  7.  
  8. \OneAndAHalfSpacedXI
  9. %%\OneAndAHalfSpacedXII % Current default line spacing
  10. %%\DoubleSpacedXII
  11. %%\DoubleSpacedXI
  12.  
  13. % If hyperref is used, dvi-to-ps driver of choice must be declared as
  14. %   an additional option to the \documentclass. For example
  15. %\documentclass[dvips,mnsc]{informs3}      % if dvips is used
  16. %\documentclass[dvipsone,mnsc]{informs3}   % if dvipsone is used, etc.
  17.  
  18. % Private macros here (check that there is no clash with the style)
  19.  
  20.  
  21.  
  22.  
  23.  
  24.  
  25. \usepackage{boldline}
  26. \usepackage[table]{xcolor}
  27. \usepackage{tabularx,booktabs}
  28. \usepackage{array}
  29. \usepackage{endnotes}
  30. \usepackage{amsmath}
  31. \usepackage{longtable}
  32. \usepackage{adjustbox}
  33.  
  34. \usepackage{multirow}
  35. \usepackage{framed}
  36. \usepackage{makecell}
  37. \usepackage{threeparttable}
  38. \usepackage{comment}
  39. \usepackage{bm}
  40. \usepackage{subfloat}
  41. %\usepackage[flushleft]{threeparttable}
  42. \usepackage{subcaption}
  43. \usepackage{tabularx}
  44. \usepackage{float}
  45. \usepackage{rotating}
  46. \usepackage{caption}
  47.  
  48.  
  49.  
% --- Endnote setup: route all \footnote text to an endnotes section ---
  50. \let\footnote=\endnote
  51. \let\enotesize=\normalsize
  52. \def\notesname{Endnotes}%
  53. \def\makeenmark{$^{\theenmark}$}
% Endnote list layout: the endnote number is set into the left margin via \llap.
  54. \def\enoteformat{\rightskip0pt\leftskip0pt\parindent=1.75em
  55.  \leavevmode\llap{\theenmark.\enskip}}
% --- Bold-math shorthands: \mb{x} bold x; \mbt{x} bold tilde-x; \mbst{x} scriptsize bold tilde-x ---
  56. \newcommand{\mb}[1]{\mbox{\boldmath $#1$}}
  57. \newcommand{\mbt}[1]{\mbox{\boldmath $\tilde{#1}$}}
  58. \newcommand{\mbst}[1]{{\mbox{\boldmath \scriptsize{$\tilde{#1}$}}}}
% NOTE(review): \E is redefined further down in this preamble via \def \E{\mathbb{E}};
% that later definition silently wins, so this roman-E version is never in effect.
  59. \newcommand {\E}{{\rm E}}
  60. %\newcommand{\proof}{{\bf Proof : }}
  61. %\newcommand{\qed}{\rule{7pt}{7pt}}
  62. %\newcommand{\remark}{{\bf Remark : }}
% \defi: "equal by definition" symbol (Delta stacked over =).
  63. \newcommand{\defi}{\stackrel{\Delta}{=}}
  64. %\newtheorem{theorem}{Theorem}
  65. %\newtheorem{lemma}{Lemma}
  66. %\newtheorem{prop}{Proposition}
  67. %\newtheorem{coro}{Corollary}
  68. %\newtheorem{claim}{Claim}
  69. \newtheorem{Defi}{Definition}
  70. %\usepackage{ntheorem}
  71. %\theoremstyple{break}
% NOTE(review): "break" is an ntheorem-style name, but ntheorem is commented out
% above -- confirm the informs3 class defines this \theoremstyle.
  72. \theoremstyle{break}
  73. \newtheorem{algorithm}{Algorithm}
  74.  
% \blot: open-box (end-of-proof) symbol drawn from horizontal/vertical rules.
  75. \def\blot{\quad \mbox{$\vcenter{ \vbox{ \hrule height.4pt
  76.      \hbox{\vrule width.4pt height.9ex \kern.9ex \vrule width.4pt}
  77.           \hrule height.4pt}}$}}
  78. % Natbib setup for author-year style
  79. \usepackage{natbib}
% \bibpunct: author-year citations rendered as (Author 2019a, Author 2019b).
  80. \bibpunct[, ]{(}{)}{,}{a}{}{,}%
  81.  \def\bibfont{\small}%
  82.  \def\bibsep{\smallskipamount}%
  83.  \def\bibhang{24pt}%
  84.  \def\newblock{\ }%
  85.  \def\BIBand{and}%
  86.  
  87. %\usepackage{threeparttable}
  88.  
  89. %% Setup of theorem styles. Outcomment only one.
  90. %% Preferred default is the first option.
  91. \TheoremsNumberedThrough     % Preferred (Theorem 1, Lemma 1, Theorem 2)
  92. %\TheoremsNumberedByChapter  % (Theorem 1.1, Lema 1.1, Theorem 1.2)
% \ECRepeatTheorems: informs3 class option -- presumably restates theorems in
% the e-companion under their original numbers; see the informs3 author guide.
  93. \ECRepeatTheorems
  94.  
  95. %% Setup of the equation numbering system. Outcomment only one.
  96. %% Preferred default is the first option.
  97. \EquationsNumberedThrough    % Default: (1), (2), ...
  98. %\EquationsNumberedBySection % (1.1), (1.2), ...
  99.  
  100. % For new submissions, leave this number blank.
  101. % For revisions, input the manuscript number assigned by the on-line
  102. % system along with a suffix ".Rx" where x is the revision number.
  103. \MANUSCRIPTNO{}
  104. %(Please, provide the manuscript number!)
  105.  
  106.  
% --- Bold lowercase letter shorthands ---
% CAUTION: \def silently overwrites; several names below are (re)defined two or
% three times -- the LAST definition in source order is the one in effect.
% Also note \b, \c, \d, \t, \u, \v clobber standard TeX text accents.
  107. \def \a{{\mbox{\boldmath $a$}}}
  108. \def \b{{\mbox{\boldmath $b$}}}
  109. \def \c{{\mbox{\boldmath $c$}}}
  110. \def \d{{\mbox{\boldmath $d$}}}
  111. \def \e{{\mbox{\boldmath $e$}}}
  112. \def \f{{\mbox{\boldmath $f$}}}
  113. \def \g{{\mbox{\boldmath $\mathbf{g}$}}}
  114. \def \h{{\mbox{\boldmath $h$}}}
  115. \def \q{{\mbox{\boldmath $q$}}}
  116. \def \p{{\mbox{\boldmath $p$}}}
  117. \def \u{{\mbox{\boldmath $u$}}}
  118. \def \v{{\mbox{\boldmath $v$}}}
  119. \def \w{{\mbox{\boldmath $w$}}}
  120. \def \r{{\mbox{\boldmath $r$}}}
  121. \def \s{{\mbox{\boldmath ${s}$}}}
% The block below re-defines many of the same letters with plain \mathbf;
% these versions supersede the \boldmath versions above.
  122. \def \x{\mathbf{x}}
  123. \def \b{\mathbf{b}}
  124. \def \y{\mathbf{y}}
  125. \def \z{\mathbf{z}}
  126. \def \q{\mathbf{q}}
  127. \def \p{\mathbf{p}}
  128. \def \g{\mathbf{g}}
  129. \def \s{\mathbf{s}}
  130. \def \t{\mathbf{t}}
  131. \def \d{\mathbf{d}}
  132. \def \u{\mathbf{u}}
  133. \def \v{\mathbf{v}}
  134. \def \h{\mathbf{h}}
  135. \def \r{\mathbf{r}}
  136. %\def \s{{\mbox{\boldmath $\mathbf{s}$}}}
% NOTE(review): \t, \y and \z are redefined yet again below, so their final
% meanings are the \mbox{\boldmath ...} forms, not the \mathbf forms above.
  137. \def \t{{\mbox{\boldmath $t$}}}
  138. %\def \mathbf{x}{{\mbox{\boldmath $\mathbf{x}$}}}
% \lx, \ly: footnotesize bold x / y. \ly expands \y at use time, so it picks
% up the FINAL definition of \y (the one three lines below).
  139. \def \lx{{\mbox{\footnotesize $\mathbf{x}$}}}
  140. \def \ly{{\mbox{\footnotesize $\y$}}}
  141. \def \y{{\mbox{\boldmath $\mathbf{y}$}}}
  142. %\def \z{{\mbox{\boldmath $\tilde{z}$}}}
  143. \def \z{{\mbox{\boldmath $\mathbf{z}$}}}
% Tilde ("random variable") variants: \rz, \rs, \rw bold; \rrw plain italic.
  144. \def \rz{{\mbox{\boldmath $\tilde{\mathbf{z}}$}}}
  145. \def \rs{{\mbox{\boldmath $\tilde{{s}}$}}}
  146. %\def \rs{{\mbox{\boldmath $\tilde{\mathbf{s}}$}}}
  147. \def \rw{{\mbox{\boldmath $\tilde{\mathbf{w}}$}}}
  148. \def \rrw{{\mbox{$\tilde{w}$}}}
  149.  
  150.  
  151.  
  152.  
  153.  
% --- Bold Greek letter shorthands ("h" prefix = \boldmath version) ---
  154. \def \hxi{{\mbox{\boldmath $\xi$}}}
  155. \def \hnu{{\mbox{\boldmath $\nu$}}}
  156. \def \hpi{{\mbox{\boldmath $\pi$}}}
  157. \def \hiota{{\mbox{\boldmath $\iota$}}}
  158. \def \hpsi{{\mbox{\boldmath $\psi$}}}
  159. \def \hvarpi{{\mbox{\boldmath $\varpi$}}}
  160. \def \hmu{{\mbox{\boldmath $\mu$}}}
  161. \def \hchi{{\mbox{\boldmath $\chi$}}}
  162. \def \hlambda{{\mbox{\boldmath $\lambda$}}}
  163. \def \halpha{{\mbox{\boldmath $\alpha$}}}
  164. \def \hbeta{{\mbox{\boldmath $\beta$}}}
  165. \def \hrho{{\mbox{\boldmath $\rho$}}}
  166. \def \hvarrho{{\mbox{\boldmath $\varrho$}}}
  167. \def \hkappa{{\mbox{\boldmath $\kappa$}}}
% NOTE(review): \bxi expands to a footnotesize bold "z", not "\xi" as its name
% suggests -- looks like a copy-paste slip; confirm the intended glyph before
% changing, since existing \bxi usages would change appearance.
  168. \def \bxi{{\mbox{\boldmath {\footnotesize $z$}}}}
  169. \def \heta{{\mbox{\boldmath $\eta$}}}
  170. \def \hzeta{{\mbox{\boldmath $\zeta$}}}
  171. \def \htheta{{\mbox{\boldmath $\theta$}}}
  172. \def \hvartheta{{\mbox{\boldmath $\vartheta$}}}
  173. \def \hvarphi{{\mbox{\boldmath $\varphi$}}}
% \hdelta is defined twice (identically); \def silently overwrites, so the
% duplication is harmless.
  174. \def \hdelta{{\mbox{\boldmath $\delta$}}}
  175. \def \hdelta{{\mbox{\boldmath $\delta$}}}
  176. \def \htau{{\mbox{\boldmath $\tau$}}}
  177. \def \hphi{{\mbox{\boldmath $\phi$}}}
  178. \def \hepsilon{{\mbox{\boldmath $\epsilon$}}}
  179.  
  180.  
  181. \def \homega{{\mbox{\boldmath $\omega$}}}
  182. \def \hgamma{{\mbox{\boldmath $\gamma$}}}
  183. \def \hsigma{{\mbox{\boldmath $\sigma$}}}
  184. \def \hGamma{{\mbox{\boldmath $\Gamma$}}}
  185.  
% The block below repeats, byte-for-byte, several definitions already made
% above (\hpi, \hkappa, \htheta, \hvartheta, \hvarphi, \hphi, \hvarpi);
% redundant but harmless.
  186. \def \hpi{{\mbox{\boldmath $\pi$}}}
  187. \def \hkappa{{\mbox{\boldmath $\kappa$}}}
  188. \def \htheta{{\mbox{\boldmath $\theta$}}}
  189. \def \hvartheta{{\mbox{\boldmath $\vartheta$}}}
  190. \def \hvarphi{{\mbox{\boldmath $\varphi$}}}
  191. \def \hphi{{\mbox{\boldmath $\phi$}}}
  192. \def \hvarsigma{{\mbox{\boldmath $\varsigma$}}}
  193. \def \hvarpi{{\mbox{\boldmath $\varpi$}}}
  194. \def \hUpsilon{{\mbox{\boldmath $\Upsilon$}}}
  195.  
% --- Bold / calligraphic / blackboard uppercase shorthands ---
% CAUTION: several standard LaTeX commands are clobbered here (\H umlaut
% accent, \S section sign, \P pilcrow, \L "L-stroke", \O "O-slash", \T, \E);
% those text-mode symbols are unavailable in this document from here on.
  196. \def \D{{\mbox{\boldmath $D$}}}
  197.  
  198. \def \bB{{\mbox{\boldmath $B$}}}
  199. \def \C{{\mbox{\boldmath $C$}}}
  200. \def \H{{\mbox{\boldmath $\mathbf{H}$}}}
  201. \def \A{{\mbox{\boldmath $A$}}}
% NOTE(review): \T is defined twice (identically, again further below).
  202. \def \T{{\mbox{\boldmath $T$}}}
% NOTE(review): \X expands to a bold "Y" -- identical to \Y on the next line.
% Almost certainly a copy-paste typo for "$X$"; confirm before fixing, since
% any existing \X usages would change appearance.
  203. \def \X{{\mbox{\boldmath $Y$}}}
  204. \def \Y{{\mbox{\boldmath $Y$}}}
  205. \def \bM{{\mbox{\boldmath $\mathbf{M}$}}}
  206. \def \bF{{\mbox{\boldmath $F$}}}
  207. \def \hL{{\mbox{\boldmath $\mathcal{L}$}}}
  208. \def \bV{{\mbox{$\boldsymbol{V}$}}}
  209. \def \bU{{\mbox{$\boldsymbol{U}$}}}
  210. \def \hT{{\mbox{\boldmath $\mathcal{T}$}}}
  211. \def \U{{\mbox{\boldmath $\mathcal{U}$}}}
  212. %\def \F{{\mbox{\boldmath $\mathcal{F}$}}}
  213. \def \hP{{\mbox{$\mathcal{P}$}}}
  214. \def \bI{{\mbox{\boldmath $I$}}}
  215. \def \T{{\mbox{\boldmath $T$}}}
  216. \def \W{{\mbox{\boldmath $W$}}}
  217. \def \bK{{\mbox{\boldmath $K$}}}
  218. \def \bQ{{\mbox{\boldmath $Q$}}}
  219. \def \bG{{\mbox{\boldmath $\mathbf{G}$}}}
  220. \def \bE{{\mbox{\boldmath $E$}}}
  221. \def \L{{\mbox{\boldmath $\mathcal{L}$}}}
  222. %\def \S{{\mbox{\boldmath $\mathbf{S}$}}}
  223. \def \S{{\mbox{\boldmath ${S}$}}}
  224. %\def \rS{{\mbox{\boldmath $\tilde{\mathbf{S}}$}}}
  225. \def \rS{{\mbox{\boldmath $\tilde{{S}}$}}}
  226. \def \I{{\mbox{\boldmath $\mathcal{I}$}}}
% Probability/measure notation: \P, \E, \Q blackboard bold; \F, \G calligraphic.
% (This \E overrides the \newcommand{\E}{{\rm E}} made earlier in the preamble.)
  227. \def \P{\mathbb{P}}
  228. \def \E{\mathbb{E}}
  229. \def \F{\mathcal{F}}
  230. \def \Q{\mathbb{Q}}
  231. \def \G{\mathcal{G}}
  232. \def \J{{\mbox{\boldmath $\mathcal{J}$}}}
  233. \def \B{{\mbox{\boldmath $\mathcal{B}$}}}
% \K is defined twice with DIFFERENT bodies; the second (non-\boldmath)
% definition is the one in effect.
  234. \def \K{{\mbox{\boldmath $\mathcal{K}$}}}
  235. \def \K{{\mbox{$\mathcal{K}$}}}
  236. \def \O{{\mbox{\boldmath $\mathcal{O}$}}}
  237. \def \M{{\mbox{$\mathcal{M}$}}}
  238. \def \hW{{\mbox{\boldmath $\mathcal{W}$}}}
  239. \def \N{{\mbox{\boldmath $\mathcal{N}$}}}
  240.  
  241.  
  242. %-----------------------------------------------------------------------------
  243. % To set spacing:
% \spacingset{x}: set \baselinestretch to x; the \small\normalsize round trip
% forces the new stretch to take effect immediately.
  244. \def\spacingset#1{\renewcommand{\baselinestretch}%
  245.  {#1}\small\normalsize}
% Named presets: 1.70 (document default), 1.0 (single), 1.25 (tight).
  246. \newcommand{\resetspacing}{\spacingset{1.70}}
  247. \newcommand{\unitspacing}{\spacingset{1.0}}
  248. \newcommand{\tightspacing}{\spacingset{1.25}}
  249. %-----------------------------------------------------------------------------
  250.  
  251.  
  252. \begin{document}
  253. \RUNAUTHOR{ }
  254.  
  255. \RUNTITLE{Redundancy Optimization with Side Information}
  256.  
  257. \TITLE{Redundancy Optimization with Side Information: State-Dependent Distributionally Robust Models}
  258. %\author{Shuming Wang}
  259. % use optional labels to link authors explicitly to addresses:
  260.  
  261. \ARTICLEAUTHORS{%
  262. }
  263.  
  264.  
  265. %\author{}
  266. %\affil[1]{School of Economics and Management, University of Chinese Academy of Sciences, China}
  267. %\affil[2]{University of Chinese Academy of Sciences, China}
  268. %% \address[label2]{}
  269. \ABSTRACT{{\color{red}
  270. In this paper, we present a state-dependent, distributionally robust framework for solving a series-parallel, multi-type-component, mixed-redundancy-strategy system with uncertain component lifetimes. We assume that the distribution of component lifetimes is not exactly known; instead, only partial distributional information (such as mean and dispersion) can be extracted from the data set. We aim to produce a system design that is sufficiently reliable even under the worst possible distribution consistent with the partial information extracted. Moreover, we introduce a clustering process for extracting information, which also includes techniques of dimension reduction and side information. We extract distributional information from each cluster, instead of from the entire data set. This helps us incorporate more information into the ambiguity set, which results in a smaller ambiguity set and therefore a better result. Although the model itself is highly nonlinear, we utilize a linearization technique to transform the model into a mixed-integer linear program, without adding integer decision variables other than the original ones. This makes solving the problem tractable, and easy to implement with off-the-shelf libraries. Finally, we present a computational study to support our theoretical results.}
  271. }
  272. \KEYWORDS {Redundancy optimization, lifetime uncertainty, mixed redundancy strategies, robust optimization, linear programming, clustering}
  273.  
  274.  
  275. \maketitle
  276. %\end{frontmatter}
  277.  
  278.  
  279. \section{Introduction}
  280. {\color{blue}
  281. Redundancy optimization or redundancy allocation, roughly speaking, is to determine the most cost-efficient allocations for redundancies while keeping system reliability above the predefined level (Kuo and Wan~2007). It is well-known in multiple engineering domains, {\it e.g.}, railway engineering, nuclear engineering and aerospace engineering, to name a few. In a redundancy system, the cold-standby and the active-parallel are two typical and commonly used strategies in redundancy configuration ({\color{red} Ardakan~et~al.~2014}), and lifetime uncertainty of components has always been a crucial modeling issue for computing the system reliability and optimizing redundancy allocations, under different redundancy strategies.
  282.  
  283.  
  284.  
  285.  
  286. As a famous optimization problem under uncertainty, redundancy optimization has been extensively studied in the past decades. Different approaches have been proposed to deal with the modeling difficulty on the lifetime uncertainty and the resulting redundancy optimization problems. For instance, the optimization models with deterministic reliability levels of components ({\it e.g.}, Coit~et~al.~1996, Kulturel-Konak~et~al.~2003, Ardakan~et~al.~2014) and with uncertain component reliability levels ({\it e.g.}, Feizollahi and Modarres~2012, Feizollahi~et~al.~2014, Feizollahi~et~al.~2015); the stochastic programming approaches with known lifetime distributions of components ({\it e.g.}, Coit and Smith~1998, Prasad et al.~2001, Zhao and Liu~2003); and most recently, the distributionally robust optimization (DRO) approach with uncertain lifetime distributions (Wang~et~al.~2019). A detailed literature review is provided in Section~\ref{sec:LR}. Several difficulties have been resolved, yet new challenges are constantly emerging.
  287.  
  288.  
  289. On the one hand, all the above mentioned approaches leverage solely on utilizing the lifetime information, which might not be sufficient for the increasingly complex systems nowadays. Technology advancements constantly drive the engineering systems to evolve toward ever higher complexities with larger number of components. For instance, the recent autonomous cars involve multiple sensors, control units, communication devices, safety barriers, artificial intelligence (AI) components, etc., which render them much more complex than the conventional electromechanical style of cars. Another example is the high-speed train, where the new technologies, such as automatic train operation (ATO) system, 5G telecommunication devices and auxiliary braking systems, are being integrated into the train system for more efficient operations. New technologies bring great upgrades to the existing system, which however also create new sources of failures. The complexity of the system could largely complicate the uncertainty in the occurrence of the time-to-failures or lifetimes, and there is a call for more related useful side information (in addition to lifetime data), {\it e.g.}, working condition (frequency and intensity), seasonal information (temperature and weather), quality information of the components, to be incorporated for estimating more effectively the component lifetime characteristics for redundancy optimization.
  290.  
  291.  
  292. On the other hand, the above mentioned side information is becoming more and more available. Thanks to
the pervasive sensing capabilities of the new generation of industrial IoT systems, it is often possible to have the sensors record multiple types of data (even real-time data streams) on the working environment of components.
For instance, there are thousands of sensors installed in one electric multiple unit (EMU) train, such as temperature sensors, humidity sensors, accelerometers, {\it etc}. The sensors transmit their readings at a very high frequency, {\it e.g.}, 100~Hz, to the onboard database system in the train, and the train will upload all stored data to the station once it completes the daily operation. Among all these data collected, some can be highly relevant to particular failure events. For example, the vibration signal has a strong correlation to the degradation condition or the abnormal condition of a running wheel of a train. Therefore,
  295. such side information can improve the description to the failures as well as to the lifetime distributions, and should be incorporated into the model together with the lifetime data itself to enhance the decision quality of the redundancy allocation under lifetime uncertainty.
  296.  
  297.  
  298. In this paper, we consider a redundancy optimization problem of a series-parallel system with  mixed redundancy strategy of active-parallels and cold-standbys, where the distributions of component lifetimes are assumed to be uncertain. We develop a new optimization framework, based on distributionally robust optimization with conditional states, that is able to effectively incorporate the side information on component lifetimes, via the advanced machine learning techniques, {\it e.g.}, clustering, regression trees and PCA, to realize a more effective decision making of redundancy allocation under distributional uncertainty of lifetimes. Among all the above mentioned related studies, our paper is most relevant to Wang~et~al.~(2019), yet with key extensions in
  299. several directions. We briefly summarize our major contributions as follows:
  300. \begin{itemize}
  301. \item We develop a new distributionally robust redundancy optimization framework with multiple conditional states, which is able to flexibly incorporate the side information related to lifetimes. To the best of our knowledge, our study is the first redundancy optimization approach with distributional uncertainty that is equipped with the machinery for harnessing the side information via the advanced machine learning techniques. In addition, the parameter selection of our framework can be readily realized by a cross-validation approach that we develop.
  302. \item From the modeling perspective, our present study naturally generalizes Wang~et~al.~(2019) in two respects: (i) The distributionally robust redundancy model of Wang~et~al.~(2019) can be treated as a special case of our current model with a single state. (ii) Our current model considers a set of general conic constraints to capture the possible marginal and/or cross correlations of the lifetimes of different types of components, which is also a generalization of the absolute dispersion constraints in the ambiguity set considered in Wang~et~al.~(2019).
  303. \item Methodologically,  our developed distributionally robust redundancy optimization model, to our best knowledge, is also the first distributionally robust chance-constrained optimization model over the state-dependent ambiguity set with general conic distributional constraints, which can also be readily extended to the general chance-constrained optimization problems.
  304. \item Computationally, we show that the worst-case system reliability level over the state-dependent ambiguity set given redundancy design, in several common cases, can be efficiently computed by solving a tractable conic program ({\it e.g.}, LP or SOCP), while the resulting distributionally robust redundancy optimization model solves a mixed integer conic program ({\it e.g.}, MILP or MISOCP). Furthermore, we develop a computationally viable supergradient-based decomposition algorithm to further enhance the scalability of the resulting MIP problems.
  305. \end{itemize}
  306.  
  307.  
  308.  
  309. The remainder of the paper is organized as follows. Section~\ref{sec:LR} reviews the related studies on redundancy optimization. Section~\ref{sec:base} introduces the base problem formulation of redundancy optimization that we are studying. In Section~\ref{sec:DRRM}, we introduce our distributionally robust redundancy optimization model with side information, and discuss the tractable reformulations of the developed model as well as the hyperparameter selection via cross validation. In Section~\ref{sec:algo}, we discuss the development of the supergradient-based decomposition algorithm. Finally, we present numerical experiments and a case study in Section~\ref{sec:CS}, and conclude our study in Section~\ref{sec:conclusion}.
  310.  
  311.  
  312. }
  313.  
  314. \paragraph{\bf Notations:}
  315. We use the tilde to denote a random parameter, {\it e.g.,} $\bm{\tilde{z}} \in \mathbb{R}^n$; ``$\mathbb{P}$" to denote a specific probability distribution; and $\bm{\tilde{z}} \sim \P$ to denote the random variable $\bm{\tilde{z}}$ with  probability distribution $\P$.
  316. For a random variable $\bm{\tilde{z}} \in \mathbb{R}^n$ with distribution $\P$ and function $\g:\mathbb{R}^n \mapsto \mathbb{R}^m$, we denote $\E_{\P}(\g(\bm{\tilde{z}}))$ as the expectation of random variable $\g(\bm{\tilde{z}})$ with the probability measure $\P$. We use  $\mathcal{P}\left( \mathbb{R}^{n}\right)$ to represent the collection of all probability distributions of a random variable of dimension $n$, and ``$\mathbb{F}$" is a set of distributions for modeling the distributional ambiguity.
  317.  
  318. \section{Literature Review}\label{sec:LR}
  319. In this section we provide a brief review of approaches to RAP, including RAP with deterministic component reliability, RAP with uncertain component reliability and RAP with uncertain component lifetimes. In addition, we also briefly review the studies on robust optimization that are methodologically related to our work.
  320.  
  321. {\bf RAP with deterministic component reliability.} Early studies on RAP focus on formulations in which components have deterministic reliability. These problems are in general NP-hard (Chern~1992), and efforts have been put into approximate algorithms to make them tractable. Coit~et~al.~(1996) uses a combination of neural network and genetic algorithm to search for the minimum cost design for a series-parallel system, given a minimum reliability constraint. Genetic algorithm is also applied to a system with cold-standby redundancy strategy (Ardakan~et~al.~2014). Liang~et~al.~(2004) applies ant colony optimization to a series-parallel system, in which the failure rates of components when not in use are the same as when in use (i.e., active redundancy). Kulturel-Konak~et~al.~(2003) introduces tabu search to solve a single-type-component series-parallel system, with k-out-of-n subsystems and exponential component failure times.
  322.  
  323. It can be seen that RAP with deterministic component reliability suffers from its inherent theoretical intractability. Researchers have to develop various heuristic algorithms to mitigate this drawback, and there is currently no algorithm that is superior to others in all cases. Moreover, the assumption of known deterministic component reliability levels is often impractical due to lack of data, as discussed in the previous section. As a result, the focus recently shifted to RAP with uncertain component reliability.
  324.  
  325.  
  326. {\bf RAP with uncertain component reliability.} Most papers on RAP with uncertainty consider variations of the individual component reliability levels. Bhunia~et~al.~(2010) considered a reliability optimization problem for a series system as a stochastic chance constrained optimization problem with interval-valued reliability levels of individual components; the problem was transformed into an unconstrained integer programming problem with interval coefficients and solved by metaheuristics. Tekiner-Mogulkoc and Coit (2011) discussed an RAP that minimizes the coefficient of variation of the system reliability estimate with respect to a minimum system reliability constraint, in a series-parallel system, and an exact algorithm based on linear integer programming and a heuristic approach based on combined neighborhood search were proposed to solve the problems when component mixing is or is not allowed, respectively. Sun et al.~(2017) considered the uncertain component state performance and state probability in RAP, where the experts' epistemic estimations as the
  327. uncertainty parameters were modeled to be set-valued, and the resulting model was solved by a local-search-based metaheuristics approach.
  328.  
  329.  
  330.  
  331. Recently, some studies have addressed component reliability uncertainty in RAP using robust optimization (Bertsimas and Sim~2004), where component reliability levels are allowed to vary within an {\em uncertainty set}, and the resulting worst-case system reliability level is considered in the optimization of the RAP.  Feizollahi and Modarres (2012) and Feizollahi~et~al.~(2014) considered active redundancy in series-parallel systems and developed a robust RAP with an interval uncertainty set and a robust RAP with polyhedral budgeted uncertainty set, respectively. In both studies, the structures of the resulting robust RAP problems were well investigated, and problems could be transformed and solved iteratively with a series of MIP instances. Furthermore, Feizollahi~et~al.~(2015) developed a robust cold-standby redundancy allocation model for series-parallel systems with budgeted uncertainty; the problem could also be solved by an MIP-based iterative algorithm.
  332.  
  333. In addition, other related studies in this vein include multicriteria RAP models (Coit~et~al.~2004, Zaretalab~et~al.~2015, Govindan~et~al.~2017) and system reliability evaluation due to the lack of knowledge or the imprecision of human estimation (Li~et~al.~2014). More related papers can be found in an excellent survey (Kuo and Wan~2007).
  334.  
  335.  
  336.  
  337.  
  338.  
  339. {\bf RAP with stochastic lifetimes.} In the RAP literature, only a few studies have explicitly considered lifetime distributions. For example, Coit and Smith~(1998) and Prasad et al.~(2001) have proposed the maximization of a percentile life of the system subject to a budget constraint. Zhao and Liu~(2003) developed stochastic programming models for both parallel and standby redundancy optimization problems, which are solved by simulating the component lifetimes from any given known probability distributions and using metaheuristics. In addition, several studies have also performed theoretical analysis on the stochastic comparisons of redundancy allocation for a very limited number of components ({\it e.g.,} two or three) based on the given lifetime distributions (Li and Hu~2008, Zhao~et~al.~2011). All of these studies require the exactly known probability distributions of component lifetimes, which in practice is often difficult to specify or calibrate. In addition, Coit and Smith~(2002) considered uncertain Weibull component lifetime distributions with random-scale parameters in RAP; their model maximizes a lower percentile of the system time-to-failure distribution. The solution also leverages the approach of metaheuristics due to the complicated problem structure.
  340.  
  341.  
  342. Most of the resulting redundancy optimization models of the above RAP studies, except for the robust optimization models, are difficult to solve, and the proposed solution approaches ({\it e.g.,} metaheuristics), in general, are intractable. Among the related studies in the above two streams, our work is closest to that of Feizollahi and Modarres (2012), Feizollahi~et~al.~(2014), Feizollahi~et~al.~(2015), and Zhao and Liu~(2003). The research gap can be established in the following aspects:
  343. \begin{itemize}
  344.  \item The stochastic RAP model of Zhao and Liu~(2003) assumes the specific probability distributions for component lifetimes, which, as discussed previously, cannot be easily calibrated in the practice of reliability engineering, while our proposed RAP model allows for the lifetime distributions to be uncertain and vary over a set of possible distributions that are characterized by the available (limited) lifetime information. On the other hand, the robust RAP models of Feizollahi and Modarres~(2012), Feizollahi~et~al.~(2014) and Feizollahi~et~al.~(2015) are based on regular robust optimization, which may sacrifice some critical distributional information of lifetimes, while our proposed RAP model is able to incorporate the distributional characteristics of component lifetimes ({\it e.g.,} mean, dispersion, and support information) into the redundancy optimization.
  345.  \item Our proposed RAP model considers a general setting with different redundancy strategies as well as multiple types of components for redundancy, while Zhao and Liu~(2003), Feizollahi and Modarres (2012) and Feizollahi~et~al.~(2014) only considered active redundancy and a single type of component. On the other hand, Feizollahi~et~al.~(2015) only considered cold-standby redundancy with a single type of component, while the multiple component types would violate the optimization structure utilized by the authors. Furthermore, our model is also able to incorporate starting failures for cold-standbys and common-cause failures for active redundancies, respectively, which have not been attempted in any of these related studies.
  346.  \item Finally, the resulting redundancy optimization problem of our proposed RAP model is equivalent to an MILP problem, which does not induce any binary variables in addition to the original redundancy allocation variables (binaries), while all the above four RAP models result in either bilinear MIP formulations (Feizollahi and Modarres~2012, Feizollahi~et~al.~2014), a linear MIP formulation with additional auxiliary integers (Feizollahi~et~al.~2015), or a highly intractable problem that was approached by metaheuristics (Zhao and Liu~2003).
  347. \end{itemize}
  348.  
  349.  
  350. {\bf Robust optimization.} Our work is based on distributionally robust optimization (Wiesemann et al.~2014), which is different from the abovementioned regular robust optimization (Bertsimas and Sim~2004, Ben-Tal~et~al.~2017): the latter focuses on the uncertainty of the actual values of the parameters, while the former concerns the uncertainty of the probability distributions of the parameters and is able to utilize the distributional information. In particular, our developed RAP model belongs to the class of distributionally robust chance-constrained programs (DR-CCPs), where the probability distributions of the uncertain parameters are allowed to vary within a distributional set or {\it Chebyshev ambiguity set}, and the worst-case chance level (reliability level in our context) is protected ({\it i.e.,} required to be above a threshold). In DR-CCPs, the case of joint chance constraint (our model belongs to this case) in general is much more difficult than the single chance constraint (Pr\'{e}kopa~1998), and several approximation approaches have been proposed, for instance, Bonferroni's inequality (Nemirovski and Shapiro~2006, Bertsimas~et~al.~2017), $\phi$-divergence (Yanikoglu and den Hertog~2013) and  conditional value-at-risk (CVaR) (Chen~et~al.~2010, Zymler~et~al.~2012). Most recently, some exact and tractable models have also been developed if the ambiguity set is carefully designed. Hanasusanto~et~al.~(2017) considered a joint constraint of affine functions with uncertainty and an ambiguity set that contains mean and support information, as well as an upper bound of dispersion for the DR-CCP, and they proved that the model is tractable whenever the support set and the dispersion function can be represented via polynomially numerous tractable (linear, conic quadratic and/or semidefinite) constraints.  
In addition, Xie and Ahmed~(2018) considered a power flow optimization problem with two-sided chance constraints over the ambiguity set, with only mean and covariance information, and derived a second-order conic representable set for the feasible set of the distributionally robust two-sided chance constraints. Most recently, Wang~et~al.~(2019) proposed to use a Chebyshev ambiguity set with distributional information of mean, support and dispersion to solve a mixed-redundancy-strategy RAP. Due to the unique structure of the problem ({\it e.g.,} the mixed cold-standby and active-parallel redundancy strategy), the resulting system lifetime function is a joint constraint of nonlinear functions with uncertainty, but Wang~et~al.~(2019) managed to reformulate the model into a linear MIP, where the binary variables are the same as the original redundancy allocation variables, thus the problem becomes tractable.
  351.  
  352. Our work builds on the work of Wang~et~al. In particular, we introduce a clustering process on the data and use it to construct a conditional ambiguity set. In this way, more information can be incorporated into the ambiguity set, which can lead to more favorable results. We utilize cross validation to improve the clustering result. Our ambiguity set is also very general, and can incorporate any information of the data set that is defined by a convex function. Methodologically, we are also the first to attempt a conditional ambiguity set in distributionally robust chance-constrained optimization, which enriches the modeling opportunities for the ambiguous CCPs.
  353.  
  354. As for our introduction of clustering algorithm to distributionally robust optimization, it is in some way similar to the work of Shang~et~al.~(2017), in which support vector clustering is applied as part of a data-driven robust optimization framework. The incorporation of side information into the clustering process is also a well known technique (Xing~et~al.~2003, Aggrawal~et~al.~2012, Liu~et~al.~2015).
  355.  
  356.  
  357.  
  358.  
  359. \section{\color{blue}Redundancy Allocation: The Base Problem}{\color{blue}
  360. In this section, we formally introduce the problem of redundancy allocation. We consider a system that consists of multiple subsystems indexed by $i \in \mathbf{N}$ that are connected in series. If any of the subsystems fails, the whole system fails. Furthermore, the subsystem $i$ consists of multiple types of components, indexed by $j \in \mathbf{M}_i$. In particular, every subsystem adopts a mixed redundancy strategy. That is, components of each type $j$ can be either in cold-standby fashion or active-parallel fashion, which makes up the two subsets of $\mathbf{M}_i=\mathbf{M}^{\rm c}_i \cup \mathbf{M}^{\rm a}_i$, where $\mathbf{M}^{\rm c}_i$ and $\mathbf{M}^{\rm a}_i$ indicate the index sets for cold-standbys and active-parallels, respectively. {\color{red} The cold-standby components begin to work (and thus begin their lifetimes) only when their predecessors fail, while the active-parallel components begin their lifetimes together.} Finally, each component of type $j$ in subsystem $i$ can have multiple redundant components, indexed by $t \in \mathbf{T}_{ij}$. The configuration of the series system with mixed redundancy strategy is illustrated in Figure~\ref{fig:mixed-redundancy}.
  361.  
  362.  
  363.  
  364. \begin{figure}[htp]
  365. \centering
  366. \includegraphics[scale=0.65]{AC-configuration0819.pdf}
  367. \caption{\color{red} A series-parallel system of mixed redundancy strategy}\label{fig:mixed-redundancy}
  368. \label{figure1}
  369. \end{figure}
  370.  
  371.  
  372. We denote by $\tilde{z}_{ijt}$ the lifetime random variable of the $t$-th redundant component of type $j$ in subsystem $i$, and use binary variables $x_{ijt}$ to denote that whether this component is used ($x_{ijt}=1$) or not ($x_{ijt}=0$), for $i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}$. Thus, the system lifetime can be expressed as follows:
  373. $$
  374. \min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} + \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt}\right).
  375. $$
  376. Now, we assume that the lifetime distribution
  377. $$\bm{\tilde{z}}=\left(\tilde{z}_{ijt}\right)_{i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}} \sim \P$$ is known, and the redundancy allocation problem can be formulated as the following chance-constrained optimization problem:
  378. \begin{equation}\label{HP1-ambiguity-XY}
  379. \begin{array}{rcll}
  380. & \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[ \sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\right]c_{ij} \\[0.3 cm]
  381. &{\rm s.t.} & \displaystyle \P\left[\min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} + \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt}\right)> \mathcal{T}_R \right]\ge R_{0} & \\[0.3 cm]
  382. && L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, \forall i \in \mathbf{N},\\
  383. && \x \in \{0,1\}^{H},
  384. \end{array}
  385. \end{equation}
  386. where $c_{ij}$ is the cost of each type $j$ redundant component in subsystem $i$, $L_i$ and $U_i$ are lower and upper limit of number of redundant component in each subsystem $i$, $\mathcal{T}_R$ is the required lifetime lower bound, and $R_0$ is the designed reliability level required to reach $\mathcal{T}_R$, and \begin{equation}
  387. H := \sum\limits_{i \in \mathbf{N}}\sum\limits_{j \in \mathbf{M}_i}|\mathbf{T}_{ij}|
  388. \end{equation}
  389. is the total number of possible components in the system.
  390. It is noted in the base model that we assume the distribution $\mathbb{P}$ is exactly known for the lifetimes $\bm{\tilde{z}}$ of all components. However, as mentioned in the Introduction, it is often hard in practice to calibrate such a high-dimensional lifetime distribution due to the scarcity of sufficient lifetime data in many practical reliability applications of redundancy systems. Furthermore, although the lifetime (or time-to-failure) observations might not be sufficient, we do have many observations (even real-time observations) on the working conditions related to component lifetimes. Therefore, how to incorporate the possible {\it Side Information} to assist in learning the lifetime patterns and achieve a more reliable and economical redundancy allocation is critical, especially when lifetime data are insufficient.  To this end, we propose in this work a {\it Distributionally Robust Optimization} model for the redundancy allocation that allows the lifetime distribution $\mathbb{P}$ to be ambiguous and is able to incorporate the possible valuable side information with learning. This will be discussed in detail in the forthcoming two sections.
  391. }
  392. \section{\color{blue}A Learning-based Distributionally Robust Redundancy Model}
  393. {\color{blue}In this section, we focus on the modeling of redundancy optimization problem with side information incorporated using the distributionally robust optimization technique. In particular, we first introduce the construction of the {\it Ambiguity Set}  with side information knowledge established from the machine learning methods, {\it e.g.}, clustering (Section~\ref{subsec:ambiguityset}). We then develop the resulting distributionally robust chance-constrained model for the redundancy allocation and discuss its tractable reformulation (Section~\ref{subsec:DROmodel}). Finally, we design a {\it Cross Validation} method for choosing the best learning parameter~(Section~\ref{subsec:CV}).
  394. }
  395. \subsection{Constructing ambiguity set with clustering using side information}\label{subsec:ambiguityset}
  396. {\color{blue}
  397. As mentioned in the Introduction, in many practical situations the components of a given type (or even of different types) may fail due to several common causes, which can usually be reflected by side information such as producer information, temperature, working intensity, and maintenance, to name a few. In other words, these components, once the side information is available, can be readily grouped or clustered together, and the resulting clusters or groups should provide valuable information for our redundancy allocation decision. This motivates us to use the {\it Ambiguity Set} with the conditional indicator variables ({\color{red}Chen~et~al.~2019}) to incorporate the distributional knowledge from the side information under ambiguity.}
  398. {\color{blue}
  399. In particular, we assume that the true distribution $\P$ of the lifetime random variables $\bm{\tilde{z}}$ is not exactly known or {\em ambiguous}, and only partial information of the lifetime distribution is available. Furthermore, we also assume that some side information related to component lifetimes is also available and we let an indicator random variable $\tilde{k}$ to model the possible clustering knowledge learned from both side information and lifetime observations (which will be explained in detail later). Formally, we define the following distributional ambiguity set $\mathbb{F}$ by incorporating the above mentioned distributional information conditional on the indicator variable $\tilde{k}$, within which the distribution $\P$ is allowed to vary:}
  400. \begin{equation}\label{ambiguity-set}
  401. \mathbb{F}_{K}:=\left\{\P \in \mathcal{P}\left( \mathbb{R}_+^H\times[K]\right) \left |
  402. \begin{array}{ll}
  403. (\tilde{\bm{z} }, \tilde{k })\sim \P \\[0.3 cm]
  404. \mathbb{E}_{\P}\Big[\tilde{z}_{ijt}~\Big\vert~\tilde{k}=k\Big]\in \left[\underline{\mu}^{k}_{ij}, \overline{\mu}^{k}_{ij} \right], & \forall k \in [K], i \in \mathbf{N}, j\in \mathbf{M}_i, t \in \mathbf{T}_{ij}\\ [0.3 cm]
  405. \mathbb{E}_{\P}\Big[ g_{lk}(\bm{\tilde{z}})  ~\Big |~ \tilde{k}=k \Big]\le 0, & \forall k \in [K],  l \in \mathbf{L}_k\\[0.5 cm]
  406. \P\Big[\bm{\tilde{z}}\in \mathcal{Z}_k ~\Big |~ \tilde{k}=k \Big]=1, & \forall k \in [K]\\[0.3 cm]
  407. \P\Big[\tilde{k}=k\Big]=p_k, & \forall k \in [K]
  408. \end{array}\right. \right\}.
  409. \end{equation}
  410. {\color{blue}In the above ambiguity set, $K$ is the number of clusters of lifetime patterns learned (which will be explained later, and the selection of the hyperparameter $K$ will be discussed in Section~\ref{subsec:CV}); the second set of constraints captures the expected lifetime range $[\underline{\mu}^{k}_{ij}, \overline{\mu}^{k}_{ij}]$ of each type $j$ of components in subsystem $i$ in cluster $k$; the fourth set of constraints captures the information of the conditional support set $\mathcal{Z}_k$ given each cluster $k$, where unless otherwise specified we let
  411. $\mathcal{Z}_k=\mathcal{Z}, \forall k \in [K]$ and
  412. \begin{equation}\label{equ:Z}
  413. \mathcal{Z}:= \Big\{\z \in \mathbb{R}^{H}| z_{ijt} \in [\underline{z}_{ij},\overline{z}_{ij}], \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij} \Big\}.
  414. \end{equation}
  415. The last set of constraints represent the probability $p_k$ of $\bm{z}$ falling within each cluster, which can be estimated upon the data points within each cluster. Finally, the third set of constraints
  416. \begin{equation}\label{cons:g}
  417. \mathbb{E}_{\P}\Big[ g_{lk}(\bm{\tilde{z}})~\Big |~ \tilde{k}=k \Big]\le 0, \quad \forall k \in [K],  l \in \mathbf{L}_k,
  418. \end{equation}
  419. where each $g_{lk}(\bm{{z}})$ is a convex function of $\bm{{z}}$, are utilized to incorporate, flexibly, more distributional information ({\it e.g.}, correlation) per necessary, for which we illustrate by the following examples.
  420. \begin{example}[Marginal Variance]\label{ex:ambiguityset-2}
  421. If we define  in \eqref{cons:g}
  422. \begin{equation}\label{equ:g-1}
  423. g_{ijtk}({\bm{z}}):=\left(z_{ijt}-\nu^{k}_{ij} \right)^2 - S^{k}_{ij}, \forall k \in [K], i \in \mathbf{N}, j \in {\mathbf{M}_{i}},
  424. \end{equation}
  425. where $\nu^{k}_{ij}, S^{k}_{ij}$ are sample means and standard deviation bounds estimated from the data of each cluster $k$ for each type $j$ in subsystem $i$. This leads to the conditional marginal variance constraints:
  426. $$
  427. \mathbb{E}_{\P}\left[\left. \left(\tilde{z}_{ijt}-\nu^{k}_{ij} \right)^2~\right|~ \tilde{k}=k \right]\le S^{k}_{ij}, \forall k \in [K], i \in \mathbf{N}, j\in \mathbf{M}_i.
  428. $$
  429. \end{example}
  430. \begin{example}[Conditional Average Dispersion]\label{ex:ambiguityset-1}
  431. If we define
  432. \begin{equation}\label{equ:g-2}
  433. g_{ijk}({\bm{z}}):=\sum\limits_{t \in \mathbf{T}_{ij}}\left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| - \epsilon^{k}_{ij}, \forall k \in [K], i \in \mathbf{N}, j \in {\mathbf{M}_{i}},
  434. \end{equation}
  435. where $\nu^{k}_{ij}, \sigma^{k}_{ij}$ and $\epsilon^{k}_{ij}$ are the sample mean values, standard deviations and dispersion upper bound levels estimated from the data of each cluster $k$. This leads to the conditional average dispersion constraints:
  436. $$
  437. \mathbb{E}_{\P}\left[\left. \frac{1}{|\mathbf{T}_{ij} |}\sum\limits_{t \in \mathbf{T}_{ij}}\left|\frac{ \tilde{z}_{ijt}-\nu^{k}_{ij}}{\sigma^{k}_{ij}} \right|  ~\right|~ \tilde{k}=k \right]\le \epsilon^{k}_{ij}, \forall k \in [K], i \in \mathbf{N}, j\in \mathbf{M}_i,\\
  438. $$
  439. which captures the modeling requirement that within each cluster $k$, the lifetimes of the same type of components in the same subsystem should be distributionally similar.  We also point out that when $K=1$ ({\it i.e.}, only 1 cluster considered), the model of \eqref{equ:g-2} with $g_{ijk}({\bm{z}})$ reduces to the regular average dispersion constraints employed in Wang~et~al.~(2019).
  440. \end{example}
  441. \begin{example}[Conditional Cross Variance]\label{ex:ambiguityset-3}
  442. If we define
  443. \begin{equation}\label{equ:g-3}
  444. g_{ik}({\bm{z}}):=\left(\sum\limits_{j \in {\mathbf{M}_{i}}}\sum\limits_{t \in \mathbf{T}_{ij}}\frac{z_{ijt}}{|\mathbf{T}_{ij}|}-\nu^{k}_{i} \right)^2 - S^k_{i}, \forall k \in [K], i \in \mathbf{N},
  445. \end{equation}
  446. where $\nu^{k}_{i}$ and $S^{k}_{i}$ are the estimated sample mean values and upper bound levels of variance of sum of component lifetimes in all types of each subsystem $i$ given cluster $k$. This leads to the conditional cross variance constraints:
  447. $$
  448. \mathbb{E}_{\P}\left[\left. \left(\sum\limits_{j \in {\mathbf{M}_{i}}}\sum\limits_{t \in \mathbf{T}_{ij}}\frac{z_{ijt}}{|\mathbf{T}_{ij}|}-\nu^{k}_{i} \right)^2  ~\right|~ \tilde{k}=k \right]\le S^k_{i}, \forall k \in [K], i \in \mathbf{N},
  449. $$
  450. which captures the possible correlations among the lifetimes of different types of components.
  451. \end{example}
  452. Finally, we elaborate on how the ambiguity set model $\mathbb{F}_K$ can incorporate, in a flexible fashion, the information of lifetime patterns extracted using machine learning approaches of clustering (Hastie~et~al.~2009). }
  453. {\color{blue}
  454. {\it Direct clustering on Lifetime Sample.} We first point out that even without side information, it is possible to identify more valuable component lifetime patterns by clustering directly over the lifetime sample, and we lose nothing by simply setting $K=1$. For instance, we can use {\it K-means}, which proves effective and is one of the most popular unsupervised learning clustering algorithms. A modeling and computational advantage of using K-means for our DRO redundancy allocation model is that the resulting partition of the support $\mathcal{Z}$, which is also called {\it Voronoi Tessellation} (Hastie~et~al.~2009), forms naturally different polyhedral conditional support subsets:
  455. \begin{equation}\label{k-means-classifier}
  456. \mathcal{Z}_k := \Big\{\bm{z} \in \mathcal{Z}\:\big|\: 2(\hmu_i - \hmu_k)^{\top}\bm{z} \leq \hmu_{i}^{\top}\hmu_{i} - \hmu_{k}^{\top}\hmu_{k}, \forall i \in [K] \Big\}, \forall k \in [K],
  457. \end{equation}
  458. where $\hmu_k$ is the mean of cluster $\mathcal{Z}_k$, which is also the output of the K-means algorithm. These conditional support subsets $\mathcal{Z}_k$'s in \eqref{k-means-classifier} not only partition the original support (therefore providing more effective conditional support sets), but also enjoy the computationally viable geometry of polyhedra.  More details on K-means or clustering implementation issues, {\it e.g.}, using some tie-breaking rules to ensure that each data point belongs to only one cluster, can be found in {\color{red}the work of MacQueen (1967)}.
  459. }
  460. {\color{red}
  461. {\it Clustering with Dimension Reduction.} In some situations, it may be beneficial to incorporate dimension reduction into the clustering process. In complex systems with very large number of components, dimension reduction may speed up the clustering process. More importantly, if the lifetime ranges of different components vary to a great extent or if the lifetime of components correlate with each other, Euclidean distance on the raw data may not be a suitable distance metric to measure the similarity of data points. Dimension reduction is likely to reveal the underlying structure of the data, and provide a better distance metric, thus resulting in better clusters. There are many well-developed dimension reduction algorithms to be employed. Principle Component Analysis (PCA) is a popular linear dimension reduction algorithm that can execute very fast. t-Distributed Stochastic Neighbor Embedding (t-SNE) is a slower algorithm, but is particularly well-suited to reduce very high dimensional data into low dimensions like 2 or 3. Spectral Clustering is a technique that can combine a standard clustering method (like K-means) and dimension reduction seamlessly together (Ng et al. 2002). }
  462. {\color{blue}
  463. {\it Clustering based on Side Information.} As we mentioned in the Introduction, in many situations, the data collected on system redundancy design contain not only the samples of component lifetimes, but also the {\it covariates} of side information such as producer information, working conditions, and time to last maintenance. Such side information should be helpful for characterizing the lifetime patterns of the components, especially in the scarcity of historical lifetime data; see the following Example~\ref{ex:sideinformation}.
  464. \begin{example}[value of side information]\label{ex:sideinformation}
  465. \color{red} Consider an example of a simple railway system consisting of two lines, the North Line (case 0) and the South Line (case 1). The climate alongside the lines is vastly different, which affects the expected lifetime of the braking system on the train. Considering the braking system as a single component, it has an expected lifetime of 1 year operating on the North Line and 3 years on the South Line ($\mathbb{E}_{\P}\Big[\tilde{z}~\Big\vert~\tilde{k}=0\Big] = 1, \mathbb{E}_{\P}\Big[\tilde{z}~\Big\vert~\tilde{k}=1\Big] = 3$). The two lines have the same amount of traffic ($p_0 = p_1 = 0.5$). Without side information, we are able to construct the following ambiguity set, using only the mean information:
  466. \begin{equation}\label{ambiguity-set-foo}
  467. \mathbb{F}:=\left\{\P \in \mathcal{P}\left( \mathbb{R}_+\right) \left|
  468. \begin{array}{ll}
  469. \tilde{z}\sim \P \\[0.3 cm]
  470. \mathbb{E}_{\P}\Big[\tilde{z}\Big]\in \left[1.6, 2.4\right]\\ [0.3 cm]
  471. \P\Big[{\tilde{z}}\in \mathcal{Z}\Big]=1
  472. \end{array}\right. \right\}.
  473. \end{equation}
  474. While using side information, the conditional ambiguity set is:
  475. \begin{equation}\label{ambiguity-set-bar}
  476. \mathbb{F}_{K}:=\left\{\P \in \mathcal{P}\left( \mathbb{R}_+\times[K]\right) \left |
  477. \begin{array}{ll}
  478. (\tilde{z}, \tilde{k })\sim \P \\[0.3 cm]
  479. \mathbb{E}_{\P}\Big[\tilde{z}~\Big\vert~\tilde{k}=0\Big]\in \left[0.8, 1.2\right] \\ [0.3 cm]
  480. \mathbb{E}_{\P}\Big[\tilde{z}~\Big\vert~\tilde{k}=1\Big]\in \left[2.4, 3.6\right] \\ [0.3 cm]
  481. \P\Big[\tilde{z}\in \mathcal{Z}_0 ~\Big |~ \tilde{k}=0 \Big]=1\\[0.3 cm]
  482. \P\Big[\tilde{z}\in \mathcal{Z}_1 ~\Big |~ \tilde{k}=1 \Big]=1\\[0.3 cm]
  483. \P\Big[\tilde{k}=0\Big]=0.5\\[0.3cm]
  484. \P\Big[\tilde{k}=1\Big]=0.5\\[0.3cm]
  485. \end{array}\right. \right\}.
  486. \end{equation}
  487. Note that the lower and upper bounds of the mean are constructed by shrinking or enlarging the mean by 20\%.
  488. It is clear that the conditional ambiguity set describes the data set better. While the lifetime distributions under different scenarios may not be so distinctive from each other in practice, with the introduction of side information we can incorporate much more information into the ambiguity set, and better describe the structure of the data set.
  489. \end{example}
  490. Therefore, we can extract the lifetime pattern information by clustering collectively both the lifetime data and the side information, using suitable machine learning approaches, such as K-means clustering, hierarchical clustering ({\color{red}Sibson 1973}) and regression trees ({\color{red}Quinlan 1986}), as convenient. We can therefore form a number of clusters or scenarios $k\in [K]$ of the component lifetime patterns and achieve a more accurate description for the component lifetimes using the statistical information within each cluster (scenario).
  491. }
  492. {\color{blue} In the next section, we discuss the distributionally robust redundancy allocation model with ambiguity set $\mathbb{F}_K$ given the number $K$ of clusters, and derive its tractable reformulations.}
  493. \subsection{The model with tractable reformulation}\label{subsec:DROmodel}{\color{blue}
  494. With the ambiguity set $\mathbb{F}_K$ of lifetime distributions of all components, we now consider the worst-case probabilistic constraint for safeguarding the reliability level $R_0$, {\it i.e.},
  495. $$
  496. \inf\limits_{\P \in \mathbb{F}_{K}}\P\left[\min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} + \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt}\right)> \mathcal{T}_R \right]\ge R_{0}
  497. $$
  498. in the original redundancy allocation problem~\eqref{HP1-ambiguity-XY}. This leads to the following {\em distributionally robust redundancy optimization (DRRO) model}:
  499. \begin{equation}\label{HP1-ambiguity-X}
  500. \begin{array}{rcll}
  501. & \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[ \sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\right]c_{ij} \\[0.3 cm]
  502. &{\rm s.t.} & \displaystyle \inf\limits_{\P \in \mathbb{F}_{K}}\P\left[\min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} + \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt}\right)> \mathcal{T}_R \right]\ge R_{0} & \\[0.3 cm]
  503. && L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, \forall i \in \mathbf{N},\\
  504. && \x \in \{0,1\}^H.
  505. \end{array}
  506. \end{equation}
  507. In the above model, the distributionally robust chance constraint essentially ensures to achieve the required reliability level $R_{0}$ over all the qualified probability distributions $\P \in \mathbb{F}_K$.
  508. {\color{red}It can be shown that the conditional ambiguity set ($\mathbb{F}_{K}$) is tighter than the ambiguity set constructed without clustering (which can also be considered as the case when $K = 1$, denoted by $\mathbb{F}_{1}$). Therefore, by incorporating cluster information, we can achieve better results in solving the problem. We present a proof for the case of the marginal variance constraints in Example~\ref{ex:ambiguityset-2}. In the following proposition, parameters with subscript or superscript $k$ are parameters in $\mathbb{F}_K$, while the parameters without them are parameters in $\mathbb{F}_1$. $P_K$ is the program \eqref{HP1-ambiguity-X} where $\P \in \mathbb{F}_{K}$, with optimal value $c^*_K$, and $P_1$ is the program \eqref{HP1-ambiguity-X} where $\P \in \mathbb{F}_{1}$, with optimal value $c^*_1$.
  509. \begin{proposition}
  510. Given ambiguity set $\mathbb{F}_{K}$ and $\mathbb{F}_{1}$, assuming that $\underline{\hmu} = \sum\limits_{k \in [K]}p_k\underline{\hmu}_{k}$, $\overline{\hmu} = \sum\limits_{k \in [K]}p_k\overline{\hmu}_{k}$, $\hmu_k = \frac{\underline{\hmu}_{k} + \overline{\hmu}_{k}}{2}$, $S_{ij}^2 + \mu_{ij}^2 = \sum\limits_{k \in [K]}p_k({S_{ij}^k}^2 + {\mu_{ij}^k}^2), \forall i \in \mathbf{N}, j \in \mathbf{M}_i$, then $\mathbb{F}_{K} \subseteq \mathbb{F}_{1}$, and the optimal solution $c^*_K \leq c^*_1$.
  511. \end{proposition}
  512. {\bf Proof.} First, we prove that $\forall \P \in \mathbb{F}_{K}, \P \in \mathbb{F}_{1}$ too.
  513. %\begin{eqnarray}
  514. $$
  515. \begin{aligned}
  516. \mathbb{E}_{\P}\Big(\tilde{z}_{ijt}\Big) \geq \sum\limits_{k \in [K]}p_k\underline{\hmu}_{ij}^{k} &= \underline{\mu}_{ij} & \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}\\
  517. \mathbb{E}_{\P}\Big(\tilde{z}_{ijt}\Big) \leq \sum\limits_{k \in [K]}p_k\overline{\hmu}_{ij}^{k} &= \overline{\mu}_{ij} & \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}\\
  518. \mathbb{E}_{\P}\Big((\tilde{z}_{ijt} - \mu_{ij})^2 \Big) &= \sum\limits_{k \in [K]}p_k\mathbb{E}_{\P}\Big((\tilde{z}_{ijt} - \mu_{ij}^k)^2 ~\Big|~ \tilde{k} = k\Big) & \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}\\
  519. &= \sum\limits_{k \in [K]}p_k\mathbb{E}_{\P}\Big((\tilde{z}_{ijt})^2 | \tilde{k} = k\Big) - \mu_{ij}^2&\\
  520. &\leq \sum\limits_{k \in [K]}p_k\left({S_{ij}^k}^2 + {\mu_{ij}^k}^2 \right) - \mu_{ij}^2&\\
  521. &= S_{ij}^2 + {\mu_{ij}}^2 - \mu_{ij}^2&\\
  522. &= S_{ij}^2&
  523. \end{aligned}
  524. $$
  525. %\end{eqnarray}
  526.  
  527. Moreover, since $\mathcal{Z}_k \subseteq \mathcal{Z}$ by definition, $\bigcup\limits_{k \in [K]}\mathcal{Z}_k \subseteq \mathcal{Z}$. Therefore $\P[\tilde{z} \in \mathcal{Z}] = 1$. Hence, $\P \in \mathbb{F}_{1}$.
  528.  
  529. Since $\forall \P \in \mathbb{F}_{K}$ we have $\P \in \mathbb{F}_{1}$, it follows that $\mathbb{F}_{K} \subseteq \mathbb{F}_{1}$. Therefore any feasible solution to $P_1$ is also a feasible solution to $P_K$. Since $c^*_K$ and $c^*_1$ are the optimal values of minimization problems, $c^*_K \leq c^*_1$. \blot
  530.  
  531. The conditions in the proposition are used to require that the moment specifications should be consistent.
  532. }
  533.  
  534.  
  535.  
  536. Next, we derive a tractable formulation of the DRRO model \eqref{HP1-ambiguity-X}. Without loss of generality, we assume that $\mathbf{M}^{a}_{i} \neq \emptyset, \forall i \in \mathbf{N}$. The case of $\mathbf{M}^{a}_{i} = \emptyset, \exists i \in \mathbf{N}$, by our forthcoming discussions, can be treated similarly and is actually technically easier.  Also we illustrate our major results using
  537. $$
  538. g_{ijk}({\bm{z}}):=\sum\limits_{t \in \mathbf{T}_{ij}}\left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| - \epsilon^{k}_{ij}, \forall k \in [K], i \in \mathbf{N}, j \in {\mathbf{M}_{i}}
  539. $$
  540. in the ambiguity set $\mathbb{F}_K$, and consider the conditional support set with K-means
  541. \begin{equation}\label{equ:W}
  542. \mathcal{Z}_k = \Big\{\bm{z} \in \mathcal{Z}\:\big|\: 2(\hmu_i - \hmu_k)^{\top}\bm{z} \leq \hmu_{i}^{\top}\hmu_{i} - \hmu_{k}^{\top}\hmu_{k}, \forall i \in [K] \Big\}, \forall k \in [K]
  543. \end{equation}
  544. where $\mathcal{Z}$ is the original support set given by \eqref{equ:Z}.
  545. We begin by focusing on the computation of the following worst-case reliability function in the problem (\ref{HP1-ambiguity-X}) given system design $\x$:
  546. \begin{equation}\label{Prob-1}
  547. \displaystyle \inf\limits_{\P \in \mathbb{F}_K}\P\left[\min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} + \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt}\right)> \mathcal{T}_R \right].
  548. \end{equation}
  549. For a better exposition of our approach, we denote by
  550. \begin{equation}\label{constraint-set}
  551. \mathcal{Z}_i(\x):=\left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} + \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt} > \mathcal{T}_R \right.\right \}, \forall i \in \mathbf{N}.
  552. \end{equation}
  553. Since $\mathbf{M}^{a}_{i} \neq \emptyset, \forall i \in \mathbf{N}$, by the nature of maximal value, it is clear that
  554. \begin{equation}\label{constraint-set-2}
  555. \mathcal{Z}_i(\x) = \bigcup\limits_{{j} \in \mathbf{M}^{\rm a}_i}\bigcup\limits_{t \in \mathbf{T}_{i{j}}}\left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{{j} \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{i{j}}} \tilde{z}_{i{j}{t}}x_{i{j}{t}} + \tilde{z}_{i{j}{t}}x_{i{j}{t}} > \mathcal{T}_R \right.\right \}, \forall i \in \mathbf{N}.
  556. \end{equation}
  557. Then the complementary set of $\mathcal{Z}_i$ for each $i \in \mathbf{N}$, denoted by $\overline{\mathcal{Z}}_i$,  is
  558. \begin{equation}\label{set-LT}
  559. \overline{\mathcal{Z}}_i(\x)=\bigcap\limits_{{j} \in \mathbf{M}^{\rm a}_i}\bigcap\limits_{t \in \mathbf{T}_{i{j}}}\left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{{j} \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{i{j}}} \tilde{z}_{i{j}{t}}x_{i{j}{t}} + \tilde{z}_{i{j}{t}}x_{i{j}{t}}\leq \mathcal{T}_R \right.\right \}, \forall i \in \mathbf{N}.
  560. \end{equation}
  561. The following result establishes an equivalent formulation of regular robust optimization for the above worst-case reliability function~\eqref{Prob-1}.
  562. \begin{lemma}\label{lem1-LT}
  563. Given system design $\x$, the worst-case reliability function (\ref{Prob-1}) is equivalent to the optimal value of the following  optimization problem:
  564. \begin{equation}\label{P1-ambiguity-sup-dual1}
  565. \begin{array}{rcll}
  566. &\!\!\!\!\!\!\!\!\!\!\!\! \max\limits_{\bm{\alpha}, \bm{\beta}, \bm{\lambda}, \bm{\tau}} &  1-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij} + \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \sum\limits_{k\in[K]}\tau_{k} \\[0.3 cm]
  567. &\!\!\!\!\!\!\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge{p_k}, \forall \bm{z} \in  \mathcal{Z}_k\cap \overline{\mathcal{Z}}_i(\x), i \in \mathbf{N}, k \in [K]\\
  568. && \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ij{t}}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge 0, \forall \bm{z} \in  \mathcal{Z}_k, k\in [K]\\
  569. && \halpha \le \mathbf{0}, \hbeta, \hlambda \ge \mathbf{0}, \bm{\tau} \in \mathbb{R}^K.
  570. \end{array}
  571. \end{equation}
  572.  
  573. \end{lemma}
  574.  
  575. %\begin{proof}
  576. {\bf Proof.}
  577. With the notation in (\ref{constraint-set}), the worst-case probabilistic chance function (\ref{Prob-1})  can be rewritten in terms of the probability of its complementary event:
  578. \begin{equation}\label{1minus}
  579. \inf\limits_{\P\in \mathbb{F}_{K}} \P\Big[\tilde{\bm{z} }\in \mathcal{Z}_i(\x),\forall i \in \mathbf{N} \Big]=1-\sup\limits_{\P \in \mathbb{F}_K } \P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big].
  580. \end{equation}
  581.  
  582. Given the probability distribution of $\tilde{k}$ as
  583. $$
  584. \P\Big[\tilde{k}=k\Big]=p_k, \forall k \in [K].
  585. $$
  586. We now define $\P_k$ as the conditional distribution of $\bm{\tilde{z}}$ given $\tilde{k}=k$ for $k \in [K]$, we then can decompose any distribution $\P \in \mathbb{F}_K$ using $\{\P_k, k\in [K]\}$ and rewrite the worst-case chance
  587. $$
  588. \sup\limits_{\P \in \mathbb{F}_K } \P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]
  589. $$
  590. using the law of total probability as the following formulation:
  591. \begin{eqnarray}
  592. \sup\limits_{\P \in \mathbb{F}_K } \P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]
  593. &=&\sup\limits_{\P_k, \forall k\in[K]}\sum\limits_{k\in[K]}p_k\P_k\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]\label{P1-ambiguity-sup} \\[0.2 cm]
  594. &=&\sup\limits_{\P_k}\sum\limits_{k\in[K]}\displaystyle \int_{\cup_{i\in \mathbf{N} }\left\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\right\}}p_k {\rm d}\P_k(\bm{\tilde{z}}) \\[0.2 cm]
  595. &&{\rm s.t.}\displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} \tilde{z}_{ijt} {\rm d}\P_k(\bm{\tilde{z}}) \ge \underline{\mu}^{k}_{ij},    \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in [K]\\ [0.2 cm]
  596. &&~\quad \displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} \tilde{z}_{ijt} {\rm d}\P_k(\bm{\tilde{z}}) \le \overline{\mu}^{k}_{ij},    \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij},k\in [K]\\ [0.2 cm]
  597. &&~\quad  \displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} \sum\limits_{t\in \mathbf{T}_{ij}}\left|\frac{\tilde{z}_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| {\rm d}\P_k(\bm{\tilde{z}}) \le \epsilon^{k}_{ij},   \forall i \in \mathbf{N}, j\in \mathbf{M}_i,k\in [K]\\[0.2 cm]
  598. &&~\quad \displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} {\rm d}\P_k(\bm{\tilde{z}}) =1, \forall k\in[K], \label{P1-ambiguity-sup-2}
  599. \end{eqnarray}
where the support $\mathcal{Z}_k$ is given by (\ref{equ:W}). The Lagrange dual of the above moment problem \eqref{P1-ambiguity-sup}-\eqref{P1-ambiguity-sup-2} has the following formulation (Wiesemann~et al.~2014):
  601. \begin{equation}\label{P1-ambiguity-sup-dual0}
  602. \begin{array}{rcl}
  603. &\!\!\!\!\!\!\!\!\!\!\!\! \min\limits_{\bm{\alpha}, \bm{\beta}, \bm{\lambda}, \bm{\tau}} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij}+ \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} + \sum\limits_{k\in[K]}\tau_{k} \\[0.3 cm]
  604. &\!\!\!\!\!\!\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k \mathbb{I}\Big({\bigcup\limits_{i\in \mathbf{N} }\left\{ \bm{z} \in \overline{\mathcal{Z}}_i\right\}} \Big), \forall \bm{z} \in \mathcal{Z}_k, k \in [K]\\
  605. && \bm{\alpha} \le \mathbf{0}, \bm{\beta}, \bm{\lambda} \ge \mathbf{0}, \bm{\tau} \in \mathbb{R}^K,
  606. \end{array}
  607. \end{equation}
  608. where $\mathbb{I}(\mathcal{A})$ is the indicator function with respect to set $\mathcal{A}$, and  $(\halpha, \hbeta, \hlambda, \htau)$ are the dual variables associated with the constraints of the primal problem \eqref{P1-ambiguity-sup}-\eqref{P1-ambiguity-sup-2}.
Furthermore, we show that strong duality holds. Since ${\mu^{k}_{ij}}$ is the expectation of $\tilde{z}_{ijt}$, we can always find a Dirac probability distribution $\P^{\dag}_{\bm{\mu}}$ with $\underline{\hmu}<\hmu<\overline{\hmu}$, which is a relative interior point of the feasible set of problem \eqref{P1-ambiguity-sup}-\eqref{P1-ambiguity-sup-2}. Therefore, the Slater condition holds, and the optimal value of (\ref{P1-ambiguity-sup-dual0}) equals that of problem \eqref{P1-ambiguity-sup}-\eqref{P1-ambiguity-sup-2}.
Next, expanding the indicator function $\mathbb{I}\left({\cup_{i\in \mathbf{N} }\left\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\right\}} \right)$ for the different cases of $\bm{z}$, the above problem (\ref{P1-ambiguity-sup-dual0}) is also equivalent to the following formulation:
  611. \begin{equation}\label{P1-ambiguity-sup-dual00}
  612. \begin{array}{rcll}
  613. &\!\!\!\!\!\!\!\!\!\!\!\! \min\limits_{\bm{\alpha}, \bm{\beta}, \bm{\lambda}, \bm{\tau}} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij}+ \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} + \sum\limits_{k\in[K]}\tau_{k} \\[0.3 cm]
  614. &\!\!\!\!\!\!\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_i, i \in \mathbf{N}, k \in [K]\\
&& \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge 0, \forall \bm{z} \in \mathcal{Z}_k, k \in [K]\\
  616. && \halpha \le \mathbf{0}, \hbeta, \hlambda \ge \mathbf{0}, \bm{\tau} \in \mathbb{R}^K.
  617. \end{array}
  618. \end{equation}
Finally, plugging this formulation into the equation (\ref{1minus}), we arrive at the formulation (\ref{P1-ambiguity-sup-dual1}), whose optimal objective value is exactly the worst-case value of the probabilistic chance function (\ref{Prob-1}). The proof is completed. \blot
  620. %\end{proof}
  621.  
It is noted that the derived optimization problem (\ref{P1-ambiguity-sup-dual1}) in its current form still belongs to the class of semi-infinite optimization problems, which are not directly computable. In the following, we show that, by a duality argument, the problem can be further transformed into a computationally tractable linear programming formulation.
  623.  
  624. \begin{proposition}\label{P-proposition1b}
Given a system design $\x$, the worst-case reliability function (\ref{Prob-1}) can be computed by solving the following linear program (LP):
  626. \begin{eqnarray}
  627. &\!\!\!\!\!\! \max &  1-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij} + \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right) - \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \sum\limits_{k\in[K]}\tau_{k} \label{HP1-ambiguity-LP-FL} \\
  628. &\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\phi^{lk}_{ijt}\underline{z}_{ij}+\varphi^{lk}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} \right)}  \nonumber\\
  629.  &&+  \sum\limits_{n \in [K]}\psi^{lk}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2)\Big] + \sum\limits_{j\in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{lj}} q^{lk}_{jt}\mathcal{T}_R+ \tau_k \geq p_k, \forall l \in \mathbf{N}, k \in [K]  \label{HP1-ambiguity-LP-FL1}\\
  630.  && \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\rho^{k}_{ijt}\underline{z}_{ij}+\varrho^{k}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\gamma^{k}_{ijt}-\theta^{k}_{ijt} \right)} \nonumber\\ && + \sum\limits_{n \in [K]}\varsigma^{k}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2) \Big] + \tau_k \geq0, \forall k \in [K]\label{HP1-ambiguity-LP-FL1-2}\\
  631. &&  {q^{lk}_{jt}}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt} \nonumber \\ &&  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j}, k \in [K] \\[0.3 cm]
  632. &&  x_{l jt}\sum\limits_{j \in \mathbf{M}^{\rm a}_l}\sum\limits_{t \in \mathbf{T}_{l j}}{q^{lk}_{jt}}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt} \nonumber \\ &&  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j}, k \in [K] \\[0.3 cm]
  633. &&   \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\psi^{lk}_{n}+  \phi^{lk}_{ijt}+\varphi^{lk}_{ijt}+\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt},\nonumber\\ && ~ \forall l \in \mathbf{N}, i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K] \label{HP2-ambiguity-LP-FL2} \\
  634. &&{|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\pi^{lk}_{ijt}+\varpi^{lk}_{ijt}) =\lambda^{k}_{ij},  ~ \forall l  \in \mathbf{N}, i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K] \\
  635. && \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\varsigma^{k}_{n}+ \rho^{k}_{ijt}+\varrho^{k}_{ijt}+\gamma^{k}_{ijt}-\theta^{k}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt},\nonumber\\ && ~ \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K]  \\
  636. &&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\gamma^{k}_{ijt}+\theta^{k}_{ijt}) =\lambda^{k}_{ij},  ~ \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K] \\
  637. %&&  q_{l jk}\le y_{{l jk}}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  638. %&& y_{{l jk}} \ge M x_{l jk}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  639. %&& y_{{l jk}} \le  q_{l jk }+(x_{l jk}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathcal{J}({l}), k \in \mathcal{N}(l,j)\\[0.3 cm]
  640. %&&  \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{ij}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i \\
  641. && \halpha,\q, \hpsi, \hvarphi, \hvarsigma, \hvarrho \le \mathbf{0}, \htau \in \mathbb{R}^K,  \\
  642. && \hbeta, \hlambda, \hphi, \hrho, \hpi,\hvarpi, \hgamma, \htheta \ge \mathbf{0}, \label{HP2-ambiguity-LP-FL}
  643. \end{eqnarray}
  644. where $\halpha, \hbeta, \hlambda, \htau, \q, \hpsi, \hphi, \hvarphi, \hpi, \hvarpi, \hrho, \hvarrho, \hvarsigma, \hgamma, \htheta$ are auxiliary variables.
  645. \end{proposition}
  646.  
  647. %\begin{proof}
  648. {\bf Proof. }
First, for a given $l \in \mathbf{N}$, we deal with the semi-infinite constraints
  650. $$
  651. \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l, k \in [K].
  652. $$
  653. Recall that
\begin{equation}
\overline{\mathcal{Z}}_l(\x)=\bigcap\limits_{{j} \in \mathbf{M}^{\rm a}_l}\bigcap\limits_{{t} \in \mathbf{T}_{l{j}}}\left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{{j}' \in \mathbf{M}^{\rm c}_l}\sum_{{t}'\in \mathbf{T}_{l{j}'}} z_{l{j}'{t}'}x_{l{j}'{t}'} + z_{l{j}{t}}x_{l{j}{t}}\leq \mathcal{T}_R \right.\right \}
\end{equation}
\begin{equation}
=\left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{{j}' \in \mathbf{M}^{\rm c}_l}\sum_{{t}'\in \mathbf{T}_{l{j}'}} z_{l{j}'{t}'}x_{l{j}'{t}'} + z_{l{j}{t}}x_{l{j}{t}}\leq \mathcal{T}_R, \forall j \in \mathbf{M}^{\rm a}_l, {t}\in \mathbf{T}_{l{j}}  \right.\right \},
\end{equation}
  660. and
  661. $$
  662. \mathcal{Z}_k = \left\{\bm{z} \in \mathbb{R}^{H} \left| \begin{array}{l}
  663.                                                        2(\hmu_i - \hmu_k)^{\top}\bm{z} \leq \hmu_{i}^{\top}\hmu_{i} - \hmu_{k}^{\top}\hmu_{k}, \forall i \in [K]  \\[0.25 cm]
  664.                                                        z_{ijt} \in [\underline{z}_{ij},\overline{z}_{ij}], \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}
  665.                                                       \end{array}
  666. \right.\right\}
  667. $$
  668. for any $k \in [K]$.
Now, we claim that for any $k \in [K]$
  670. \begin{equation}\label{Lifting-1}
  671. \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l
  672. \end{equation}
  673. is equivalent to
  674. \begin{equation}\label{Lifting-2}
  675. \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + u^{k}_{ijt}\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall (\bm{z},\u) \in \mathcal{W}_k,
  676. \end{equation}
  677. {\color{red} where
  678. $$
  679. \mathcal{W}_k := \left\{(\z, \u) \middle| \: \z \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l, \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| \leq u^{k}_{ijt}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij} \right\}, \forall k \in [K].
  680. $$
  681. }
  682. In fact, on the one hand, if \eqref{Lifting-1} holds, since $\hlambda \geq \mathbf{0}$, we have for any $(\bm{z}, \u) \in \mathcal{W}_k$
  683. $$
  684. u^{k}_{ijt}\lambda^{k}_{ij} \geq \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}.
  685. $$
Therefore \eqref{Lifting-2} holds. On the other hand, if \eqref{Lifting-2} holds, then for any $\hat{\bm{z}} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l$, we take
  687. $$
  688. u^{k}_{ijt} = \left|\frac{ \hat{z}_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}.
  689. $$
  690. Then we have
  691. $$
  692. \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[\hat{z}_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ \hat{z}_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k
  693. =\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[\hat{z}_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + u^{k}_{ijt}\lambda^{k}_{ij}\right] + \tau_k \geq p_k.
  694. $$
  695. Therefore \eqref{Lifting-1} holds, and \eqref{Lifting-1} and \eqref{Lifting-2} are equivalent.
  696. Then, by introducing auxiliary variables $u^{k}_{ijt}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}$, we can equivalently lift the above constraints into the following optimization-based formulation:
  697. \begin{equation}\label{H-system1}
  698. \left.\begin{array}{rcll}
  699.  p_k-\tau_k\le & \min\limits_{\bm{z}, \u} & \displaystyle \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + u^{k}_{ijt} \lambda^{k}_{ij}\right]  \\[0.3 cm]
&{\rm s.t.} & \displaystyle   {\sum_{{j}' \in \mathbf{M}^{\rm c}_l}\sum_{{t}'\in \mathbf{T}_{l{j}'}} z_{l{j}'{t}'}x_{l{j}'{t}'} + z_{l{j}{t}}x_{l{j}{t}}\leq \mathcal{T}_R} & \forall {j} \in \mathbf{M}^{\rm a}_l, {t}\in \mathbf{T}_{l{j}}  \\[0.3 cm]
  701. &&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}u^{k}_{ijt}- { z_{ijt}} \ge { -\nu^{k}_{ij}}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\[0.3 cm]
  702. &&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}u^{k}_{ijt}+  { z_{ijt}} \ge  {\nu^{k}_{ij} }, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\[0.3 cm]
  703. &&  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} 2(\mu^{n}_{ijt} - \mu^{k}_{ijt})z_{ijt}\\[0.3 cm]
  704. && \leq \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} {(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2, & \forall n \in [K]\\[0.3 cm]
  705. && \underline{z}_{ij} \leq z_{ijt} \leq \overline{z}_{ij} & \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}\\[0.3cm]
  706. && u^{k}_{ijt} \in \mathbb{R}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}.
  707. \end{array}\right\}, \forall k \in [K]
  708. \end{equation}
By the strong duality of linear programming, the above constraint is also equivalent to the following system: for all $k \in [K]$,
  710. \begin{equation*}
  711. \left\{  \begin{array}{rl}
  712. & p_k-\tau_k\le \sum\limits_{j\in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{lj}} q^{lk}_{jt}\mathcal{T}_R\\[0.3 cm]
  713. & +\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[\phi^{lk}_{ijt}\underline{z}_{ij}+\varphi^{lk}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} \right)}  + \sum\limits_{n \in [K]}\psi^{lk}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2)\right]  \\[0.3 cm]
  714. &  {q^{lk}_{jt}}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt} \nonumber \\ &  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j} \\[0.3 cm]
  715. &  x_{ljt}\sum\limits_{j \in \mathbf{M}^{\rm a}_l}\sum\limits_{t \in \mathbf{T}_{l j}}{q^{lk}_{jt}}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{k}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt} \nonumber \\ &  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j} \\[0.3 cm]
  716. &   \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\psi^{lk}_{n}+  \phi^{lk}_{ijt}+\varphi^{lk}_{ijt}+\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt}, ~ \forall i \in \mathbf{N}\setminus\{l\}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}  \\[0.3 cm]
  717. &{|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\pi^{lk}_{ijt}+\varpi^{lk}_{ijt}) =\lambda^{k}_{ij},  ~ \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\[0.3 cm]
  718. &   q^{lk}_{jt}\le 0, ~\forall j\in \mathbf{M}^{\rm a}_l, t\in \mathbf{T}_{lj}\\& \psi^{lk}_{n} \le 0, \phi^{lk}_{ijt} \geq 0, \varphi^{lk}_{ijt} \leq 0, \pi^{lk}_{ijt}\ge 0,\varpi^{lk}_{ijt}\ge 0, ~\forall n \in [K], i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}.
  719. \end{array}
  720. \right\}
  721. \end{equation*}
  722. Likewise, the constraints
  723. $$
  724. \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge0, \forall \bm{z} \in \mathcal{Z}_k, k \in [K]\\
  725. $$
can be dualized in the same manner. Leveraging the derived formulation (\ref{P1-ambiguity-sup-dual1}) in Lemma~\ref{lem1-LT}, we arrive at the formulation of the linear program (\ref{HP1-ambiguity-LP-FL})--(\ref{HP2-ambiguity-LP-FL}). We are done. \blot
  727. %\end{proof}
  728.  
  729.  
Leveraging the derived reformulation (\ref{HP1-ambiguity-LP-FL})-(\ref{HP2-ambiguity-LP-FL}) of the linear program, we can readily linearize the
bilinear terms $q^{lk}_{jt}x_{l jt}$ in the overall DRRO model~\eqref{HP1-ambiguity-X} using standard MIP techniques. This results in the following mixed integer linear program (MILP) reformulation for the DRRO model~\eqref{HP1-ambiguity-X}. In particular, the advantage of the resulting MIP reformulation is that it does not increase the number of integer variables.
  732.  
  733.  
  734. \begin{proposition}\label{proposition1b}
  735. The problem (\ref{HP1-ambiguity-X}) is equivalent to the following mixed integer linear program:
  736. \begin{eqnarray}
  737. & \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \left[\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\right]c_{ij} \label{HP1-ambiguity-MILP-FL1}\\
  738. &{\rm s.t.} & 1-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij}+ \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)\nonumber\\
  739. && -\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \sum\limits_{k\in[K]}\tau_{k}\ge R_{0}   \\
  740. &&  L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, ~  \forall  i \in \mathbf{N} \\
  741. %&& \sum\limits_{j\in \mathbf{M}_{l}} \sum\limits_{k\in \mathbf{K}_{l j}}q_{l jk}\mathcal{T}_R\nonumber\\
  742. % &&+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[ \phi^{l }^{k}_{ij}\underline{z}_{ij}+ \varphi^{\varsigma}^{k}_{ij}\overline{z}_{ij}  + {\nu_{ij}\left(\pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} \right)} \right]+\tau \ge 1,~\forall {l  \in \mathbf{N}}  \\
  743. && y^{{\rm a}lk}_{jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+{ \pi^{lk}_{ljt}-\varpi^{lk}_{ljt} } = \alpha^{k}_{ljt}+\beta^{k}_{ljt},\nonumber\\ &&  \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j}, k \in [K]  \label{con:32}  \\
  744. && y^{{\rm c}lk}_{jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt}  = \alpha^{k}_{ljt}+\beta^{k}_{ljt},\nonumber\\ &&  \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j}, k \in [K]    \\
  745. && (\ref{HP1-ambiguity-LP-FL1}-\ref{HP1-ambiguity-LP-FL1-2}); (\ref{HP2-ambiguity-LP-FL2})-(\ref{HP2-ambiguity-LP-FL})\\
  746. &&  q^{lk}_{jt}\le y^{{\rm a}lk}_{jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  747. && y^{{\rm a}lk}_{jt} \ge M x_{l jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  748. && y^{{\rm a}lk}_{jt} \le  q^{lk}_{jt}+(x_{l jt}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t \in \mathbf{T}_{l j}, k \in [K]\\
  749. &&  \sum\limits_{j \in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{l j}}q^{lk}_{jt}\le y^{{\rm c}lk}_{jt}, ~\forall l  \in \mathbf{N},j \in \mathbf{M}^{\rm c}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  750. && y^{{\rm c}lk}_{jt} \ge M x_{l jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  751. && y^{{\rm c}lk}_{jt} \le  \sum\limits_{j \in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{l j}}q^{lk}_{jt}+(x_{l jt}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t \in \mathbf{T}_{l j}, k \in [K]\\
  752. && \y^{\rm a}, \y^{\rm c} \le \mathbf{0}, \x\in \{0,1\}^{H},  \label{HP1-ambiguity-MILP-FL2}
  753. \end{eqnarray}
  754. where $\halpha, \hbeta, \hlambda, \htau, \q, \hpsi, \hphi, \hvarphi, \hpi, \hvarpi, \hrho, \hvarrho, \hvarsigma, \hgamma, \htheta, \y^{\rm a}$ and $\y^{\rm c}$ are auxiliary variables and $M$ is a sufficiently small negative number.
  755. \end{proposition}
  756. %\begin{proof}
  757. {\bf Proof. }
  758. In the proof of Proposition~\ref{P-proposition1b}, the feasible set $\mathcal{Z}_k\cap \overline{\mathcal{Z}}_{l }(\x)$ of the minimization problem
  759. $$
  760. \min\limits_{\bm{z}} \displaystyle \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| \lambda^{k}_{ij}\right]
  761. $$
is bounded. Assuming that it is nonempty, the lifted equivalent form of the inner minimization problem in (\ref{H-system1}) is also bounded and nonempty. Therefore, the dual variables $q^{lk}_{jt}$ are also bounded, and we can linearize the bilinear terms by introducing new variables $y^{{\rm a}lk}_{jt}$ and $y^{{\rm c}lk}_{jt}$, such that
  763. \begin{eqnarray}
  764. &&  q^{lk}_{jt}\le y^{{\rm a}lk}_{jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \nonumber\\
  765. && y^{{\rm a}lk}_{jt} \ge M x_{l jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \nonumber\\
  766. && y^{{\rm a}lk}_{jt} \le  q^{lk}_{jt}+(x_{l jt}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t \in \mathbf{T}_{l j}, k \in [K]\nonumber\\
&&  \sum\limits_{j \in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{l j}}q^{lk}_{jt}\le y^{{\rm c}lk}_{jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \nonumber\\
  768. && y^{{\rm c}lk}_{jt} \ge M x_{l jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \nonumber\\
  769. && y^{{\rm c}lk}_{jt} \le  \sum\limits_{j \in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{l j}}q^{lk}_{jt}+(x_{l jt}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t \in \mathbf{T}_{l j}, k \in [K],\nonumber
  770. \end{eqnarray}
where $M$ is a sufficiently small negative number (in numerical computation, $M$ can be set to a negative number with a very large absolute value). Using this linearization technique, we arrive at the MILP reformulation \eqref{HP1-ambiguity-MILP-FL1}-\eqref{HP1-ambiguity-MILP-FL2} for the DRRO model (\ref{HP1-ambiguity-X}). \blot
  772. }
  773. \subsection{Learning the clustering parameter $K$ by Cross Validation}\label{subsec:CV}{\color{blue}
  774. It is known that the clustering approaches ({e.g.}, K-means) in general belong to the {\it unsupervised learning} methods (Hastie~et~al.~2009), for which there is no labeled loss function for validating the parameter $K$\footnotemark \footnotetext{One of the popular methods to choose $K$ is the elbow method ({\color{red}Thorndike 1953, Ketchen et al. 1996}), in which the ratio of in-class variance and total variance is plotted, and optimal $K$ is chosen by observing the plot, which apparently depends partially on subjective understanding of the data.}.
Interestingly, our distributionally robust redundancy optimization framework with lifetime pattern clustering (of hyper-parameter $K$) can essentially be treated as {\it supervised learning} from a statistical learning perspective ({\color{red} Friedman et al. 2001, James et al. 2013}). In particular, the proposed optimization framework provides two labels, (i) redundancy cost and (ii) feasibility of the reliability requirement, in the DRRO model~\eqref{HP1-ambiguity-X}. This important feature of our framework enables us to design {\it Cross Validation} to choose the {\it best} parameter $K$. Specifically, the problem structure of chance-constrained optimization requires balancing the objective (redundancy cost) against the feasibility of the constraint (reliability), where the balance reflects the user's design preference between the two. This motivates us to introduce a preference level $\lambda$ to design the Cross Validation. For each choice of $K$, we consider the number of constraint violations in cross validation, as well as its average design cost. We normalize both into the range $[0, 1]$ separately, and combine them linearly with the coefficient $\lambda$. A low $\lambda$ means that the designer cares more about low cost than robustness (reliability); a high $\lambda$ indicates the contrary. By applying this metric, cross validation can help choose $K$ to balance cost and robustness as necessary. The procedure of cross validation is summarized in Algorithm 1 below.
  776. \noindent\rule{\textwidth}{0.1em}\vspace{-5pt}\\
  777. \noindent {\bf Algorithm 1. Cross Validation for Selecting $K$} \vspace{-10pt}\\
  778. \noindent\rule{\textwidth}{0.05em} \\
  779. {\bf Input:} Data sample $\mathcal{D}$, preference level $\lambda$ and the set of possible candidates for $K$: $\{K_l, l \in [L]\}$.
  780. \begin{enumerate}
\item  Split $\mathcal{D}$ into $M$ disjoint subsets $\mathcal{D}_m, m \in [M]$, each of equal size.
  782. \item  For each $K_l, l \in [L]$:
  783. \begin{enumerate}
  784.  \item Training: For each subset $\mathcal{D}_m, m \in [M]$, compute the optimal solution $\x^{*}(\mathcal{D}_m, K_l)$ and associated optimal cost $c(\mathcal{D}_m, K_l)$ by solving the model, using $\mathcal{D} \setminus \mathcal{D}_m$ as input data and split it into $K_l$ clusters.
  785.  \item Validation: For each $m \in [M]$: For $\mathcal{D}_m$, use it as validation set to compute $R(\x^{*}(\mathcal{D}_m, K_l))$, the reliability level under design $\x^{*}(\mathcal{D}_m, K_l)$. If $R(\x^{*}(\mathcal{D}_m, K_l)) < R_0$, count it as one instance of violation. Sum up the total number of violations across all $m \in [M]$, that is, $$v(K_l) := \sum\limits_{m \in [M]}\mathbb{I}\Big[ R(\x^{*}(\mathcal{D}_m, K_l)) < R_0 \Big],$$
     where $\mathbb{I}[\cdot]$ is the indicator function. Compute the average cost as $$\overline{c}(K_l) := \frac{1}{M}\sum\limits_{m \in [M]}c(\mathcal{D}_m, K_l).$$
  787.  \item Compute the combined metric $\lambda{v( K_l)} + (1-\lambda)\overline{c}(K_l)$. Let $$K_{\rm opt} := \underset{K_l}{\mathrm{arg\,min}}\,\lambda{v(K_l)} + (1-\lambda)\overline{c}(K_l).$$
  788. \end{enumerate}
  789. \end{enumerate}
  790. \noindent{\bf Output:} The optimal value $K_{\rm opt}$ for parameter $K$.  
  791. \vspace{-5pt}
  792. \noindent\rule{\textwidth}{0.1em}\vspace{-17pt}\\
  793. }
  794. \section{A Supergradient-based Decomposition Algorithm}{\color{blue}
Although the linearized MIP formulation is tractable, when $K$ is large it may take a long time to solve due to the large number of constraints. In this section we provide an iterative algorithm that exploits the structure of the problem by decomposing it into subproblems that can be solved in parallel.
We first introduce $\y^a$ and $\y^c$ to separate $\q$ and $\x$ in \eqref{HP1-ambiguity-LP-FL}, by the same approach as in the proof of Proposition~\ref{proposition1b}. Then, computing its dual,
  797. we obtain the following formulation:
  798. \begin{eqnarray} \label{dual}
  799. &\!\!\!\!\!\! \min &
  800. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\gamma^{lk}_{jt}+\delta^{lk}_{jt})Mx_{ljt} +
  801. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\lambda^{lk}_{jt}+\tau^{lk}_{jt})Mx_{ljt} \nonumber\\
  802. && + \sum\limits_{l\in \mathbf{N}}\sum\limits_{k\in[K]}{p_k}{\alpha^{l}_k} -
  803. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\delta^{lk}_{jt}M -
  804. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\tau^{lk}_{jt}M \\
  805. &\!\!\!\!\!\!{\rm s.t.} & \rho^{ik}_{jt}+\sum\limits_{l \in \mathbf{N} \setminus \{i\}}\phi^{lk}_{ijt} + \varphi^{k}_{ijt} \ge \underline{\mu}^{k}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}^{a}_i, t\in \mathbf{T}_{ij}, k\in[K]\label{model-c-1}\\
  806. && \varrho^{ik}_{jt}+\sum\limits_{l \in \mathbf{N} \setminus \{i\}}\phi^{lk}_{ijt} + \varphi^{k}_{ijt} \ge \underline{\mu}^{k}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}^{c}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  807. && \rho^{ik}_{jt}+\sum\limits_{l \in \mathbf{N} \setminus \{i\}}\phi^{lk}_{ijt} + \varphi^{k}_{ijt} \le \overline{\mu}^{k}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}^{a}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  808. && \varrho^{ik}_{jt}+\sum\limits_{l \in \mathbf{N} \setminus \{i\}}\phi^{lk}_{ijt} + \varphi^{k}_{ijt} \le \overline{\mu}^{k}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}^{c}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  809. && \sum\limits_{l \in \mathbf{N}}\sum\limits_{t\in \mathbf{T}_{ij}}\pi^{lk}_{ijt}+\sum\limits_{t\in \mathbf{T}_{ij}}\varpi^{k}_{ijt} \le \epsilon^{k}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k\in[K]\\
  810. && \sum\limits_{i \in \mathbf{N}}\alpha_{ik} + \beta_{k} = -1, \forall k \in [K]\\
  811. && \rho^{ik}_{jt}-\theta^{ik}_{jt}+\gamma^{ik}_{jt}+\delta^{ik}_{jt}\le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}^{a}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  812. && \varrho^{ik}_{jt}-\vartheta^{ik}_{jt}+\lambda^{ik}_{jt}+\tau^{ik}_{jt}\le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}^{c}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  813. && \mathcal{T}_R\alpha_{ik} + \theta^{ik}_{jt} - \delta^{ik}_{jt} + \sum\limits_{j \in \mathbf{M}^c_i}\sum\limits_{t \in \mathbf{T}_{ij}}(\vartheta^{ik}_{jt} - \tau^{ik}_{jt}) \le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}^{a}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  814. && \underline{z}_{lj}\alpha_{lk} + \rho^{lk}_{jt} \ge 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^a_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  815. && \underline{z}_{lj}\alpha_{lk} + \varrho^{lk}_{jt} \ge 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^c_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  816. && \underline{z}_{ij}\alpha_{lk} + \phi^{lk}_{ijt} \ge 0, \forall l \in \mathbf{N}, i \in \mathbf{N} \setminus \{l\}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  817. && \overline{z}_{lj}\alpha_{lk} + \rho^{lk}_{jt} \le 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^a_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  818. && \overline{z}_{lj}\alpha_{lk} + \varrho^{lk}_{jt} \le 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^c_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  819. && \overline{z}_{ij}\alpha_{lk} + \phi^{lk}_{ijt} \le 0, \forall l \in \mathbf{N}, i \in \mathbf{N} \setminus \{l\}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  820. && \sum\limits_{i \in \mathbf{N}}\sum\limits_{j \in \mathbf{M}_i}\sum\limits_{t \in \mathbf{T}_{ij}}\Big[\Big((\mu^n_{ijt})^2-(\mu^k_{ijt})^2\Big)\alpha_{lk}\Big] + \sum\limits_{j \in \mathbf{M}^a_l}\sum\limits_{t \in \mathbf{T}_{lj}}2(\mu^n_{ljt}-\mu^k_{ljt})\rho^{lk}_{jt} + \sum\limits_{j \in \mathbf{M}^c_l}\sum\limits_{t \in \mathbf{T}_{lj}}2(\mu^n_{ljt}-\mu^k_{ljt})\varrho^{lk}_{jt}\nonumber \\&& + \sum\limits_{i \in \mathbf{N} \setminus \{l\}}\sum\limits_{j \in \mathbf{M}_i}\sum\limits_{t \in \mathbf{T}_{ij}}2(\mu^n_{ijt}-\mu^k_{ijt})\phi^{lk}_{ijt} \le 0 ,\forall l \in \mathbf{N}, n \in [K], k \in [K]\\
  821. && \nu^{k}_{lj}\alpha_{lk} + \rho^{lk}_{jt} + \pi^{lk}_{ljt} \ge 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^a_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  822. && \nu^{k}_{lj}\alpha_{lk} + \varrho^{lk}_{jt} + \pi^{lk}_{ljt} \ge 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^c_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  823. && \nu^{k}_{ij}\alpha_{lk} + \phi^{lk}_{ijt} + \pi^{lk}_{ijt} \ge 0, \forall l \in \mathbf{N}, i \in \mathbf{N} \setminus \{l\}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  824. && \nu^{k}_{lj}\alpha_{lk} + \rho^{lk}_{jt} - \pi^{lk}_{ljt} \le 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^a_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  825. && \nu^{k}_{lj}\alpha_{lk} + \varrho^{lk}_{jt} - \pi^{lk}_{ljt} \le 0, \forall l \in \mathbf{N}, j\in \mathbf{M}^c_l, t\in \mathbf{T}_{lj}, k\in[K]\\
  826. && \nu^{k}_{ij}\alpha_{lk} + \phi^{lk}_{ijt} - \pi^{lk}_{ijt} \le 0, \forall l \in \mathbf{N}, i \in \mathbf{N} \setminus \{l\}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  827. && \underline{z}_{ij}\beta_k + \varphi^{k}_{ijt} \ge 0, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  828. && \overline{z}_{ij}\beta_k + \varphi^{k}_{ijt} \le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  829. && \sum\limits_{i \in \mathbf{N}}\sum\limits_{j \in \mathbf{M}_i}\sum\limits_{t \in \mathbf{T}_{ij}}\Big[\Big((\mu^n_{ijt})^2-(\mu^k_{ijt})^2\Big)\beta_k + 2(\mu^n_{ijt}-\mu^k_{ijt})\varphi^{k}_{ijt}\Big] \le 0 ,\forall n \in [K], k \in [K]\\
  830. && \nu^{k}_{ij}\beta_k + \varphi^{k}_{ijt} + \varpi^{k}_{ijt} \ge 0, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  831. && \nu^{k}_{ij}\beta_k + \varphi^{k}_{ijt} - \varpi^{k}_{ijt} \le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in[K]\\
  832. && \halpha,\hbeta, \hgamma, \hlambda \le \mathbf{0},  \\
  833. && \htheta, \hdelta, \hvartheta, \htau \ge \mathbf{0}, \label{model:dual-S}
  834. \end{eqnarray}
  835. where $({\halpha}, {\hbeta}, {\hphi}, {\hpi}, {\hvarphi}, {\hvarpi}, {\hrho}, {\hvarrho}, {\htheta}, {\hgamma}, {\hdelta}, {\hvartheta}, {\hlambda}, {\htau})$ are auxiliary variables.
  836. Let $D(\x)$ be the optimal value of the above dual program \eqref{dual}-\eqref{model:dual-S} given $\x$; it is noted that computing $D(\x)$ amounts to solving a linear program. Furthermore, this linear program $D(\x)$ can be decomposed into $K$ subproblems by splitting the constraints and the objective based on $k \in [K]$.  Therefore, the computation of $D(\x)$ can be carried out efficiently even if $K$ is very large.
  837.  
  838. %\begin{eqnarray}
  839. %&\!\!\!\!\!\! D_k(\x) = \min &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij} + \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right) + \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \tau_{k} \\
  840. % &\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\phi^{lk}_{ijt}\underline{z}_{ij}+\varphi^{lk}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} \right)}  \nonumber\\
  841. %  &&+  \sum\limits_{n \in [K]}\psi^{lk}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2)\Big] + \sum\limits_{j\in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{lj}} q^{lk}_{jt}\mathcal{T}_R+ \tau_k \geq p_k, \forall l \in \mathbf{N}\\
  842. %  && \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\rho^{k}_{ijt}\underline{z}_{ij}+\varrho^{k}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\gamma^{k}_{ijt}-\theta^{k}_{ijt} \right)} \nonumber\\ && + \sum\limits_{n \in [K]}\varsigma^{k}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2) \Big] + \tau_k \geq0\\
  843. %&&  {q^{lk}_{jt}}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt} \nonumber \\ &&  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j} \\[0.3 cm]
  844. %&&  x_{l jt}\sum\limits_{j \in \mathbf{M}^{\rm a}_l}\sum\limits_{t \in \mathbf{T}_{l j}}{q^{lk}_{jt}}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt} \nonumber \\ &&  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j} \\[0.3 cm]
  845. %&&   \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\psi^{lk}_{n}+  \phi^{lk}_{ijt}+\varphi^{lk}_{ijt}+\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt},\nonumber\\ && ~ \forall l \in \mathbf{N}, i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\
  846. %&&{|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\pi^{lk}_{ijt}+\varpi^{lk}_{ijt}) =\lambda^{k}_{ij},  ~ \forall l  \in \mathbf{N}, i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\
  847. %&& \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\varsigma^{k}_{n}+ \rho^{k}_{ijt}+\varrho^{k}_{ijt}+\gamma^{k}_{ijt}-\theta^{k}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt},\nonumber\\ && ~ \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}  \\
  848. %&&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\gamma^{k}_{ijt}+\theta^{k}_{ijt}) =\lambda^{k}_{ij},  ~ \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\
  849. %%&&  q_{l jk}\le y_{{l jk}}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  850. %%&& y_{{l jk}} \ge M x_{l jk}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  851. %%&& y_{{l jk}} \le  q_{l jk }+(x_{l jk}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathcal{J}({l}), k \in \mathcal{N}(l,j)\\[0.3 cm]
  852. %%&&  \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{ij}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i \\
  853. %&& \halpha,\q, \hpsi, \hvarphi, \hvarsigma, \hvarrho \le \mathbf{0}, \htau_{k} \in \mathbb{R}  \\
  854. %&& \hbeta, \hlambda, \hphi, \hrho, \hpi,\hvarpi, \hgamma, \htheta \ge \mathbf{0}
  855. %\end{eqnarray}. \fi
  856.  
  857. Now the problem \eqref{HP1-ambiguity-X} can be rewritten as
  858. \begin{equation}\label{HP1-ambiguity-XI}
  859. \begin{array}{rcll}
  860. & \min\limits_{\x} &  C(\x)=\sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[ \sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\right]c_{ij} \\[0.5 cm]
  861. &{\rm s.t.} & D(\x) \ge R_{0} & \\[0.3 cm]
  862. && L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, \forall i \in \mathbf{N}\\[0.3 cm]
  863. && \x \in \{0,1\}^H
  864. \end{array}
  865. \end{equation}
  866. which is termed the {\it master} problem.
  867.  
  868.  
  869. We now approach this master problem \eqref{HP1-ambiguity-XI} by iteratively solving a sequence of {\it supergradient}-based relaxed problems to approximate the optimal solution. If the optimal solution of a relaxed problem is feasible in the original problem, then it is also optimal in the original problem. Otherwise, we add more constraints to the relaxed problem. We repeat this process until either the problem becomes infeasible or an optimal solution is obtained.
  870.  
  871.  
  872. It is noted that $D(\cdot)$ is a concave function, which always has a {\it supergradient}. In particular, we denote by $\s(\x)$ a {\it supergradient} of $D(\x)$ at $\x$, which by definition means that $D(\x) \le D(\y) + \s(\y)^{\top}(\x-\y), \forall \y \in S$. Note that when $\x = \y$, the two sides are equal. With supergradient $\s(\x)$, we can formulate the {\it relaxed master problem} for \eqref{HP1-ambiguity-XI} as
  873. \begin{equation}\label{HP1-ambiguity-XIII}
  874. \begin{array}{rcll}
  875. & \min\limits_{\x} &  C(\x)=\sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[ \sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\right]c_{ij} \\[0.5 cm]
  876. &{\rm s.t.} & D(\y) + \s(\y)^{\top}(\x-\y) \ge R_{0}, & \forall \y \in \mathcal{Y} \subset \{0,1\}^H \\[0.3 cm]
  877. && L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, & \forall i \in \mathbf{N}\\[0.3 cm]
  878. && \x \in \{0,1\}^H.
  879. \end{array}
  880. \end{equation}
  881.  
  882.  
  883. The following lemma computes the supergradient $\s(\x)$ of $D(\x)$.
  884. \begin{lemma}
  885. Given any feasible $\x$, let $(\overline{\halpha}, \overline{\hbeta}, \overline{\hphi}, \overline{\hpi}, \overline{\hvarphi}, \overline{\hvarpi}, \overline{\hrho}, \overline{\hvarrho}, \overline{\htheta}, \overline{\hgamma}, \overline{\hdelta}, \overline{\hvartheta}, \overline{\hlambda}, \overline{\htau})$ be the optimal solution of the dual problem \eqref{dual}-\eqref{model:dual-S} given $\x$. Let
  886. $$\s(\x)=\Big(s_{ijt}(\x)\Big)_{i\in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}}$$
  887. with
  888. \begin{equation}
  889. s_{ijt}(\x) := \left\{
  890.                \begin{array}{ll}
  891.                \sum\limits_{k\in[K]}\left(\overline{\gamma}^{ik}_{jt}+\overline{\delta}^{ik}_{jt}\right)M, & \forall i \in \mathbf{N}, j \in \mathbf{M}^a_i, t \in \mathbf{T}_{ij}\\
  892.                \sum\limits_{k\in[K]}\left(\overline{\lambda}^{ik}_{jt}+\overline{\tau}^{ik}_{jt}\right)M, & \forall i \in \mathbf{N}, j \in \mathbf{M}^c_i, t \in \mathbf{T}_{ij}\end{array}\right.,
  893. \end{equation}
  894. where we define $\overline{\hgamma}^{i}_j = \overline{\hdelta}^{i}_j = \mathbf{0}, \forall j \in \mathbf{M}_{i}^{c}$ and $\overline{\hlambda}^{i}_j = \overline{\htau}^{i}_j = \mathbf{0}, \forall j \in \mathbf{M}_{i}^{a}$. Then $\s(\x)$ is a supergradient of $D(\x)$ at $\x$.
  895. \end{lemma}
  896. {\bf Proof. }
  897. Since $D(\x)$ is concave, it is equivalent to prove the following inequality:
  898. \begin{equation}\label{subgrad}
  899. D(\hat{\x}) \le D(\x) + \s(\x)^{\top}(\hat{\x}-\x), \forall \hat{\x} \in \mathcal{X},
  900. \end{equation}
  901. where $$\mathcal{X}:= \left\{\x \in \{0,1\}^H:  L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, \forall i \in \mathbf{N}\right\}.$$
  902. After obtaining $(\overline{\halpha}, \overline{\hbeta}, \overline{\hphi}, \overline{\hpi}, \overline{\hvarphi}, \overline{\hvarpi}, \overline{\hrho}, \overline{\hvarrho}, \overline{\htheta}, \overline{\hgamma}, \overline{\hdelta}, \overline{\hvartheta}, \overline{\hlambda}, \overline{\htau})$ as the optimal solution for given $\x$, we can rewrite the right side of \eqref{subgrad} as:
  903. \begin{eqnarray}
  904. && \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\overline{\gamma}^{lk}_{jt}+\overline{\delta}^{lk}_{jt})M\hat{x}_{ljt} +
  905. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\overline{\lambda}^{lk}_{jt}+\overline{\tau}^{lk}_{jt})M\hat{x}_{ljt} \nonumber\\
  906. & +& \sum\limits_{l\in \mathbf{N}}\sum\limits_{k\in[K]}{p_k}{\overline{\alpha}^{l}_k} -
  907. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\overline{\delta}^{lk}_{jt}M -
  908. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\overline{\tau}^{lk}_{jt}M,
  909. \end{eqnarray}
  910. and the left side as
  911. \begin{eqnarray*}
  912. &\!\!\!\!\!\! \min &
  913. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\gamma^{lk}_{jt}+\delta^{lk}_{jt})M\hat{x}_{ljt} +
  914. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\lambda^{lk}_{jt}+\tau^{lk}_{jt})M\hat{x}_{ljt} \nonumber\\
  915. && + \sum\limits_{l\in \mathbf{N}}\sum\limits_{k\in[K]}{p_k}{\alpha^{l}_k} -
  916. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\delta^{lk}_{jt}M -
  917. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\tau^{lk}_{jt}M\\
  918. &\!\!\!\!\!\!{\rm s.t.} & \eqref{model-c-1}-\eqref{model:dual-S}.
  919. \end{eqnarray*}
  920. Since the constraints are independent of $\hat{\x}$, $(\overline{\halpha}, \overline{\hbeta}, \overline{\hphi}, \overline{\hpi}, \overline{\hvarphi}, \overline{\hvarpi}, \overline{\hrho}, \overline{\hvarrho}, \overline{\htheta}, \overline{\hgamma}, \overline{\hdelta}, \overline{\hvartheta}, \overline{\hlambda}, \overline{\htau})$ is a feasible solution of $D(\hat{\x})$. Since $D(\hat{\x})$ is the solution to a minimization problem,
  921. \begin{eqnarray*}
  922. D(\hat{\x})&\le& \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\overline{\gamma}^{lk}_{jt}+\overline{\delta}^{lk}_{jt})M\hat{x}_{ljt} +
  923. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}(\overline{\lambda}^{lk}_{jt}+\overline{\tau}^{lk}_{jt})M\hat{x}_{ljt} \nonumber\\
  924. &+& \sum\limits_{l\in \mathbf{N}}\sum\limits_{k\in[K]}{p_k}{\overline{\alpha}^{l}_k} -
  925. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{a}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\overline{\delta}^{lk}_{jt}M -
  926. \sum\limits_{l\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}^{c}_l} \sum\limits_{t\in \mathbf{T}_{lj}}\sum\limits_{k\in[K]}\overline{\tau}^{lk}_{jt}M, \forall \hat{\x} \in \mathcal{X}.
  927. \end{eqnarray*}
  928. This is equivalent to \eqref{subgrad}, so we have proved that $\s(\x)$ is a supergradient of $D(\x)$ at $\x$. \blot\\
  929.  
  930. %Furthermore, if we compute $D(\y^m)$ and the supergradient $\s(\y^m)$ with $M$ points $\y^m \in S, \forall m \in [M](M \le |S|)$, we form the following relaxed problem:
  931. %\begin{equation}\label{HP1-ambiguity-XIII}
  932. %\begin{array}{rcll}
  933. %& \min\limits_{\x} &  C(\x) \\[0.5 cm]
  934. %&{\rm s.t.} & D(\y^m) + s(\y^m)^{\top}(\x-\y^m) \ge R_{0}, & \forall m \in [M]\\[0.3 cm]
  935. %&& \x \in S.
  936. %\end{array}
  937. %\end{equation}
  938.  
  939. It is noted that if the optimal solution to the relaxed problem \eqref{HP1-ambiguity-XIII} is feasible in the original problem, it is also the optimal solution of the original problem. We present the overall algorithm below:
  940.  
  941. \noindent\rule{\textwidth}{0.1em}\vspace{-5pt}\\
  942. \noindent {\bf Algorithm 2.} Supergradient-based iterative optimization framework for problem \eqref{HP1-ambiguity-XI}\vspace{-10pt}\\
  943. \noindent\rule{\textwidth}{0.05em} \\
  944. {\bf Input:} $y^0 \in S, m=M=0, LB=-\infty, UB=\infty$. \\
  945. {\bf Output:} Design $\x^{*}$ and associated cost.
  946.  
  947. \begin{enumerate}
  948. \item  Compute $D(\y^m)$;
  949. \item  Compute supergradient $\s(\y^m)$ at $\y^m$, and solve \eqref{HP1-ambiguity-XIII}: if it is infeasible, STOP and declare the overall problem infeasible; otherwise, obtain $\x^{*}$ as the optimal solution;
  950. \item  If $D(\x^*) \ge R_0$, STOP; else, let $m = m + 1$, $M = M + 1$, $\y^m = \x^{*}$ and go to STEP 1;
  951. \item  {\bf return} Design $\x^{*}$ and associated cost $C(\x^{*})$.
  952. \end{enumerate}
  953.  
  954. \vspace{-5pt}
  955. \noindent\rule{\textwidth}{0.1em}\vspace{-17pt}\\
  956. }
  957. \iffalse
  958. Note that the algorithm above produce exact solution to the problem. By relaxing the condition $UB \le LB$ in STEP 3 to $UB - LB \le \epsilon$ for some $\epsilon > 0$, we can also get an algorithm that produces approximate solution to the problem, potentially with much less time.
  959. \fi
  960.  
  961.  
  962.  
  963.  
  964.  
  965. %\section{Computational Study}
  966. %In this section we present numerical experiments of our model, as well as a case study. The computational study consists of six parts: (i) visualizing the result of dimension reduction and clustering; (ii) testing how design changes when parameters vary, including $K$ and other hyperparameters; (iii) choosing best parameter $K$ by cross validation; (iv) comparing our design with a baseline probabilistic model;  (v) experimenting on the value of side information and (vi) a real-life case. The distribution used in experiment (i)-(v) is generated by a real data set.
  967. %All computational result were done on a PC with Intel(R) Core(TM) i7-7500U CPU at 2.7 GHz, coded in Python. The MIP models were solved by library called Gurobi, version 8.1.1.
  968. %
  969. %\subsection{Visualizing clusters}
  970. %In this section we present a visualization of dimension reduction and clustering. We first apply tSNE algorithm, a state-of-art algorithm for dimension reduction and visualization, \iffalse (refer to Maaten L, Hinton G. Visualizing data using t-SNE[J]. Journal of machine learning research, 2008, 9(Nov): 2579-2605.)\fi to reduce the lifetime data from 28 dimensions to 2 dimensions, and then perform K-means clustering. We present figures of different choice of number of clusters, $K=2$ and $K=5$, respectively. As we can see in the figure, the original multi-dimensional data can be well clustered after dimension reduction. The clustering results, including results from other choices of $K$, are used in %following experiments.
  971. %
  972. %\begin{figure}[H]
  973. %\centering
  974. %%\includegraphics{TSNE_1.png}
  975. %\includegraphics[width=\columnwidth]{2D_tsne.jpg}
  976. %\caption{\footnotesize Visualization of dimension reduction by tSNE algorithm and clustering by K-means algorithms, into 2 clusters on the left and 5 clusters on the right. Note that the two figures are produced by different data.}
  977. %\label{figure_TSNE_1}
  978. %\end{figure}
  979. %
  980. %\subsection{System design variation with different parameters}
  981. %
  982. %\begin{figure}[H]
  983. %\centering
  984. %\includegraphics[scale=0.8]{KVARIATION11.pdf}
  985. %\caption{ The multi-type series-parallel system we are experimenting in this section. It consists of 3 subsystems, with 1 type of active parallel component and 2 types of cold-standby component in each subsystem. Each type of component can have up to 3 redundancies.}
  986. %\label{figure1}
  987. %\end{figure}
  988. %
  989. %In this subsection we experiment on adjusting parameters of the model and observe how system design $x$ changed accordingly.  we consider the following setting: a series-parallel system with  subsystems ($|\mathbf{N}|=3$), with each subsystem containing 3 types of components ($|\mathbf{M}_i|= 3, \forall i \in \mathbf{N}$), among which 1 types are active-parallel and 2 types are cold-standby, and each type is of 3 redundant components ($|\mathbf{T}_{ij}|=3, \forall i \in \mathbf{N}, j\in \mathbf{M}_i$). The lifetime requirement $\mathcal{T}_R = 29$.  The parameters we are adjusting includes $K \in [1, 10], R_0 \in \{0.95, 0.97, 0.99\}$, $\epsilon_{ij} \in \{0.05, 0.075, 0.1\},\forall i \in [3], j \in [3]$, and $\Delta$ to adjust $\underline{\hmu}$ and $\overline{\hmu}$:.
  990. %$$
  991. %\underline{\mu}^{k}_{ij}={\nu}^{k}_{ij}-\Delta,
  992. %\quad \overline{\mu}^{k}_{ij}={\nu}^{k}_{ij}+\Delta,
  993. %$$
  994. %where $\Delta \in \{0.025, 0.05, 0.075, 0.1\}$.  Therefore, by changing the values of $\Delta$, we can have different sets of parameters $\underline{\hmu}$ and $\overline{\hmu}$. Specifically, large values of $\Delta$ correspond to the large gaps between $\underline{\hmu}$ and $\overline{\hmu}$.
  995. %
  996. %\begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%
  997. %\caption{ \footnotesize  The design table for different $K$ under $\mathcal{T}_R = 29$, $\epsilon = 0.05$. AP, CS-I and CS-II represents the active parallel type, cold-standby type 1 and cold-standby type 2, respectively.}
  998. %\begin{center}
  999. %\begin{adjustbox}{angle=270}
  1000. %\scalebox{1}{
  1001. %\begin{tabular}{|c|c|c| c|| ccccc|| ccccc|| ccccc||}\hline
  1002. % \multirow{2}{*}{$\epsilon$} & \multirow{2}{*}{$\Delta$} & \multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{5}{c||}{$R_0 = 0.95, K$} & \multicolumn{5}{c||}{$R_0 = 0.97, K$} & \multicolumn{5}{c||}{$R_0 = 0.99, K$}\\
  1003. % \cline{5-19}
  1004. %      &&&&1 & 3 & 5 &  8 &  10 &1 & 3 & 5 &  8 &  10 & 1 & 3 & 5 &  8 &  10  \\
  1005. %     \hline
  1006. %                    &&& AP & 1 & 1 & 1 &1 & 1 & 0 &0 & 0 & 0 & 1 & 1 & 1 & 1 &1 & 1\\
  1007. %        &&1         & CS-l & 1 & 1 & 1 &1 & 1 & 2 & 2 & 2 & 2& 1 & 2 & 2 & 2 &2 & 2\\
  1008. %                    &&& CS-ll & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0\\
  1009. %     \cline{3-19}
  1010. %                    &&& AP & 1 & 1 & 1 &1 & 1 & 1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1 & 1\\
  1011. %        &0.025&2    & CS-l & 0& 0 & 0 & 0 & 1 & 0 & 0 & 0 &0& 0& 0 & 0 & 0 &0 & 0\\
  1012. %                    &&& CS-ll & 2 & 2 & 2 &2 & 2 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  1013. %     \cline{3-19}
  1014. %                    &&& AP &0 & 0 & 1& 1 & 1 & 0 &1 & 1 &1 & 1 & 1 &0 &0 &0 & 0\\
  1015. %        &&3         & CS-l & 1 & 1& 0& 0 & 0 & 1 & 0 & 0& 0 & 1 & 1 & 1 & 1 &1 & 1\\
  1016. %                    &&& CS-ll & 3 & 3 & 3 &3 & 3&  3  & 3 & 3& 3 & 3& 3 & 3 & 3 &3 & 3\\
  1017. %\cline{2-19}
  1018. %\multicolumn{1}{|c|}{} & \multicolumn{3}{c||}{Design cost (k\$)} & 39.5& 39.5&\!\! 39    &39& 39 & 44.5&44&44&\!\!44 & 41 &55 & 54.5 &\!\! 54.5 &54.5& 54.5 \\
  1019. %     \cline{2-19}
  1020. %                    &&& AP& 1 & 1 & 1 &1 & 1 & 0 & 0 & 0 &0 & 0 & 1& 1 & 1 & 1 &1\\
  1021. %        &&1         & CS-l& 1 & 1 & 1 &1 & 1 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2 \\
  1022. %                    &&& CS-ll & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0& 0\\
  1023. %     \cline{3-19}
  1024. %                    &&& AP & 1 & 1 & 1 &1 & 1 & 1 & 1 & 1 &1 & 1 & 1 & 1 & 1 &1 & 1 \\
  1025. %        &0.05&2     & CS-l &1 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0\\
  1026. %                    &&& CS-ll & 2 & 2 & 2 &2 & 2 & 2 & 2 &2 &2 &2& 2 & 2 & 2 &2 &2\\
  1027. %     \cline{3-19}
  1028. %                    &&& AP & 1 & 1 & 1 &1 & 0 & 1& 1 & 0& 0 & 0 & 1 &1 & 1 &1 & 1\\
  1029. %        &&3         & CS-l & 3 & 3 & 3 &3 & 1 & 3 & 3 & 1& 1 & 1 &  3 & 3 & 3 & 3 & 3\\
  1030. %                    &&& CS-ll & 1 & 1 & 1 &1 & 3 & 1 & 1& 3& 3 & 3 & 1 & 1 & 1 &1 & 1\\
  1031. %\cline{2-19}
  1032. %\multicolumn{1}{|c|}{0.05} & \multicolumn{3}{c||}{Design cost (k\$)} & 40&40 &\!\! 40  &40& 39.5 & 45&45&\!\! 44.5 & 44.5 & 44.5 & 55&55&\!\! 55  & 55 & 55\\
  1033. %     \cline{2-19}
  1034. %                     &&& AP& 1& 1 & 1 & 1 &1 & 0 & 0 & 0 &0 & 0 & 1& 1 & 1 & 1 &1\\
  1035. %        &&1           &CS-l2 & 1& 1 & 1 & 1 &1 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  1036. %                    &&& CS-ll & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0 \\
  1037. %     \cline{3-19}
  1038. %                    &&& AP & 1& 1 & 1 & 1 &1 & 1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1  \\
  1039. %        &0.075&2    & CS-l & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0 \\
  1040. %                    &&& CS-ll & 2 & 2 & 2 &2 & 2 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2 \\
  1041. %     \cline{3-19}
  1042. %                    &&& AP & 1& 1 & 1 & 1 &1  & 1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1\\
  1043. %        &&3           & CS-l & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 &3 & 3& 3 & 3 & 3 &3 & 3\\
  1044. %                    &&& CS-ll & 1& 1 & 1 & 1 &1  & 1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1\\
  1045. %\cline{2-19}
  1046. %\multicolumn{1}{|c|}{} & \multicolumn{3}{c||}{Design cost (k\$)} & 40&40&\!\! 40 & 40 & 40 & 45&45&\!\! 45& 45&45 & 55&55 &\!\! 55    & 55& 55\\
  1047. %
  1048. %     \cline{2-19}
  1049. %                    &&& AP& 1& 1 & 1 & 1 &1 & 0 & 0 & 0 &0 & 0 & 1& 1 & 1 & 1 &1\\
  1050. %
  1051. %        &&1         & CS-l & 1& 1 & 1 & 1 &1 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  1052. %                    &&&CS-ll3 & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0\\
  1053. %     \cline{3-19}
  1054. %                    &&& AP & 1& 1 & 1 & 1 &1  &  1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1 \\
  1055. %        &0.1&2          & CS-l & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0 \\
  1056. %                    &&& CS-ll & 2 & 2 & 2 &2 & 2 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  1057. %     \cline{3-19}
  1058. %                    &&& AP &1 & 1& 1 & 1 & 1 & 1  & 1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1 \\
  1059. %        &&3         & CS-l & 3 & 3 & 3 &3 & 3 & 3 & 3 & 3 &3 & 3& 3 & 3 & 3 &3 & 3\\
  1060. %                    &&& CS-ll& 1& 1 & 1 & 1 &1  & 1 & 1 & 1 & 1 &1 & 1& 1 & 1 & 1 & 1 \\
  1061. %\cline{2-19}
  1062. %\multicolumn{1}{|c|}{} & \multicolumn{3}{c||}{Design cost (k\$)} & 40&\!\! 40   & 40&40 & 40 & 45&45&\!\! 45 &45& 45 & 55&55&\!\! 55 & 55 & 55 \\
  1063. %\hline
  1064. %\end{tabular}}
  1065. %\end{adjustbox}
  1066. %\end{center}
  1067. %\end{table}
  1068. %
  1069. %The resulting system design in the case of $\epsilon_{ij} = 0.05$ are shown in the table. We leave the rest of experiment results in the Electronic Companion. Note that when $K = 1$, the model reduces to the case of robust model without clustering by Wang, et. al.
  1070. %
  1071. %The observations of the experimental results are the following:
  1072. %(i) the cost increases when the variation range $[\underline{\hmu}, \overline{\hmu}]$ of expected lifetimes increases, or the dispersion parameter $\epsilon$ of the lifetimes of components enlarges. Such increased cost is due to the enlarged ambiguity set $\mathbb{F}_K$ resulting from the change of above distributional parameters $([\underline{\hmu}, \overline{\hmu}]$. (ii) Intuitively, cost also increases if the required reliability level $R_0$ is increased.
  1073. %(iii) the cost decreases as number of clusters $K$ increases. This is due to to the fact that since the mean and dispersion information of each cluster are included in the ambiguity set, more clusters implies more information and therefore smaller ambiguity set.
  1074. %
  1075. %We then perform out-of-sample tests on the designs $K = 1, 4, 8$. To test the robustness of the design, we generate testing data with smaller lifetime mean and much larger standard deviation. In particular, we let $\hmu_{test} = (1 - \Delta_{m})\hmu_{train}$, and $\hsigma_{test} = \Delta_{s}\hsigma_{train}$. $\Delta_m$ is set to 10\%, and $\Delta_s$ is set to 2000\%, 3000\% and 4000\%, respectively.
  1076. %
  1077. %\begin{table}[htp]\footnotesize%\small%
  1078. %\caption{\label{tab-compare2} \footnotesize The out of sample reliability of designs generated by $K = 1, 4, 8$ models under different $\Delta_{s}$. $R_0$ is set to $0.9$. In the design columns, the 3 columns are corresponding to different subsystems. The three numbers in each column indicates the number of redundancies used that is active parallel, cold-standby type 1, or cold-standby type 2, respectively. }
  1079. %\begin{center}
  1080. %\begin{tabular}{|c||c|c|| c|c|c| c |  c |c   |}\hline
  1081. %\multirow{2}{*}{$\mathcal{T}_R$}  & \multirow{2}{*}{$(\Delta_{m}, \Delta_{s})$} & \multirow{2}{*}{Model} & \multicolumn{3}{c|}{Design} & \multirow{2}{*}{cost} & \multirow{2}{*}{Mean of out-of-sample reliability level} & \multirow{2}{*}{StD} \\
  1082. %\cline{4-6}
  1083. %&&& 1 & 2 & 3 &&&\\
  1084. %\hline
  1085. %\multirow{9}{*}{25} & \multirow{3}{*}{(10\%, 2000\%)} &  K=1 Model & (1,1,0) & (1,2,0) & (1,0,3)& 38.5 &0.999 & 0.035   \\
  1086. %     & & K=4 Model & (1,1,0) & (1,2,0) & (0,3,1)& 38.0 &0.966 & 0.182 \\
  1087. %     & & K=8 Model & (1,1,0) & (1,2,0) & (0,0,3)& 37.0 &0.607 & 0.489  \\
  1088. %    \cline{2-9}
  1089. %     &\multirow{3}{*}{(10\%, 3000\%)}  &  K=1 Model & (1,1,0) & (1,2,0) & (1,0,3)& 38.5 &  0.989 & 0.103   \\
  1090. %     & & K=4 Model & (1,1,0) & (1,2,0) & (0,3,1)& 38.0 &0.925 & 0.263 \\
  1091. %     & & K=8 Model & (1,1,0) & (1,2,0) & (0,0,3)& 37.0 &0.576 & 0.494  \\
  1092. %    \cline{2-9}
  1093. %     &\multirow{3}{*}{(10\%, 4000\%)}  &  K=1 Model & (1,1,0) & (1,2,0) & (1,0,3)& 38.5 &  0.975 & 0.157   \\
  1094. %     & & K=4 Model & (1,1,0) & (1,2,0) & (0,3,1)& 38.0 &0.894 & 0.308 \\
  1095. %     & & K=8 Model & (1,1,0) & (1,2,0) & (0,0,3)& 37.0 &0.551 & 0.497  \\
  1096. %\hline
  1097. %
  1098. %\end{tabular}
  1099. %\end{center}
  1100. %\end{table}
  1101. %
  1102. %\begin{figure}[H]
  1103. %\centering
  1104. %\includegraphics[width=\columnwidth]{Out_of_sample_K.png}
  1105. %\caption{\footnotesize Figure (a)(b)(c) represents the out of sample reliability of designs generated by $K = 1, 4, 8$ models under different $\Delta_{s}$, respectively. The vertical beam represents $\mathcal{T}_R$. The fraction of the lifetime histogram on the right side of beam represents the out-of-sample reliability level.}
  1106. %\label{figureK}
  1107. %\end{figure}
  1108. %
  1109. %From the results, we can observe that with moderate number of clusters ($K = 4$), we can obtain designs with less costs than designs generated by robust model with no clustering ($K = 1$), while retaining robustness even when there is a significant shrink in mean lifetime and much larger standard deviation. This shows that our framework can produce designs that are robust enough and cheaper, by the incorporation of clustering. However, if the number of clusters becomes too large ($K = 8$), the resulting model lost robustness in these extreme tests. Thus, it is crucial to choose the optimal $K$. We present an experiment of choosing $K$ by cross validation in the next subsection.
  1110. %
  1111. %\subsection{Choosing $K$ by cross validation}
  1112. %In this subsection, we present a cross validation experiment by applying Algorithm 2. In particular, we choose $m = 10$ and perform a 10-fold cross validation. The number of instances of constraint violation, as well as the cost of the designs, is plotted in the figure below on the left.  We also offer a combined metric for cross validation. First, the cost and the number of violations are both normalized to the range $[0, 1]$. Then, we compute $(1-\lambda)cost(K) + \lambda{{\#}violation(K)}$, where $\lambda \in [0,1]$. By assigning different values of $\lambda$, we can adapt to scenarios with different cost-violation tradeoffs. In particular, a high $\lambda$ means that the robustness of the design is valued more than the cost; a low $\lambda$ indicates the contrary. The combined metric under different $\lambda$ is plotted in the figure below on the right. Observe that when $\lambda$ is low, large values of $K$ such as 7 and 9 are preferred; when $\lambda$ is high, a moderate $K$ such as 5 is better. Since a large $K$ generally corresponds to a lower cost, this result matches the intuition that people are willing to pay a higher cost if robustness is valued more highly.
  1113. % %least constraint violation occurs, so $K = 5$ is the ideal parameter to cluster this data set. $K = 5$ will be used in the following subsections.
  1114. %\iffalse
  1115. %\begin{figure}[H]
  1116. %\begin{subfigure}{0.5\textwidth}
  1117. %\includegraphics[scale=0.65]{cross_validation.png}
  1118. %\caption{\footnotesize}
  1119. %\label{figure4-1}
  1120. %\end{subfigure}
  1121. %\begin{subfigure}{0.5\textwidth}
  1122. %\centering
  1123. %\includegraphics[scale=0.37]{cross_validation_lambda.png}
  1124. %\caption{\footnotesize }
  1125. %\label{figure4-2}
  1126. %\end{subfigure}
  1127. %\caption{\footnotesize (a) The number of violations and costs with different $K$. (b) Costs penalized by $\lambda$ with different $K$, with each line associated with a different $\lambda$.}
  1128. %\end{figure}
  1129. %\fi
  1130. %
  1131. %\begin{figure}[H]
  1132. %\centering
  1133. %\includegraphics[width=\columnwidth]{cv.jpg}
  1134. %\caption{\footnotesize (a) The number of violations and costs with different $K$. (b) Costs penalized by $\lambda$ with different $K$, with each line associated with a different $\lambda$.}
  1135. %\label{figure4-2}
  1136. %\end{figure}
  1137. %
  1138. %
  1139. %
  1140. %
  1141. %\iffalse
  1142. %\begin{table}[h*]\footnotesize%\small%
  1143. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.85$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  1144. %\begin{center}
  1145. %\begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  1146. %    $\mathcal{T}_R$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  1147. %     \hline
  1148. %    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.834 & 0.372   \\
  1149. %     && C-DRO-Model & (1,1,3) &  &0.997 & 0.053  \\
  1150. %        \cline{2-7}
  1151. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.724 & 0.447   \\
  1152. %     && C-DRO-Model & (1,1,3) &  &0.982 & 0.132  \\
  1153. %        \cline{2-7}
  1154. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.673 & 0.469   \\
  1155. %     && C-DRO-Model & (1,1,3) &  &0.971 & 0.168  \\
  1156. %\hline
  1157. %    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.781 & 0.414   \\
  1158. %     && C-DRO-Model & (1,1,4) &  &0.999 & 0.028  \\
  1159. %                 \cline{2-7}
  1160. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.690 & 0.462   \\
  1161. %     && C-DRO-Model & (1,1,4) &  &0.991 & 0.095  \\
  1162. %                  \cline{2-7}
  1163. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.637 & 0.481   \\
  1164. %     && C-DRO-Model & (1,1,4) &  &0.989 & 0.105  \\
  1165. %\hline
  1166. %    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.716 & 0.451   \\
  1167. %     && C-DRO-Model & (1,1,7) &  &1.000 & 0.000  \\
  1168. %               \cline{2-7}
  1169. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.640 & 0.480   \\
  1170. %     && C-DRO-Model & (1,1,7) &  &0.998 & 0.040  \\
  1171. %               \cline{2-7}
  1172. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.602 & 0.490   \\
  1173. %     && C-DRO-Model & (1,1,7) &  &0.999 & 0.035  \\
  1174. %\hline
  1175. %\end{tabular}
  1176. %\end{center}
  1177. %\end{table}
  1178. %
  1179. %\begin{table}[h*]\footnotesize%\small%
  1180. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.90$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  1181. %\begin{center}
  1182. %\begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  1183. %    $\mathcal{T}_R$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  1184. %     \hline
  1185. %    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.618 & 0.486   \\
  1186. %     && C-DRO-Model & (1,1,3) &  &0.998 & 0.047  \\
  1187. %        \cline{2-7}
  1188. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.576 & 0.494   \\
  1189. %     && C-DRO-Model & (1,1,3) &  &0.989 & 0.103   \\
  1190. %        \cline{2-7}
  1191. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.560 & 0.496   \\
  1192. %     && C-DRO-Model & (1,1,3) &  &0.988 & 0.111   \\
  1193. %\hline
  1194. %    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.528 & 0.499   \\
  1195. %     && C-DRO-Model & (1,1,4) &  &0.999 & 0.037   \\
  1196. %                 \cline{2-7}
  1197. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.538 & 0.499   \\
  1198. %     && C-DRO-Model & (1,1,4) &  &0.996 & 0.063   \\
  1199. %                  \cline{2-7}
  1200. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.530 & 0.499   \\
  1201. %     && C-DRO-Model & (1,1,4) &  &0.994 & 0.080   \\
  1202. %\hline
  1203. %    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.458 & 0.498   \\
  1204. %     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060  \\
  1205. %               \cline{2-7}
  1206. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.481 & 0.500   \\
  1207. %     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060   \\
  1208. %               \cline{2-7}
  1209. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.496 & 0.500   \\
  1210. %     && C-DRO-Model & (1,1,7) &  &0.995 & 0.069  \\
  1211. %\hline
  1212. %\end{tabular}
  1213. %\end{center}
  1214. %\end{table}
  1215. %
  1216. %\begin{table}[h*]\footnotesize%\small%
  1217. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.90$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  1218. %\begin{center}
  1219. %\begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  1220. %    $\mathcal{T}_R$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  1221. %     \hline
  1222. %    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.143 & 0.350   \\
  1223. %     && C-DRO-Model & (1,1,3) &  &0.653 & 0.476  \\
  1224. %        \cline{2-7}
  1225. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.576 & 0.494   \\
  1226. %     && C-DRO-Model & (1,1,3) &  &0.989 & 0.103   \\
  1227. %        \cline{2-7}
  1228. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.560 & 0.496   \\
  1229. %     && C-DRO-Model & (1,1,3) &  &0.988 & 0.111   \\
  1230. %\hline
  1231. %    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.528 & 0.499   \\
  1232. %     && C-DRO-Model & (1,1,4) &  &0.999 & 0.037   \\
  1233. %                 \cline{2-7}
  1234. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.538 & 0.499   \\
  1235. %     && C-DRO-Model & (1,1,4) &  &0.996 & 0.063   \\
  1236. %                  \cline{2-7}
  1237. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.530 & 0.499   \\
  1238. %     && C-DRO-Model & (1,1,4) &  &0.994 & 0.080   \\
  1239. %\hline
  1240. %    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.458 & 0.498   \\
  1241. %     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060  \\
  1242. %               \cline{2-7}
  1243. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.481 & 0.500   \\
  1244. %     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060   \\
  1245. %               \cline{2-7}
  1246. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.496 & 0.500   \\
  1247. %     && C-DRO-Model & (1,1,7) &  &0.995 & 0.069  \\
  1248. %\hline
  1249. %\end{tabular}
  1250. %\end{center}
  1251. %\end{table}
  1252. %
  1253. %\begin{table}[h*]\footnotesize%\small%
  1254. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.90$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  1255. %\begin{center}
  1256. %\begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  1257. %    $\mathcal{T}_R$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  1258. %     \hline
  1259. %    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.348 & 0.476   \\
  1260. %     && C-DRO-Model & (1,1,3) &  &0.930 & 0.256  \\
  1261. %        \cline{2-7}
  1262. %     & \multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.427 & 0.495   \\
  1263. %     && C-DRO-Model & (1,1,3) &  &0.948 & 0.221  \\
  1264. %        \cline{2-7}
  1265. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.445 & 0.497   \\
  1266. %     && C-DRO-Model & (1,1,3) &  &0.957 & 0.203   \\
  1267. %\hline
  1268. %    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.278 & 0.448   \\
  1269. %     && C-DRO-Model & (1,1,4) &  &0.933 & 0.250   \\
  1270. %                 \cline{2-7}
  1271. %     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.381 & 0.486   \\
  1272. %     && C-DRO-Model & (1,1,4) &  &0.974 & 0.159   \\
  1273. %                 \cline{2-7}
  1274. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.409 & 0.492   \\
  1275. %     && C-DRO-Model & (1,1,4) &  &0.979 & 0.143   \\
  1276. %\hline
  1277. %    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.214 & 0.410   \\
  1278. %     && C-DRO-Model & (1,1,7) &  &0.909 & 0.287  \\
  1279. %               \cline{2-7}
  1280. %     & \multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.345 & 0.475   \\
  1281. %     && C-DRO-Model & (1,1,7) &  &0.973 & 0.163  \\
  1282. %               \cline{2-7}
  1283. %     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.383 & 0.486   \\
  1284. %     && C-DRO-Model & (1,1,7) &  &0.985 & 0.122   \\
  1285. %\hline
  1286. %\end{tabular}
  1287. %\end{center}
  1288. %\end{table}
  1289. %\fi
  1290. %
  1291. %\iffalse
  1292. %From the experiment result, we can observe that despite a smaller ambiguity set, our design ($x^{(2)}$) can achieve robustness level that is comparable to the design without clustering $x^{(1)}$, and are far better than the baseline probabilistic model ($x^{(3)}$).
  1293. %\fi
  1294. %
  1295. %\subsection{Comparison with a baseline probabilistic model}
  1296. %
  1297. %\begin{figure}[H]
  1298. %\centering
  1299. %\includegraphics[scale=0.8]{KVARIATION55.pdf}
  1300. %\caption{\footnotesize The series-parallel system we study in this section consists of a single type of component, with the active parallel strategy only. This simplicity is due to the limitations of the baseline probabilistic model.}
  1301. %\label{figure1}
  1302. %\end{figure}
  1303. %
  1304. %To illustrate the performance of our robust reliability model, we compare the design ($x^{(1)}$) obtained from the proposed robust redundancy optimization model with the design ($x^{(2)}$) obtained from a probabilistic redundancy optimization model. We choose $K = 5$, corresponding to $\lambda = 0.8$ in the previous subsection. As mentioned in the Introduction and Literature Review, when the situation involves multiple types ({\it i.e.,} $|\mathbf{M}_i|>1$), or when both cold-standby and active parallel redundant subsystems are considered, the probabilistic model generally becomes intractable. Therefore, for a fair comparison, we consider a series-parallel system with $|\mathbf{N}| = 3$ and $|\mathbf{M}_i|=1, \forall i \in [3]$, which preserves a linear MIP formulation for the probabilistic model. For a coherent exposition of the experimental study, we place the details of the probabilistic redundancy model as well as its MIP transformation in the Electronic Companion.
  1305. %
  1306. %\iffalse
  1307. %In particular, we first randomly generate lifetime samples (size=2500) and then compute the probability levels $\P[\tilde{z}_{i}\le \mathcal{T}_R ], \forall i \in [3]$ and the parameters $(\hnu, \underline{\hmu}, \overline{\hmu}, \bm{\hsigma}, \p)$ from the generated lifetime samples for parameter inputs of the probabilistic and robust models, respectively, where $R_0=0.95$ and $\mathcal{T}_R=7.5$. We obtain the designs by solving the respective redundancy models. \fi We perform out-of-sample experiments in a similar way to the out-of-sample test performed in Section 5.2. First, we keep the lifetime mean the same ($\Delta_m = 0$) but increase the out-of-sample standard deviation ($\Delta_s = 500\%, 1000\%, 1500\%$, respectively). This corresponds to the first row of the figure. Then, we shrink the lifetime mean ($\Delta_m = 10\%, 12.5\%, 15\%$) at the same time as $\Delta_s$ enlarges. This corresponds to the second row of the figure. The out-of-sample system lifetimes are compared and plotted in the figure, and the comparison of out-of-sample reliability levels is provided in the table.
  1308. %\begin{table}[H]\footnotesize%\small%
  1309. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.95$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  1310. %\begin{center}
  1311. %\begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  1312. %    $\mathcal{T}_R$ & $(\Delta_{m}, \Delta_{s})$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  1313. %     \hline
  1314. %    \multirow{12}{*}{7.5} & \multirow{2}{*}{(0, 500\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.952 & 0.214   \\
  1315. %     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  1316. %        \cline{2-7}
  1317. %     &\multirow{2}{*}{(0, 1000\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.886 & 0.318   \\
  1318. %     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  1319. %        \cline{2-7}
  1320. %     &\multirow{2}{*}{(0, 1500\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.800 & 0.400   \\
  1321. %     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  1322. %     \cline{2-7}
  1323. %     & \multirow{2}{*}{(10\%, 500\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.408 & 0.492   \\
  1324. %     && C-DRO-Model & (1,2,8) &  &0.995 & 0.069  \\
  1325. %        \cline{2-7}
  1326. %     &\multirow{2}{*}{(12.5\%, 1000\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.328 & 0.469   \\
  1327. %     && C-DRO-Model & (1,2,8) &  &0.970 & 0.172  \\
  1328. %        \cline{2-7}
  1329. %     &\multirow{2}{*}{(15\%, 1500\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.291 & 0.454   \\
  1330. %     && C-DRO-Model & (1,2,8) &  &0.959 & 0.198  \\
  1331. %\hline
  1332. %\end{tabular}
  1333. %\end{center}
  1334. %\end{table}
  1335. %
  1336. %\begin{figure}[H]
  1337. %\centering
  1338. %\includegraphics[width=\columnwidth]{Out_of_sample_def.png}
  1339. %\caption{\footnotesize The out-of-sample system lifetime scenarios of the robust model with clustering and the probabilistic model under different $\Delta_{m}$ and $\Delta_{s}$. Figures (a), (b), and (c) represent the scenario in which the mean of the out-of-sample data is kept the same, with different values of $\Delta_{s}$ that are much larger than in the training set; figures (d), (e), and (f) represent the scenario in which the mean of the out-of-sample data shrinks ($\Delta_{m} > 0$), in addition to a much larger $\Delta_{s}$. The vertical beam represents $\mathcal{T}_R$. The fraction of the lifetime histogram on the right side of the beam represents the out-of-sample reliability level.}
  1340. %\label{figure_out_of_sample_test_def}
  1341. %\end{figure}
  1342. %
  1343. %We can observe that when the mean lifetime is kept the same, the robust model produces very robust designs, while the baseline probabilistic model produces a design that works fine for moderate $\Delta_s$ but becomes unsatisfactory under larger $\Delta_s$; when the lifetime mean shrinks, however, the out-of-sample reliability of the design from the baseline model becomes extremely poor, while the design from the robust model can still keep the reliability level intact. This illustrates that our model outperforms the baseline model in that it is significantly more robust, especially under extreme circumstances. In addition, as mentioned before, our model is tractable for multi-type mixed strategy systems, whereas the probabilistic model becomes intractable. Thus our model is superior in both robustness and computability.
  1344. %
  1345. %
  1346. %\subsection{Value of side information}
  1347. %In this subsection we experiment on clustering according to side information. The system we are studying is the same as the one in Section 5.2. We choose $K = 5$, corresponding to $\lambda = 0.8$ in the cross validation section. When generating samples from distributions, we also obtain the side information of which of the 5 distributions the sample is drawn from. We then cluster the data set by $K = 5$, based solely on the side information, and compute parameters $(\hnu, [\underline{\hmu}, \overline{\hmu}], \bm{\hsigma}, \p)$ of the model from it. We obtain design ($x^{(2)}$) from the model, and compare it with design ($x^{(1)}$) obtained from the model in which $K$ is also 5, but is directly clustered based on lifetime information instead of the side information.
  1348. %
  1349. %\begin{table}[H]\label{d-table}\scriptsize%\footnotesize%\small%
  1350. %\caption{ \footnotesize  The design table for $K = 5$ model with and without side information (S.I). AP, CS-I and CS-II represents the active parallel type, cold-standby type 1 and cold-standby type 2, respectively.}
  1351. %\begin{center}
  1352. %\begin{tabular}{|c|c| c|| ccccc || ccccc || ccccc |}\hline
  1353. %\multirow{2}{*}{S.I} & \multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{5}{c||}{$R_0=0.95$,~$\mathcal{T}_R$ (yrs) }& \multicolumn{5}{c||}{$R_0=0.97$,~$\mathcal{T}_R$ (yrs)}& \multicolumn{5}{c|}{$R_0=0.99$,~$\mathcal{T}_R$ (yrs)} \\
  1354. % \cline{4-18}
  1355. %      &&&28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30  \\
  1356. %     \cline{1-18}
  1357. %                    && AP & 1 & 1 &1 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 1 & 1 & 1 & 1 & 1 \\                       &
  1358. %        1            & CS-l & 1 & 1 &1 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 2 & 2 & 2 & 2 & 2 \\
  1359. %                    &&CS-ll & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  1360. %     \cline{2-18}
  1361. %                    && AP  & 1 & 1 &1 & 1 & 1  & 1 & 1 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\ With  &
  1362. %        2            & CS-l & 1 & 0 &0 & 0 & 0  & 1 & 0 &0 & 0 & 3    & 1 & 0 & 0 & 0 & 3 \\ S.I &
  1363. %                     & CS-ll & 1 & 2 &2 & 2 & 2  & 1 & 2 &2 & 2 & 0    & 1 & 2 & 2 & 2 & 0 \\
  1364. %     \cline{2-18}
  1365. %                    && AP & 1 & 0 &1 & 1 & 0  & 1 & 1 &0 & 1 & 1    & 0 & 0 & 0 & 1 & 1 \\                       &
  1366. %        3            & CS-l & 0 & 1 &3 & 0 & 1  & 0 & 0 &1 & 3 & 2    & 1 & 1 & 1 & 3 & 2 \\
  1367. %                    && CS-ll & 3 & 3 &1 & 3 & 3  & 3 & 3 &3 & 1 & 2    & 3 & 3 & 3 & 1 & 2 \\
  1368. %\hline
  1369. %\multicolumn{3}{|c||}{Design cost (k\$)} & 38.5& 39.5 &40&\!\! 44 &\!\! 44.5    & 43.5 & 44 &44.5& 45 & 46    & 54 & \!\! 54.5\!\! &54.5\!\!& 55\!\! &56 \\
  1370. %\hline
  1371. %\multicolumn{3}{|c||}{Cost saved (k\$)} & 1 & 4.5 &4&\!\! 0 &\!\! 1    & 0.5 & 1 &0.5& 1 & 1.5    & 0 & \!\! 0.5\!\! &0.5\!\!& 0\!\! &0 \\
  1372. %\cline{1-18}
  1373. %     \cline{2-18}
  1374. %                    && AP & 1 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 1 & 1 & 1 & 1 & 1 \\                          &
  1375. %        1            & CS-l & 1 & 2 &2 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 2 & 2 & 2 & 2 & 2 \\
  1376. %                    && CS-ll & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  1377. %     \cline{2-18}
  1378. %                    && AP & 1 & 1 &1 & 1 & 1  & 1 & 1 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\ Without &
  1379. %        2            & CS-l & 1 & 2 &0 & 0 & 3  & 1 & 0 &0 & 2 & 2    & 1 & 0 & 0 & 0 & 3 \\ S.I &
  1380. %                     & CS-ll & 1 & 0 &2 & 2 & 0  & 1 & 2 &2 & 1 & 1    & 1 & 2 & 2 & 2 & 0 \\
  1381. %     \cline{2-18}
  1382. %                    && AP  & 1 & 1 &1 & 1 & 1  & 0 & 1 &1 & 1 & 0    & 0 & 1 & 1 & 1 & 1 \\                          &
  1383. %       3             & CS-l & 3 & 3 &0 & 0 & 3  & 1 & 3 &3 & 3 & 2    & 1 & 3 & 3 & 3 & 2 \\
  1384. %                    && CS-ll& 1 & 1 &3 & 3 & 1  & 3 & 1 &1 & 1 & 3    & 3 & 1 & 1 & 1 & 2 \\
  1385. %\hline
  1386. %\multicolumn{3}{|c||}{Design cost (k\$)} & 39.5& 44 &44&\!\! 44 &\!\! 45.5    & 44 & 45 &45& 46 & 47.5    & 54 & \!\! 55\!\! &55\!\!& 55\!\! &56 \\
  1387. %\hline
  1388. %\end{tabular}
  1389. %\end{center}
  1390. %\end{table}
  1391. %
  1392. %The result shows that when side information is incorporated, we can achieve a design with a much lower cost. We then choose $\mathcal{T}_R = 28.5$, which corresponds to the largest cost savings, and perform out-of-sample tests in a similar way to the previous experiments. We can observe that even with this significant cost saving, the design obtained by clustering on side information still performs well enough under mean shrinkage and a large standard deviation.
  1393. %
  1394. %\begin{table}[htp]\footnotesize%\small%
  1395. %\caption{\label{tab-compare3} \footnotesize The out-of-sample results of the designs obtained at $\mathcal{T}_R = 28.5$. $R_0$ is set to $0.9$. In the design columns, the 3 columns correspond to different subsystems. The three numbers in each column indicate the number of redundancies used that are active parallel, cold-standby type I, or cold-standby type II, respectively. }
  1396. %\begin{center}
  1397. %\begin{tabular}{|c||c|c|| c|c|c| c |  c |c  |}\hline
  1398. %\multirow{2}{*}{$\mathcal{T}_R$} & \multirow{2}{*}{($\Delta_{m}, \Delta_{s}$)} &\multirow{2}{*}{Model} & \multicolumn{3}{c|}{Design} & \multirow{2}{*}{cost}  & \multirow{2}{*}{Mean of out-of-sample reliability level} & \multirow{2}{*}{StD} \\
  1399. %\cline{4-6}
  1400. %&&& 1 & 2 & 3 &&&\\
  1401. %\hline
  1402. %\multirow{6}{*}{28.5} & \multirow{2}{*}{(5\%, 800\%)} &  With S.I & (1,1,0) & (1,0,2) & (0,1,3)& 39.5 &0.985 & 0.122   \\
  1403. %     & & Without S.I & (0,2,0) & (1,2,0) & (1,3,1)& 44  &1.000 & 0.000 \\
  1404. %    \cline{2-9}
  1405. %     &\multirow{2}{*}{(5\%, 1000\%)}  &  With S.I & (1,1,0) & (1,0,2) & (0,1,3)& 39.5 & 0.970 & 0.172   \\
  1406. %     & & Without S.I & (0,2,0) & (1,2,0) & (1,3,1)& 44  &0.999 & 0.028 \\
  1407. %    \cline{2-9}
  1408. %     &\multirow{2}{*}{(5\%, 1200\%)}  &  With S.I & (1,1,0) & (1,0,2) &(0,1,3)& 39.5 & 0.968 & 0.175   \\
  1409. %     & & Without S.I & (0,2,0) & (1,2,0) & (1,3,1)& 44  &0.996 & 0.060 \\
  1410. %\hline
  1411. %\end{tabular}
  1412. %\end{center}
  1413. %\end{table}
  1414. %
  1415. %\begin{figure}[H]
  1416. %\centering
  1417. %\includegraphics[width=\columnwidth]{Out_of_sample_side.png}
  1418. %\caption{\footnotesize The out-of-sample test result of designs obtained at $\mathcal{T}_R = 28.5$, with and without side information (S.I).  The vertical beam represents $\mathcal{T}_R$. The fraction of the lifetime histogram on the right side of beam represents the out-of-sample reliability level.}
  1419. %\label{figure4}
  1420. %\end{figure}
  1421. %
  1422. %
  1423. %
  1424. %\iffalse
  1425. %\begin{table}[H]\footnotesize%\small%
  1426. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.95$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  1427. %\begin{center}
  1428. %\begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  1429. %    $\mathcal{T}_R$ & $\Delta_{m}, \Delta_{s}$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  1430. %     \hline
  1431. %    \multirow{12}{*}{7.5} & \multirow{2}{*}{1, 5}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.952 & 0.214   \\
  1432. %     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  1433. %        \cline{2-7}
  1434. %     &\multirow{2}{*}{1, 10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.886 & 0.318   \\
  1435. %     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  1436. %        \cline{2-7}
  1437. %     &\multirow{2}{*}{1, 20}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.800 & 0.400   \\
  1438. %     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  1439. %\hline
  1440. %\end{tabular}
  1441. %\end{center}
  1442. %\end{table}
  1443. %
  1444. %\begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%
  1445. %\caption{ \footnotesize  The design table for $K = 10$ model with side information}
  1446. %\begin{center}
  1447. %\begin{tabular}{|c| c|| ccccc || ccccc || ccccc |}\hline
  1448. %\multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{5}{c||}{$R_0=0.95$,~$\mathcal{T}_R$ (yrs) }& \multicolumn{5}{c||}{$R_0=0.97$,~$\mathcal{T}_R$ (yrs)}& \multicolumn{5}{c|}{$R_0=0.99$,~$\mathcal{T}_R$ (yrs)} \\
  1449. % \cline{3-17}
  1450. %      &&28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30  \\
  1451. %     \hline
  1452. %                    & 1 & 1 & 1 &1 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 1 & 1 & 1 & 1 & 1 \\
  1453. %        1           & 2 & 1 & 1 &1 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 2 & 2 & 2 & 2 & 2 \\
  1454. %                    & 3 & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  1455. %     \hline
  1456. %                    & 1 & 1 & 1 &1 & 1 & 1  & 1 & 1 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\
  1457. %        2           & 2 & 1 & 0 &0 & 0 & 0  & 1 & 0 &0 & 0 & 3    & 1 & 0 & 0 & 0 & 3 \\
  1458. %                    & 3 & 1 & 2 &2 & 2 & 2  & 1 & 2 &2 & 2 & 0    & 1 & 2 & 2 & 2 & 0 \\
  1459. %     \hline
  1460. %                    & 1 & 1 & 0 &1 & 1 & 0  & 1 & 1 &0 & 1 & 1    & 0 & 0 & 0 & 1 & 1 \\
  1461. %       3            & 2 & 0 & 1 &3 & 0 & 1  & 0 & 0 &1 & 3 & 2    & 1 & 1 & 1 & 3 & 2 \\
  1462. %                    & 3 & 3 & 3 &1 & 3 & 3  & 3 & 3 &3 & 1 & 2    & 3 & 3 & 3 & 1 & 2 \\
  1463. %\hline
  1464. %\multicolumn{2}{|c||}{Design cost (k\$)} & 38.5& 39.5 &40&\!\! 44 &\!\! 44.5    & 43.5 & 44 &44.5& 45 & 46    & 54 & \!\! 54.5\!\! &54.5\!\!& 55\!\! &56\\
  1465. %\hline
  1466. %\end{tabular}
  1467. %\end{center}
  1468. %\end{table}
  1469. %
  1470. %
  1471. %\begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%
  1472. %\caption{ \footnotesize  The design table for $K = 10$ model without side information}
  1473. %\begin{center}
  1474. %\begin{tabular}{|c| c|| ccccc || ccccc || ccccc |}\hline
  1475. %\multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{5}{c||}{$R_0=0.95$,~$\mathcal{T}_R$ (yrs) }& \multicolumn{5}{c||}{$R_0=0.97$,~$\mathcal{T}_R$ (yrs)}& \multicolumn{5}{c|}{$R_0=0.99$,~$\mathcal{T}_R$ (yrs)} \\
  1476. % \cline{3-17}
  1477. %      &&28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30  \\
  1478. %     \hline
  1479. %                    & 1 & 1 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 1 & 1 & 1 & 1 & 1 \\
  1480. %        1           & 2 & 1 & 2 &2 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 2 & 2 & 2 & 2 & 2 \\
  1481. %                    & 3 & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  1482. %     \hline
  1483. %                    & 1 & 1 & 1 &1 & 1 & 1  & 1 & 1 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\
  1484. %        2           & 2 & 1 & 2 &0 & 0 & 3  & 1 & 0 &0 & 2 & 2    & 1 & 0 & 0 & 0 & 3 \\
  1485. %                    & 3 & 1 & 0 &2 & 2 & 0  & 1 & 2 &2 & 1 & 1    & 1 & 2 & 2 & 2 & 0 \\
  1486. %     \hline
  1487. %                    & 1 & 1 & 1 &1 & 1 & 1  & 0 & 1 &1 & 1 & 0    & 0 & 1 & 1 & 1 & 1 \\
  1488. %       3            & 2 & 3 & 3 &0 & 0 & 3  & 1 & 3 &3 & 3 & 2    & 1 & 3 & 3 & 3 & 2 \\
  1489. %                    & 3 & 1 & 1 &3 & 3 & 1  & 3 & 1 &1 & 1 & 3    & 3 & 1 & 1 & 1 & 2 \\
  1490. %\hline
  1491. %\multicolumn{2}{|c||}{Design cost (k\$)} & 39.5& 44 &44&\!\! 44 &\!\! 45.5    & 44 & 45 &45& 46 & 47.5    & 54 & \!\! 55\!\! &55\!\!& 55\!\! &56\\
  1492. %\hline
  1493. %\end{tabular}
  1494. %\end{center}
  1495. %\end{table}
  1496. %\fi
  1497. %
  1498. %\iffalse
  1499. %Data creating process finised
  1500. %Academic license - for non-commercial use only
  1501. %cost is 38.5003857478587 , \mathcal{T}_R is 26, R_0 is 0.95
  1502. %
  1503. %(((1.0, -0.0, -0.0), (1.0, -0.0, -0.0), (7.563683503866358e-06, 7.563683503866358e-06, 7.563683503866358e-06)), ((-0.0, -0.0, 1.0), (-0.0, 1.0, -0.0), (1.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, -0.0, 1.0), (1.0, 1.0, -0.0)))
  1504. %cost is 43.0 , \mathcal{T}_R is 26, R_0 is 0.97
  1505. %
  1506. %(((-0.0, -0.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, 1.0, -0.0), (1.0, -0.0, 1.0)))
  1507. %cost is 53.0 , \mathcal{T}_R is 26, R_0 is 0.99
  1508. %
  1509. %(((-0.0, 1.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, 1.0, -0.0), (-0.0, 1.0, 1.0)))
  1510. %cost is 44.0 , \mathcal{T}_R is 27, R_0 is 0.95
  1511. %
  1512. %(((-0.0, -0.0, -0.0), (1.0, -0.0, 1.0), (0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (1.0, -0.0, -0.0), (1.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, -0.0, 1.0), (1.0, 1.0, 1.0)))
  1513. %cost is 44.0 , \mathcal{T}_R is 27, R_0 is 0.97
  1514. %
  1515. %(((-0.0, -0.0, -0.0), (1.0, 1.0, -0.0), (0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, 1.0, -0.0), (0.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, 1.0, -0.0), (1.0, 1.0, 1.0)))
  1516. %cost is 54.0 , \mathcal{T}_R is 27, R_0 is 0.99
  1517. %
  1518. %(((-0.0, 1.0, -0.0), (-0.0, 1.0, 1.0), (0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (1.0, -0.0, -0.0), (-0.0, -0.0, 1.0)), ((-0.0, -0.0, -0.0), (-0.0, 1.0, -0.0), (1.0, 1.0, 1.0)))
  1519. %cost is 44.0 , \mathcal{T}_R is 28, R_0 is 0.95
  1520. %
  1521. %(((-0.0, -0.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, -0.0, 1.0), (1.0, 1.0, 1.0)))
  1522. %cost is 45.00017123897305 , \mathcal{T}_R is 28, R_0 is 0.97
  1523. %
  1524. %(((5.707965768311441e-06, 5.707965768311441e-06, 5.707965768311441e-06), (1.0, 1.0, -0.0), (0.0, 0.0, -0.0)), ((0.0, 1.0, -0.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 0.0), (-0.0, 1.0, -0.0), (1.0, 1.0, 1.0)))
  1525. %cost is 54.0 , \mathcal{T}_R is 28, R_0 is 0.99
  1526. %
  1527. %(((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, -0.0, 1.0), (1.0, 1.0, 1.0)))
  1528. %cost is 45.0 , \mathcal{T}_R is 29, R_0 is 0.95
  1529. %
  1530. %(((-0.0, -0.0, -0.0), (1.0, -0.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, -0.0, -0.0), (1.0, -0.0, 1.0)), ((-0.0, 1.0, -0.0), (1.0, 1.0, 1.0), (-0.0, -0.0, 1.0)))
  1531. %cost is 54.5 , \mathcal{T}_R is 29, R_0 is 0.97
  1532. %
  1533. %(((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((1.0, -0.0, -0.0), (-0.0, -0.0, -0.0), (1.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, -0.0, -0.0), (1.0, 1.0, 1.0)))
  1534. %cost is 55.0 , \mathcal{T}_R is 29, R_0 is 0.99
  1535. %
  1536. %(((-0.0, -0.0, 1.0), (1.0, 1.0, -0.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, -0.0, -0.0), (1.0, -0.0, 1.0)), ((-0.0, 1.0, -0.0), (1.0, 1.0, 1.0), (-0.0, 1.0, -0.0)))
  1537. %cost is 55.0 , \mathcal{T}_R is 30, R_0 is 0.95
  1538. %
  1539. %(((-0.0, -0.0, 1.0), (1.0, 1.0, -0.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, -0.0, -0.0), (1.0, 1.0, 1.0)))
  1540. %cost is 55.5 , \mathcal{T}_R is 30, R_0 is 0.97
  1541. %
  1542. %(((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, 1.0, -0.0)))
  1543. %cost is 56.0 , \mathcal{T}_R is 30, R_0 is 0.99
  1544. %
  1545. %(((1.0, -0.0, -0.0), (1.0, 1.0, -0.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((1.0, 0.0, -0.0), (1.0, 1.0, -0.0), (-0.0, 1.0, 1.0)))
  1546. %\fi
  1547. %
  1548. %\subsection{Case study}
  1549. %
  1550. %
  1551. %\section{Conclusion}
  1552. %
  1553. %\begin{thebibliography}{}
  1554. %\bibitem{Aggarwal2012}Aggarwal, C. C., Y. Zhao, P. S. Yu. 2012. On Text Clustering with Side Information. {\em 2012 IEEE 28th International Conference on Data Engineering}, 894--904.
  1555. %
  1556. %\bibitem{Ardakan2014} Ardakan, M. A., A. Z. Hamadani. 2014. Reliability-redundancy allocation problem with cold-standby redundancy strategy. {\em Simulation Modelling Practice and Theory}~{\bf 42}:107--118.
  1557. %
  1558. %\bibitem{Bertsimas2011}Bertsimas, D., M. Sim. 2004. The price of robustness. {\em Operations Research}~{\bf 52}(1):35--53.
  1559. %
  1560. %\bibitem{Bhunia2010} Bhunia, A. K., L. Sahoo,  D. Roy. 2010. Reliability stochastic optimization
  1561. %for a series system with interval component reliability via genetic
  1562. %algorithm.  {\em Appl.Math. Computat.}~{\bf 216}~(3): 929--939, 2010.
  1563. %
  1564. %
  1565. %\bibitem{Chern1992}Chern, M.S. 1992. On the computational complexity of reliability redundancy allocation in a series system. {\em Operations research letters}~{\bf 11}~(5):309--315.
  1566. %
  1567. %\bibitem{Cheng2009}Cheng, Z., X. Wang, C. Tian, F. Wang. 2009. Mission reliability simulation of High-speed EMU service braking system. {\em Proceedings of the 8th International Conference on Reliability, Maintainability and Safety} (ICRMS 2009), 253--256.
  1568. %
  1569. %\bibitem{Coit1996}Coit, D. W., A.E. Smith. 1996. Solving the redundancy allocation problem using a combined neural network/genetic algorithm approach. {\em Computers \& Operations Research}~{\bf 23}~(6):515--526.
  1570. %
  1571. %\bibitem{Coit1998}Coit, D. W., A.E. Smith. 1998. Redundancy allocation to maximize a lower percentile of the system time-to-failure distribution. {\em IEEE Transactions on Reliability}~ {\bf 47}~(1):79--87.
  1572. %
  1573. %\bibitem{Coit2003}Coit, D. W. 2003. Maximization of system reliability with a choice of redundancy strategies. {\em IIE Transactions}~{\bf 35}~(6):535--543.
  1574. %
  1575. %\bibitem{Coit2004} Coit, D.W.,  T. Jin,  N. Wattanapongsakorn. 2004. System optimization
  1576. %with component reliability estimation uncertainty: A multi-criteria approach. {\em IEEE Trans. Rel.}~ {\bf 53}~(3) :  369--380, 2004.
  1577. %
  1578. %
  1579. %
  1580. %
  1581. %\bibitem{Elsayed2012}Elsayed, E A. 2012. {\em Reliability Engineering}. 2nd Edition. Wiley.
  1582. %
  1583. %
  1584. %\bibitem{Govindan2017}Govindan, K., A. Jafarian, M.E. Azbari, T.M. Choi. 2017. Optimal bi-objective redundancy allocation for systems reliability and risk management. {\em IEEE Transactions on Cybernetics}~{\bf 46}~(8):1735--1748.
  1585. %
  1586. %
  1587. %
  1588. %%\bibitem{Lam2012}Lam SW, T.S. Ng, and M. Sim. (2012). Multiple objectives satisficing under uncertainty. To appear in Operations Research, 2012.
  1589. %
  1590. %%\bibitem{Lin2011}Lin J, Muthuraman K, Lawley M (2011) Optimal and approximate algorithms for sequential clinical scheduling with no-shows. {\it IIE Transactions on Healthcare Systems Engineering} 1:20--36.
  1591. %
  1592. % %\bibitem{McCarthy2000} McCarthy K, McGee HM, O'Boyle CA. 2000. Outpatient clinic waiting times and non-attendance as indicators of quality. {\it Psychology, Health and Medicine} 5: 287--293.
  1593. %
  1594. %\bibitem{Grani2017} Hanasusanto, G. A., V. Roitch, D. Kuhn, W. Wiesemann. 2017. Ambiguous joint chance constraints under mean and dispersion information. {\em Operations Research}~{\bf 65}~(3):715--767.
  1595. %
  1596. %
  1597. %
  1598. %
  1599. %
  1600. %\bibitem{Elegbede2003}Elegbede, A.C., C. Chu, K.H. Adjallah, F. Yalaoui. 2003. Reliability allocation through cost minimization. {\em IEEE Transactions on reliability}~{\bf 52}~(1):106--111.
  1601. %
  1602. %\bibitem{Feizollahi2012} Feizollahi, M.J., M. Modarres. 2012. The robust deviation redundancy allocation problem with interval component reliabilities. {\em IEEE Transactions on reliability}~{\bf 61}~(4):957--965.
  1603. %
  1604. %
  1605. %
  1606. %
  1607. %
  1608. %\bibitem{Feizollahi2014}Feizollahi, M.J., S. Ahmed, M. Modarres. 2014. The robust redundancy allocation problem in series-parallel systems with budgeted uncertainty.  {\em IEEE Transactions on reliability}~{\bf 63}~(1):239--250.
  1609. %
  1610. %\bibitem{Feizollahi2015} Feizollahi, M.J., R. Soltan, H. Feyzollahi. 2015. The robust cold standby redundancy allocation in series-parallel systems with budgeted uncertainty. {\em IEEE Transactions on reliability}~{\bf 64}~(2):799--806.
  1611. %
  1612. %\bibitem{Hasegawa1999}Hasegawa, I., Uchida, S. 1999. Braking systems. {\em Japan Railway and Transport Review}~{\bf 20}:52--59.
  1613. %
  1614. %\bibitem{Kulturel-Konak2003}Kulturel-Konak, S., A. Smith., D., Coit. 2003. Efficiently Solving the Redundancy Allocation Problem Using Tabu Search. {\em IIE Transactions.}~{\bf 35}:515--526.
  1615. %
  1616. %\bibitem{Kuo2001}Kuo, W., V.R. Prasad, F.A. Tillman, C.L. Hwang. 2001. {\em Optimal Reliability Design: Fundamentals and Applications.} Cambridge university press. Cambridge.
  1617. %
  1618. %\bibitem{Kuo2007}Kuo, W., R. Wan. 2007. Recent advances in optimal reliability allocation. {\em IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans}, {\bf 37}~(2):143-156.
  1619. %
  1620. %\bibitem{Li2014}Li, Y.F., Y. Ding, E. Zio. 2014. Random fuzzy extension of the universal generating function approach for the
  1621. %reliability assessment of multi-state systems under aleatory and epistemic uncertainties. {\em IEEE Transactions on Reliability}~{\bf 63}~(1):13--25.
  1622. %
  1623. %\bibitem{Li2011}Li, C.Y., X. Chen, X.S. Yi, J.Y. Tao. 2011. Interval-valued reliability analysis of multi-state systems. {\em IEEE Transactions on Reliability}~{\bf 60}~(1):323--330.
  1624. %
  1625. %\bibitem{Li2008} Li, X.,  X. Hu. 2008. Some new stochastic comparisons for redundancy
  1626. %allocations in series and parallel systems.~{\em  Statist. Probabil. Lett.}~{\bf 78}~(18): 3388--3394.
  1627. %
  1628. %\bibitem{Liang2004} Liang, Y., A. E. Smith. 2004 An ant colony optimization algorithm for the redundancy allocation problem (RAP). {\em IEEE Transactions on Reliability}~{\bf 53}~(3):417--423.
  1629. %
  1630. %\bibitem{Liao2014}Liao, L., F. K\"{o}ttig. 2014. Review of hybrid prognostics approaches for remaining useful life prediction of engineered systems, and an application to battery life prediction. {\em IEEE Transactions on Reliability}. {\bf 63}~(1):191--207.
  1631. %
  1632. %\bibitem{Liu2015}Liu, H., Y. Fu. 2015. Clustering with Partition Level Side Information. {\em 2015 IEEE International Conference on Data Mining}, 877--882.
  1633. %
  1634. %\bibitem{Military1992}Military, U.S. 1992. Reliability prediction of electronic equipment. MIL-HDBK-217F Notice 1.
  1635. %
  1636. %\bibitem{Marseguerra2005} Marseguerra, M., E. Zio, L. Podofillini, D. W. Coit. 2005. Optimal design
  1637. %of reliable network systems in presence of uncertainty. {\em IEEE
  1638. %Trans. Rel.}~{\bf 54}~(2): 243--253.
  1639. %
  1640. %
  1641. %\bibitem{Ng2014} Ng, S. Y., Xing Y., K. L. Tsui. 2014. A naive Bayes model for robust remaining useful life prediction of lithium-ion battery. {\em Applied Energy}~{\bf 118}: 114-123.
  1642. %
  1643. %
  1644. %
  1645. %
  1646. %\bibitem{Prasad2001}Prasad, V. R., W. Kuo, K. O. Kim. 2001. Maximization of a percentile life of a series system through component redundancy allocation. {\em IIE Transactions}~{\bf 33}~(12):1071--1079.
  1647. %
  1648. %\bibitem{Pecht2008} Pecht, M. 2008. {\em Prognostics and Health Management of Electronics.} John Wiley \& Sons, Ltd.
  1649. %
  1650. %\bibitem{Shang2017} Shang, C., X. Huang, F. You. 2017. Data-driven robust optimization based on kernel learning. {\em Computers \& Chemical Engineering}~{\bf 106}:464--479.
  1651. %
  1652. %\bibitem{Shapiro2001}Shapiro~A.~2001.~On duality theory of conic linear problems. In {\em Semi-Infinite Programming}, chapter 7, 135--165, Kluwer Academic Publishers, 2001.
  1653. %
  1654. %\bibitem{Soltani2015}Soltani R., J. Safari, S.J. Sadjadi. 2015. Robust counterpart optimization for the redundancy allocation problem in series-parallel systems with component mixing under uncertainty. {\em Applied Mathematics \& Computation}~{\bf 271}~(C): 80--88.
  1655. %
  1656. %
  1657. %
  1658. %
  1659. %\bibitem{Sun2017} Sun, M. X., Y. F. Li, E. Zio. 2017. On the optimal redundancy allocation for multi-state series-parallel systems under epistemic uncertainty. {\em Reliability Engineering \& System Safety}. Accepted.
  1660. %
  1661. %\bibitem{Tang2014}Tang, S., C. Yu, X. Wang, X. Guo, X. Si. 2014. Remaining useful life prediction of lithium-ion batteries based on the wiener process with measurement error. {\em Energies}~{\bf 7}~(2):520--547.
  1662. %
  1663. %\bibitem{Tekiner-Mogulkoc2011}Tekiner-Mogulkoc, H., D. W. Coit. 2011. System reliability optimization
  1664. %considering uncertainty: Minimization of the coefficient of variation
  1665. %for series-parallel systems.~{\em  IEEE Trans. Rel.}~{\bf 60}~(30): 667--674, 2011.
  1666. %
  1667. %
  1668. %\bibitem{Wang2012} Wang, Y., L. Li, S. Huang, Q. Chang. 2012. Reliability and covariance estimation of weighted k-out-of-n multi-state Systems. {\em European Journal of Operational Research}~{\bf 221}:~138--147.
  1669. %
  1670. %\bibitem{Wang2019} Wang, S., Y. Li, T. Jia.  2019. Distributionally Robust Design for Redundancy Allocation. {\em INFORMS Journal on Computing}, in press.
  1671. %
  1672. %\bibitem{Wisemann2014} Wiesemann, W.,  D. Kuhn, M. Sim. 2014. Distributionally robust convex optimization. {\em Operations Research}~ {\bf 62} ~(6)~ 1358--1376.
  1673. %
  1674. %\bibitem{Xie2017} Xie, W., Ahmed, S. 2017. Distributionally robust chance constrained optimal power flow with renewables: A conic reformulation. {\em IEEE Transactions on Power Systems.} Accepted.
  1675. %
  1676. %\bibitem{Xing2002} Xing, E.P., A.Y. Ng, M.I. Jordan, S. Russell. 2002. Distance metric learning, with application to clustering with side-information. {\em Proceedings of the 15th International Conference on Neural Information Processing Systems}, ~521--528.
  1677. %
  1678. %
  1679. %\bibitem{Yalaoui2005}Yalaoui, A., E.  Chatelet, C. Chu. 2005. A new dynamic programming method for reliability redundancy allocation in a parallel-series system. {\em IEEE transactions on reliability}.~{\bf 54}~(2):254--261.
  1680. %
  1681. %
  1682. %\bibitem{Zaretalab2015}Zaretalab, A., V. Hajipour, M. Sharifi, M. R. Shahriari. 2015. A knowledge-based archive multi-objective simulated annealing algorithm to optimize series-parallel system with choice of redundancy strategies. {\em Computers \& Industrial Engineering}~{\bf 80}:33-44.
  1683. %
  1684. %\bibitem{Zhao2003} Zhao, R., B. Liu. 2003. Stochastic programming models for general redundancy-optimization problems.~{\em IEEE Trans. Rel.}~{\bf 52}~(2): 181--191, 2003.
  1685. %
  1686. %\bibitem{Zhao2011} Zhao, P., P.S. Chan, H.K.T. Ng. 2011. Optimal allocation of redundancies in series systems. {\em European Journal of Operational Research}~{\bf 220}~(3):673--683.
  1687. %
  1688. %\bibitem{Kuhn2013}Zymler, S., D. Kuhn, B. Rustem. 2013. Distributionally robust joint chance constraints with second-order moment information, {\em Mathematical Programming}~{\bf 137}~(1-2):167--198.
  1689. %\end{thebibliography}
  1690. %\newpage
  1691. %\ECSwitch
  1692. %\ECHead{Electronic Companion}\small
  1693. %\iffalse
  1694. %\section{Implementation Of K-means}
  1695. %The detail of K-means algorithm:
  1696. %
  1697. %1. Initialization: Randomly pick $K$ data points in $\mathcal{Z}$ to be the initial means $\hmu_1, \hmu_2, ... \hmu_K$.
  1698. %
  1699. %2. Assignment step: Assign each data point to one cluster, according to the rule
  1700. %\begin{equation}\label{k-means-assignment}
  1701. %\bm{z} \in \mathcal{Z}_{\argmin\limits_{k \in [K]}\|\bm{z} - \hmu_k\|^2}, \forall \bm{z} \in \mathcal{Z}
  1702. %\end{equation} and some tie-breaking rule.
  1703. %
  1704. %3. Update step: Compute the mean of each cluster:
  1705. %\begin{equation}\label{k-means-update}
  1706. %\hmu_k = \frac{1}{|\mathcal{Z}_k|}\sum\limits_{\bm{z} \in \mathcal{Z}_k}\bm{z}, \forall k \in [K]
  1707. %\end{equation}
  1708. %
  1709. %4. Check for convergence: if the new means computed in step 3 is the same as the before, then the algorithm converges and stops. Otherwise, go back to step 2.
  1710. %\fi
  1711. %
  1712. %\section{Discussion Of The Parallel Case}
  1713. %In this section, we discuss the case in which the cold-standby and active-parallel parts of a subsystem is activated together. First, consider the following worst-case probabilistic chance function in the problem given system design $\x$:
  1714. %\begin{equation}\label{Prob-2}
  1715. %\displaystyle \inf\limits_{\P \in \mathbb{F}_K}\P\left[\min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} \bigvee \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt}\right)> \mathcal{T}_R \right].
  1716. % \end{equation}
  1717. %For a better exposition of our approach, we denote by
  1718. %\begin{equation}\label{constraint-set-2}
  1719. %\mathcal{Z}_i(\x):=\left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} \tilde{z}_{ijt}x_{ijt} \bigvee \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{t \in \mathbf{T}_{ij}}\tilde{z}_{ijt}x_{ijt} > \mathcal{T}_R \right.\right \}, \forall i \in \mathbf{N},
  1720. %\end{equation}
  1721. %while the complementary set of $\mathcal{Z}_i$ for each $i \in \mathbf{N}$, denoted by $\overline{\mathcal{Z}}_i$,  is
  1722. %\begin{equation}\label{set-LT-2}
  1723. %\overline{\mathcal{Z}}_i(\x)=\overline{\mathcal{Z}}^{\rm c}_i(\x) \cap \overline{\mathcal{Z}}^{\rm a}_i(\x),
  1724. %\end{equation}
  1725. %where
  1726. %$$
  1727. %\overline{\mathcal{Z}}^{\rm c}_i(\x):= \left\{\bm{z} \in \mathbb{R}^H_+ ~\left|~ \sum_{j\in \mathbf{M}^{\rm c}_i}\sum_{t\in \mathbf{T}_{ij}} z_{ijt} x_{ijt}\le \mathcal{T}_R \right.\right\}
  1728. %$$
  1729. %and
  1730. %$$
  1731. %\overline{\mathcal{Z}}^{\rm a}_i(\x):=\bigcap\limits_{j\in \mathbf{M}^{\rm a}_i}\bigcap\limits_{t\in \mathbf{T}_{ij}} \Big\{\bm{z} \in \mathbb{R}^H_+ ~\left|~z_{ijt} x_{ijt}  \le \mathcal{T}_R \right.\Big\}.
  1732. %$$
  1733. %
  1734. %The following result establishes an equivalent formulation of regular robust optimization for the above worst-case probabilistic chance  function~\eqref{Prob-2}.
  1735. %
  1736. %\begin{lemma}\label{lem1-LT-2}
  1737. %Given system design $\x$, the worst-case probabilistic chance function (\ref{Prob-2}) is equivalent to the optimal value of the following  optimization problem:
  1738. %\begin{equation}\label{P2-ambiguity-sup-dual1-2}
  1739. %\begin{array}{rcll}
  1740. %&\!\!\!\!\!\!\!\!\!\!\!\! \max\limits_{\bm{\alpha}, \bm{\beta}, \bm{\lambda}, \bm{\tau}} &  1-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij} + \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right) - \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \sum\limits_{k\in[K]}\tau_{k} \\[0.3 cm]
  1741. %&\!\!\!\!\!\!\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge{p_k}, & \!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\forall \bm{z} \in  \mathcal{Z}_k\cap \overline{\mathcal{Z}}_i(\x), i \in \mathbf{N}, k \in [K]\\
  1742. %&& \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ij{t}}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge 0, & \!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\forall \bm{z} \in  \mathcal{Z}_k, k\in [K]\\
  1743. %&& \halpha \le \mathbf{0}, \hbeta, \hlambda \ge \mathbf{0}, \bm{\tau} \in \mathbb{R}^K,
  1744. %\end{array}
  1745. %\end{equation}
  1746. %where
  1747. %\begin{equation}\label{equ:W-2}
  1748. %\mathcal{Z}_k = \big\{\bm{z} \in \mathcal{Z}\:\big|\: 2(\hmu_i - \hmu_k)^{\top}\bm{z} \leq \hmu_{i}^{\top}\hmu_{i} - \hmu_{k}^{\top}\hmu_{k}, \forall i \in [K] \big\}, \forall k \in [K].
  1749. %\end{equation}
  1750. %\end{lemma}
  1751. %\begin{proof}
  1752. %With the notation in (\ref{constraint-set-2}), the worst-case probabilistic chance function (\ref{Prob-2})  can be rewritten in terms of the probability of its complementary event:
  1753. %\begin{equation}\label{1minus-2}
  1754. %\inf\limits_{\P\in \mathbb{F}_{K}} \P\Big[\tilde{\bm{z} }\in \mathcal{Z}_i(\x),\forall i \in \mathbf{N} \Big]=1-\sup\limits_{\P \in \mathbb{F}_K } \P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big].
  1755. %\end{equation}
  1756. %
  1757. %Given the probability distribution of  $\tilde{k}$ as
  1758. %$$
  1759. %\P\Big[\tilde{k}=k\Big]=p_k, \forall k \in [K].
  1760. %$$
  1761. %We now define $\P_k$ as the conditional distribution of $\bm{\tilde{z}}$ given $\tilde{k}=k$ for $k \in [K]$, we then can decompose any distribution $\P \in \mathbb{F}_K$ using $\{\P_k, k\in [K]\}$ and rewrite the worst-case chance
  1762. %$$
  1763. %\sup\limits_{\P \in \mathbb{F}_K } \P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]
  1764. %$$
  1765. %using total probability law as following formulation:
  1766. %\begin{eqnarray}\label{P2-ambiguity-sup-2}
  1767. %&&\sup\limits_{\P \in \mathbb{F}_K } \P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]\\[0.35 cm]
  1768. %&=&\sup\limits_{\P_k, \forall k\in[K]}\sum\limits_{k\in[K]}p_k\P_k\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]\\[0.25 cm]
  1769. %&=&\sup\limits_{\P_k}\sum\limits_{k\in[K]}\displaystyle \int_{\cup_{i\in \mathbf{N} }\left\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\right\}}p_k {\rm d}\P_k(\bm{\tilde{z}}) \\[0.25 cm]
  1770. %&{\rm s.t.} &\displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} \tilde{z}_{ijt} {\rm d}\P_k(\bm{\tilde{z}}) \ge \underline{\mu}^{k}_{ij},    \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k\in [K]\\ [0.35 cm]
  1771. %&& \displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} \tilde{z}_{ijt} {\rm d}\P_k(\bm{\tilde{z}}) \le \overline{\mu}^{k}_{ij},    \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij},k\in [K]\\ [0.35 cm]
  1772. %&&  \displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} \sum\limits_{t\in \mathbf{T}_{ij}}\left|\frac{\tilde{z}_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| {\rm d}\P_k(\bm{\tilde{z}}) \le \epsilon^{k}_{ij},   \forall i \in \mathbf{N}, j\in \mathbf{M}_i,k\in [K]\\[0.35 cm]
  1773. %&& \displaystyle \int_{\bm{\tilde{z}} \in \mathcal{Z}_k} {\rm d}\P_k(\bm{\tilde{z}}) =1, \forall k\in[K],
  1774. %\end{eqnarray}
  1775. %where the support $\mathcal{Z}_k$ is given by (\ref{equ:W-2}). The Lagrange dual of above moment problem (\ref{P2-ambiguity-sup-2}) has the following formulation (Wiesemann~et al.~2014):
  1776. %\begin{equation}\label{P1-ambiguity-sup-dual0-2}
  1777. %\begin{array}{rcl}
  1778. %&\!\!\!\!\!\!\!\!\!\!\!\! \min\limits_{\bm{\alpha}, \bm{\beta}, \bm{\lambda}, \bm{\tau}} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij}+ \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} + \sum\limits_{k\in[K]}\tau_{k} \\[0.3 cm]
  1779. %&\!\!\!\!\!\!\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k\\
  1780. %&& \ge p_k\mathbb{I}\Big({\cup_{i\in \mathbf{N} }\left\{ \bm{z} \in \overline{\mathcal{Z}}_i\right\}} \Big), \forall \bm{z} \in \mathcal{Z}_k, k \in [K]\\
  1781. %&& \halpha \le \mathbf{0}, \hbeta, \hlambda \ge \mathbf{0}, \bm{\tau} \in \mathbb{R}^K,
  1782. %\end{array}
  1783. %\end{equation}
  1784. %where $\mathbb{I}(\{\cdot\})$ is the indicator function with respect to set $\{\cdot\}$, and  $(\halpha, \hbeta, \hlambda, \htau)$ are the dual variables associated with the constraints of the primal problem (\ref{P2-ambiguity-sup-2}).
  1785. %
  1786. %
  1787. %
  1788. %Furthermore, we show the strong duality holds. Since ${\mu^{k}_{ij}}$ is the expectation of $\bm{\tilde{z}}_{ijt}$, we can always find a Dirac probability distribution $\P^{\dag}_{\bm{\mu}}$ with $\underline{\hmu}<\hmu<\overline{\hmu}$ which is a relative interior point of the feasible set of problem (\ref{P2-ambiguity-sup-2}). Therefore, the Slater condition holds, and then the optimal value of (\ref{P1-ambiguity-sup-dual0-2}) is equivalent to that of problem (\ref{P2-ambiguity-sup-2}).
  1789. %
  1790. %
  1791. %
  1792. %
  1793. %
  1794. %Next, expanding the indicator function $\mathbb{I}\left({\cup_{i\in \mathbf{N} }\left\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\right\}} \right)$ for different cases of $\bm{z}$, the above problem (\ref{P1-ambiguity-sup-dual0-2}) is also equivalent to the following formulation:
  1795. %\begin{equation}\label{P1-ambiguity-sup-dual00-2}
  1796. %\begin{array}{rcll}
  1797. %&\!\!\!\!\!\!\!\!\!\!\!\! \min\limits_{\bm{\alpha}, \bm{\beta}, \bm{\lambda}, \bm{\tau}} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij}+ \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} + \sum\limits_{k\in[K]}\tau_{k} \\[0.3 cm]
  1798. %&\!\!\!\!\!\!\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_i, i \in \mathbf{N}, k \in [K]\\
  1799. %&& \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge0, \forall \bm{z} \in \mathcal{Z}_k, k \in [K]\\\nonumber
  1800. %&& \halpha \le \mathbf{0}, \hbeta, \hlambda \ge \mathbf{0}, \bm{\tau} \in \mathbb{R}^K.
  1801. %\end{array}
  1802. %\end{equation}
  1803. %Finally, plugging this formulation into the equation (\ref{1minus-2}), we arrive at the formulation of (\ref{P2-ambiguity-sup-dual1-2}) whose optimal objective value is exactly the worst-case value of probabilistic chance function (\ref{Prob-2}) for all the $\mathbb{P}_k \in \mathbb{F}_K$, given system design $\x$.  The proof is completed.
  1804. %\end{proof}
  1805. %
  1806. %It is noted that the derived optimization problem (\ref{P2-ambiguity-sup-dual1-2}) in current version still belongs to semi-infinitely dimensional optimization problems which are not directly computable.  In the following, we show that by duality argument the problem can be further transformed into a computationally tractable formulation of linear program.
  1807. %
  1808. %\begin{proposition}\label{P-proposition1b-2}
  1809. %Given a system design $\x$, the worst-case probabilistic chance function (\ref{Prob-2}) solves the following linear program (LP):
  1810. %\begin{eqnarray}
  1811. %&\!\!\!\!\!\! \max &  1-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij} + \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right) - \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \sum\limits_{k\in[K]}\tau_{k} \label{HP1-ambiguity-LP-FL-2} \\
  1812. % &\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\phi^{lk}_{ijt}\underline{z}_{ij}+\varphi^{lk}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} \right)}  \nonumber\\
  1813. %  &&+  \sum\limits_{n \in [K]}\psi^{lk}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2)\Big] + \sum\limits_{j\in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{lj}} q^{lk}_{jt}\mathcal{T}_R+ s^{lk}\mathcal{T}_R + \tau_k \geq p_k, \forall l \in \mathbf{N}, k \in [K]  \label{HP1-ambiguity-LP-FL1-3}\\
  1814. %  && \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\rho^{k}_{ijt}\underline{z}_{ij}+\varrho^{k}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\gamma^{k}_{ijt}-\theta^{k}_{ijt} \right)} \nonumber\\ && + \sum\limits_{n \in [K]}\varsigma^{k}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2)\Big] + \tau_k \geq0, \forall k \in [K]\label{HP1-ambiguity-LP-FL1-2-2}\\
  1815. % &&  q^{lk}_{jt}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+{ \pi^{lk}_{ljt}-\varpi^{lk}_{ljt} } = \alpha^{k}_{ljt}+\beta^{k}_{ljt},\nonumber\\ &&  \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j}, k \in [K] \\
  1816. %&&  s^{lk}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt}  = \alpha^{k}_{ljt}+\beta^{k}_{ljt},\nonumber\\ &&  \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j}, k \in [K] \\
  1817. %&&   \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\psi^{lk}_{n}+  \phi^{lk}_{ijt}+\varphi^{lk}_{ijt}+\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt},\nonumber\\ && ~ \forall l \in \mathbf{N}, i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K], \label{HP2-ambiguity-LP-FL2-2} \\
  1818. %&&{|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\pi^{lk}_{ijt}+\varpi^{lk}_{ijt}) =\lambda^{k}_{ij},  ~ \forall l  \in \mathbf{N}, i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K] \\
  1819. %&& \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\varsigma^{k}_{n}+ \rho^{k}_{ijt}+\varrho^{k}_{ijt}+\gamma^{k}_{ijt}-\theta^{k}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt},\nonumber\\ && ~ \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K]  \\
  1820. %&&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\gamma^{k}_{ijt}+\theta^{k}_{ijt}) =\lambda^{k}_{ij},  ~ \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}, k \in [K] \\
  1821. %%&&  q_{l jk}\le y_{{l jk}}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  1822. %%&& y_{{l jk}} \ge M x_{l jk}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  1823. %%&& y_{{l jk}} \le  q_{l jk }+(x_{l jk}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathcal{J}({l}), k \in \mathcal{N}(l,j)\\[0.3 cm]
  1824. %%&&  \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{ij}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i \\
  1825. %&& \halpha,\q, \s, \hpsi, \hvarphi, \hvarsigma, \hvarrho \le \mathbf{0}, \htau \in \mathbb{R}^K,  \\
  1826. %&& \hbeta, \hlambda, \hphi, \hrho, \hpi,\hvarpi, \hgamma, \htheta \ge \mathbf{0}, \label{HP2-ambiguity-LP-FL-2}
  1827. %\end{eqnarray}
  1828. %where $\halpha, \hbeta, \hlambda, \htau, \q, \s, \hpsi, \hphi, \hvarphi, \hpi, \hvarpi, \hrho, \hvarrho, \hvarsigma, \hgamma, \htheta$ are auxiliary variables.
  1829. %\end{proposition}
  1830. %
  1831. %{\bf Proof.}
  1832. %First of all, for a given $l  \in \mathbf{N}$ we deal with the infinitely dimensional constraints
  1833. %$$
  1834. %\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l, k \in [K]\\
  1835. %$$
  1836. %Recall that
  1837. %$$
  1838. %\overline{\mathcal{Z}}_l(\x)=\overline{\mathcal{Z}}^{\rm c}_l(\x) \cap \overline{\mathcal{Z}}^{\rm a}_l(\x)= \left\{\bm{z} \in \mathbb{R}^{H}_+ ~\left|~\begin{array}{l}
  1839. %\displaystyle \sum_{j\in \mathbf{M}^{\rm c}_{l}}\sum_{t\in \mathbf{T}_{lj}} z_{ljt} x_{ljt}\le \mathcal{T}_R \\
  1840. % z_{ljt} x_{ljt}  \le \mathcal{T}_R,  j\in \mathbf{M}^a_{l}, t \in \mathbf{T}_{lj}
  1841. %                                                                                                                                             \end{array}
  1842. %\right.\right\},
  1843. %$$
  1844. %and
  1845. %$$
  1846. %\mathcal{Z}_k = \big\{\bm{z} \in \mathcal{Z}\:\big|\: 2(\hmu_i - \hmu_k)^{\top}\bm{z} \leq \hmu_{i}^{\top}\hmu_{i} - \hmu_{k}^{\top}\hmu_{k}, \forall i \in [K] \big\}, \forall k \in [K],
  1847. %$$
  1848. %where
  1849. %$$
  1850. %\mathcal{Z} := \big\{\bm{z} \in \mathbb{R}^{H}| z_{ijt} \in [\underline{z}_{ij},\overline{z}_{ij}], \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij} \big\}.
  1851. %$$
  1852. %
  1853. %First of all, we claim that for any $k \in [K]$
  1854. %\begin{equation}\label{Lifting-3}
  1855. %\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l
  1856. %\end{equation}
  1857. %is equivalent to
  1858. %\begin{equation}\label{Lifting-4}
  1859. %\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + u^{k}_{ijt}\lambda^{k}_{ij}\right] + \tau_k \ge p_k, \forall (\bm{z},\u) \in \mathcal{W}_k,
  1860. %\end{equation}
  1861. %where
  1862. %$$
  1863. %\mathcal{W}_k := \left\{(\bm{z}, \u) \middle| \: \bm{z} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l, \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| \leq u^{k}_{ijt}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij} \right\}, \forall k \in [K].
  1864. %$$
  1865. %
  1866. %Assume \eqref{Lifting-3} holds, since $\hlambda \geq \mathbf{0}$,
  1867. %$$
  1868. %u^{k}_{ijt}\lambda^{k}_{ij} \geq \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}, (\bm{z}, \u) \in \mathcal{W}_k.
  1869. %$$
  1870. %Therefore \eqref{Lifting-4} holds.
  1871. %
  1872. %Assume \eqref{Lifting-4} holds, then for any $\hat{\bm{z}} \in \mathcal{Z}_k\cap\overline{\mathcal{Z}}_l$,
  1873. %$$
  1874. %\min\limits_{(\hat{\bm{z}}, \u) \in \mathcal{W}_k}u^{k}_{ijt} = \left|\frac{ \hat{z}_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t \in \mathbf{T}_{ij}.
  1875. %$$
  1876. %Since $\hlambda \geq \mathbf{0}$,
  1877. %$$
  1878. %\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[\hat{z}_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ \hat{z}_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k
  1879. %$$
  1880. %$$
  1881. %=\min\limits_{(\hat{\bm{z}}, \u) \in \mathcal{W}_k}\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[\hat{z}_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + u^{k}_{ijt}\lambda^{k}_{ij}\right] + \tau_k \geq p_k.
  1882. %$$
  1883. %Therefore \eqref{Lifting-3} holds.
  1884. %
  1885. %Therefore \eqref{Lifting-3} and \eqref{Lifting-4} are equivalent.
  1886. %
  1887. %Then, by introducing auxiliary variables $u^{k}_{ijt}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}$, we can equivalently lift the above constraints into the following optimization-based formulation:
  1888. %\begin{equation}\label{H-system2}
  1889. %\left.\begin{array}{rcll}
  1890. %  p_k-\tau_k\le & \min\limits_{\bm{z}, \u} & \displaystyle \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + u^{k}_{ijt} \lambda^{k}_{ij}\right]  \\[0.3 cm]
  1891. %&{\rm s.t.} &    z_{l jt}x_{l jt}\le \mathcal{T}_R, &\forall j\in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}  \\[0.3 cm]
  1892. %&& \displaystyle \sum_{j\in \mathbf{M}^{\rm c}_{l}}\sum_{t\in \mathbf{T}_{lj}} z_{ljt} x_{ljt}\le \mathcal{T}_R, &   \\[0.3 cm]
  1893. %&&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}u^{k}_{ijt}- { z_{ijt}} \ge { -\nu^{k}_{ij}}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\[0.3 cm]
  1894. %&&  {|\mathbf{T}_{ij} |\sigma^{k}_{ij}}u^{k}_{ijt}+  { z_{ijt}} \ge  {\nu^{k}_{ij} }, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\[0.3 cm]
  1895. %&&  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} 2(\mu^{n}_{ijt} - \mu^{k}_{ijt})z_{ijt}\\[0.3 cm]
  1896. %&& \leq \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} {(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2, & \forall n \in [K]\\[0.3 cm]
  1897. %&& \underline{z}_{ij} \leq z_{ijt} \leq \overline{z}_{ij} & \forall i \in \mathbf{N}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}\\[0.3cm]
  1898. %&& u^{k}_{ijt} \in \mathbb{R}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}.
  1899. %\end{array}\right\}, \forall k \in [K]
  1900. %\end{equation}
  1901. %
  1902. %By the strong duality of linear programming, the above constraint is also equivalent to the following system: for all $k$ in $[K]$,
  1903. %\begin{equation*}
  1904. %\left\{  \begin{array}{rl}
  1905. %& p_k-\tau_k\le \sum\limits_{j\in \mathbf{M}^{\rm a}_{l}}\sum\limits_{t\in \mathbf{T}_{lj}} q^{lk}_{jt}\mathcal{T}_R+ s^{lk}\mathcal{T}_R\\[0.3 cm]
  1906. %& +\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \Big[\phi^{lk}_{ijt}\underline{z}_{ij}+\varphi^{lk}_{ijt}\overline{z}_{ij}+{\nu^{k}_{ij}\left(\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} \right)}  + \sum\limits_{n \in [K]}\psi^{lk}_{n}({(\mu^{n}_{ijt})}^2 - {(\mu^{k}_{ijt})}^2)\Big]  \\[0.3 cm]
  1907. %&  q^{lk}_{jt}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+{ \pi^{lk}_{ljt}-\varpi^{lk}_{ljt} } = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j} \\[0.3 cm]
  1908. %&  s^{lk}x_{l jt}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt}  = \alpha^{k}_{ljt}+\beta^{k}_{ljt}, \forall j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j} \\[0.3 cm]
  1909. %&   \sum\limits_{n \in [K]}2(\mu^{n}_{ijt} - \mu^{k}_{ijt})\psi^{lk}_{n}+  \phi^{lk}_{ijt}+\varphi^{lk}_{ijt}+\pi^{lk}_{ijt}-\varpi^{lk}_{ijt} = \alpha^{k}_{ijt}+\beta^{k}_{ijt}, ~ \forall i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, t\in \mathbf{T}_{ij}  \\[0.3 cm]
  1910. %&{|\mathbf{T}_{ij} |\sigma^{k}_{ij}}(\pi^{lk}_{ijt}+\varpi^{lk}_{ijt}) =\lambda^{k}_{ij},  ~ \forall i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij} \\[0.3 cm]
  1911. %&   q^{lk}_{jt}\le 0, s^{lk}\le 0, \psi^{lk}_{n} \le 0, \phi^{lk}_{ijt} \geq 0, \varphi^{lk}_{ijt} \leq 0, \pi^{lk}_{ijt}\ge 0,\varpi^{lk}_{ijt}\ge 0, ~\forall n \in [K], i \in \mathbf{N}, j\in \mathbf{M}_i, t\in \mathbf{T}_{ij}.
  1912. %\end{array}
  1913. %\right\}
  1914. %\end{equation*}
  1915. %
  1916. %
  1917. %
  1918. %
  1919. %Likewise, the constraints
  1920. %$$
  1921. %\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}  \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right|\lambda^{k}_{ij}\right] + \tau_k \ge0, \forall \bm{z} \in \mathcal{Z}_k, k \in [K]\\
  1922. %$$
  1923. %can also be dualized similarly.  Leveraging the derived formulation (\ref{P2-ambiguity-sup-dual1-2}) in Lemma~\ref{lem1-LT-2}, we can arrive at the formulation of the linear program (\ref{HP1-ambiguity-LP-FL-2})--(\ref{HP2-ambiguity-LP-FL-2}). We are done.
  1924. %\blot
  1925. %
  1926. %
  1927. %Although the derived formulation (\ref{HP1-ambiguity-LP-FL-2})-(\ref{HP2-ambiguity-LP-FL-2}) can be considered as a linear program given a system design $\mathbf{x}$, in the original problem $\mathbf{x}$ is the decision variable, so there are bilinear terms $q^{lk}_{jt}x_{l jt}$ and $s^{lk}x_{l jt}$ in the overall formulation after the derived formulation is plugged back into the original problem. We can linearize these terms.
  1928. %
  1929. %
  1930. %\begin{proposition}\label{proposition1b-2-2}
  1931. %The overall problem  is equivalent to the following mixed integer linear program:
  1932. %\begin{eqnarray}
  1933. % & \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \left[\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\right]c_{ij} \label{HP1-ambiguity-MILP-FL1-4-2}\\
  1934. % &{\rm s.t.} & 1-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}}\sum\limits_{k\in[K]} \left(\alpha^{k}_{ijt}\underline{\mu}^{k}_{ij} + \beta^{k}_{ijt}\overline{\mu}^{k}_{ij}\right)\nonumber\\
  1935. % && - \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in[K]}\epsilon^{k}_{ij}\lambda^{k}_{ij} - \sum\limits_{k\in[K]}\tau_{k}\ge R_{0}   \\
  1936. % &&  L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{t\in \mathbf{T}_{ij}} x_{ijt}\le  U_{i}, ~  \forall  i \in \mathbf{N} \\
  1937. %%&& \sum\limits_{j\in \mathbf{M}_{l}} \sum\limits_{k\in \mathbf{K}_{l j}}q_{l jk}\mathcal{T}_R\nonumber\\
  1938. %% &&+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[ \phi^{l }^{k}_{ij}\underline{z}_{ij}+ \varphi^{\varsigma}^{k}_{ij}\overline{z}_{ij}  + {\nu_{ij}\left(\pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} \right)} \right]+\tau \ge 1,~\forall {l  \in \mathbf{N}}  \\
  1939. %&& y^{\rm a}_{l jk}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+{ \pi^{lk}_{ljt}-\varpi^{lk}_{ljt} } = \alpha^{k}_{ljt}+\beta^{k}_{ljt},\nonumber\\ &&  \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm a}_l, t \in \mathbf{T}_{l j}, k \in [K]  \label{con:32}  \\
  1940. %&& y^{\rm c}_{l jk}  +\sum\limits_{n \in [K]}2(\mu^{n}_{ljt} - \mu^{k}_{ljt})\psi^{lk}_{n}+ \phi^{lk}_{ljt}+\varphi^{lk}_{ljt}+\pi^{lk}_{ljt}-\varpi^{lk}_{ljt}  = \alpha^{k}_{ljt}+\beta^{k}_{ljt},\nonumber\\ &&  \forall l \in \mathbf{N}, j\in \mathbf{M}^{\rm c}_l, t \in \mathbf{T}_{l j}, k \in [K]    \\
  1941. % && (\ref{HP1-ambiguity-LP-FL1-3}-\ref{HP1-ambiguity-LP-FL1-2-2}); (\ref{HP2-ambiguity-LP-FL2-2})-(\ref{HP2-ambiguity-LP-FL-2})\\
  1942. %%&&\phi^{l }^{k}_{ij}+\varphi^{l }^{k}_{ij}+ { \pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} }= \alpha^{k}_{ij}+\beta^{k}_{ij}, ~ \forall {l  \in \mathbf{N}}, i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}  \\
  1943. %%&&{|\mathbf{K}_{ij} |\sigma_{ij}} (\pi^{l }^{k}_{ij}+\varpi^{l }^{k}_{ij}) =\lambda_{ij},  ~ \forall {l  \in \mathbf{N}}, i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  1944. %%&& \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[\left(\varsigma^{k}_{ij}\underline{z}_{ij}+ \vartheta^{k}_{ij}\overline{z}_{ij} \right) +  {\nu_{ij}\left(\gamma^{k}_{ij}-\theta^{k}_{ij} \right)}\right]+\tau \ge 0\\
  1945. %%&& \varsigma^{k}_{ij}+ \vartheta^{k}_{ij} + { \gamma^{k}_{ij}-\theta^{k}_{ij} }= \alpha^{k}_{ij}+\beta^{k}_{ij}, ~ \forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  1946. %%&&  {|\mathbf{K}_{ij} |\sigma_{ij}}(\gamma^{k}_{ij}+\theta^{k}_{ij}) = \lambda_{{ij}}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  1947. %&&  q^{lk}_{jt}\le y^{{\rm a}lk}_{jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  1948. %&& y^{{\rm a}lk}_{jt} \ge M x_{l jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  1949. %&& y^{{\rm a}lk}_{jt} \le  q^{lk}_{jt}+(x_{l jt}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm a}_{l}, t \in \mathbf{T}_{l j}, k \in [K]\\
  1950. %&&  s^{lk}\le y^{{\rm c}lk}_{jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  1951. %&& y^{{\rm c}lk}_{jt} \ge M x_{l jt}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t\in \mathbf{T}_{l j}, k \in [K] \\
  1952. %&& y^{{\rm c}lk}_{jt} \le  s^{lk}+(x_{l jt}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}^{\rm c}_{l}, t \in \mathbf{T}_{l j}, k \in [K]\\
  1953. %%&& \alpha^{k}_{ij}\le 0, \beta^{k}_{ij}\ge 0, \lambda_{{ij}} \ge 0, \tau \in \Re, ~\forall i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  1954. %%&&q_{l jk}\le 0, y_{l jk} \le 0, ~\forall {l  \in \mathbf{N}}, j\in \mathbf{M}_{l}, k\in \mathbf{K}_{l j} \\
  1955. %%&& \phi^{l }^{k}_{ij} \ge 0,  \varphi^{l }^{k}_{ij} \le 0,  \pi^{l }^{k}_{ij}\ge 0,\varpi^{l }^{k}_{ij}\ge 0, ~\forall {l  \in \mathbf{N}},  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  1956. %%&& \theta^{k}_{ij}\ge 0, \gamma^{k}_{ij}\ge 0, \varsigma^{k}_{ij}\ge 0, \vartheta^{k}_{ij} \le 0, ~\forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  1957. %&& \y^{\rm a}, \y^{\rm c} \le \mathbf{0}, \x\in \{0,1\}^{H},  \label{HP1-ambiguity-MILP-FL2-4-2}
  1958. %\end{eqnarray}
  1959. %where $\halpha, \hbeta, \hlambda, \htau, \q, \s, \hpsi, \hphi, \hvarphi, \hpi, \hvarpi, \hrho, \hvarrho, \hvarsigma, \hgamma, \htheta, \y^{\rm a}$ and $\y^{\rm c}$ are auxiliary variables and $M$ is a sufficiently small negative number.
  1960. %\end{proposition}
  1961. %{\bf Proof. }
  1962. %In the proof of Proposition~\ref{P-proposition1b-2}, the feasible set $\mathcal{Z}_k\cap \overline{\mathcal{Z}}_{l }(\x)$ of the minimization problem
  1963. %$$
  1964. %\min\limits_{\bm{z}} \displaystyle \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{t\in \mathbf{T}_{ij}} \left[z_{ijt}\left(\alpha^{k}_{ijt}+ \beta^{k}_{ijt} \right) + \left|\frac{ z_{ijt}-\nu^{k}_{ij}}{|\mathbf{T}_{ij} |\sigma^{k}_{ij}} \right| \lambda^{k}_{ij}\right]
  1965. %$$
  1966. %is bounded. Assuming that it is nonempty, then its lifted equivalent form of the inner minimization problem in (\ref{H-system2}) is also bounded and nonempty. Therefore, the dual variables $q^{lk}_{jt}$ and $s^{lk}$ are also bounded. Therefore, we can linearize the bilinear terms $q^{lk}_{jt}x_{l jt}$ and $s^{lk}x_{l jt}$ by introducing new variables $y^{{\rm a}lk}_{jt}$ and $y^{{\rm c}lk}_{jt}$, such that
  1967. %\begin{equation}\label{equ:linear1}
  1968. %q^{lk}_{jt}\le y^{{\rm a}lk}_{jt}\le 0,~y^{{\rm a}lk}_{jt} \ge M x_{l jt},~y^{{\rm a}lk}_{jt} \le  q^{lk}_{jt}+(x_{l jt}-1)M,
  1969. %\end{equation}
  1970. %and
  1971. %\begin{equation}\label{equ:linear2}
  1972. %s^{lk}\le y^{{\rm c}lk}_{jt}\le 0,~y^{{\rm c}lk}_{jt} \ge M x_{l jt},~y^{{\rm c}lk}_{jt} \le  s^{lk}+(x_{l jt}-1)M,
  1973. %\end{equation}
  1974. %respectively, where $M$ is a sufficiently small negative number (in numerical computation, M can be set to a negative number with very large absolute value).  Using this linearization technique, we can arrive at the following formulation of mixed integer linear program (MILP) for the distributionally robust redundancy allocation problem. \blot\\
  1975. % Note that in the linearized MIP formulation~\eqref{HP1-ambiguity-MILP-FL1-4-2}-\eqref{HP1-ambiguity-MILP-FL2-4-2}, the integer variables are still $x_{ijt}$, the original redundancy allocation decision variables, therefore the problem remains tractable.
  1976. %
  1977. %\section{A baseline probabilistic reliability model}\label{EC4}
  1978. %For comparison, we consider the following probabilistic reliability model, which is an MIP with binaries (Feizollahi and Modarres~2012, Wang et al.~2019):
  1979. %\begin{eqnarray}
  1980. %\begin{array}{rcll}
  1981. %& \min\limits_{\x} & \displaystyle \sum\limits_{i\in \mathbf{N}} \left[L_{i}+\sum_{k=0}^{U_{i}-L_{i}}kx_{ik}\right]c_{i} \\[0.3 cm]
  1982. %& {\rm s.t.} &  \displaystyle \sum_{i \in \mathbf{N}} \sum_{k=0}^{U_{i}-L_{i}}x_{ik}\ln\left[1-r_{i}^{L_{i}+k} \right]\ge \ln R_0  & \\[0.3 cm]
  1983. %&& \displaystyle \sum_{k=0}^{U_{i}-L_{i}}x_{ik}=1, & \forall i \in \mathbf{N}\\
  1984. %&& x_{ik} \in \{0,1\},  & \forall i \in \mathbf{N}, k \in [0; U_{i}-L_{i}],
  1985. %\end{array}
  1986. %\end{eqnarray}
  1987. %which can be solved by off-the-shelf MIP solvers. Note that the above linear MIP transformation holds only for regular series-parallel redundant systems with a single type of component; when multiple component types are involved ({\it i.e.,} $|\mathbf{M}_i|>1$ for some $i \in \mathbf{N}$) or cold-standby subsystems are considered, the probabilistic model, in general, becomes intractable.
  1988. %
  1989. %\iffalse
  1990. %
  1991. %Furthermore, to enhance the scalability, we can harness both the structures of formulation (\ref{HP1-ambiguity-LP-FL})-(\ref{HP2-ambiguity-LP-FL}) and linearization \eqref{equ:linear1}-\eqref{equ:linear2} and design a Benders-decomposition-style algorithm to solve the overall problem \eqref{HP1-ambiguity-X} iteratively. We briefly describe the procedures as follows.
  1992. %
  1993. %
  1994. %To ease the exposition, we denote by
  1995. %$$
  1996. %c(\x):=\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \left[\sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\right]c_{ij},
  1997. %$$
  1998. %and
  1999. %$$
  2000. %R(\x):=\inf\limits_{\P \in \mathbb{F}}\P\left[\min\limits_{i \in \mathbf{N}}\left(\sum_{j \in \mathbf{M}^{\rm c}_i}\sum_{k\in \mathbf{K}_{ij}} \tilde{z}^{k}_{ij}x^{k}_{ij} \bigvee \max_{j \in \mathbf{M}^{\rm a}_i}  \max_{k \in \mathbf{K}_{ij}}\tilde{z}^{k}_{ij}x^{k}_{ij}\right)> \mathcal{T}_R \right].
  2001. %$$
  2002. %If we apply the linearizations \eqref{equ:linear1}-\eqref{equ:linear2} to replace the bilinear terms $x_{l jk}q_{l jk}$ and $x_{l jk}p_{l}$ in the formulation (\ref{HP1-ambiguity-LP-FL})-(\ref{HP2-ambiguity-LP-FL})  of the worst-case probabilistic chance function (\ref{Prob-1}), we can re-express the chance function (\ref{Prob-1}) given $\x$ as the following compact form:
  2003. %$$
  2004. %R(\x)=\max_{\mathbf{A}\mathbf{d}=\mathbf{B}\mathbf{x}+\mathbf{r},\mathbf{d}\ge \mathbf{0}}\mathbf{d}'\mathbf{b},
  2005. %$$
  2006. %where $\mathbf{A}\mathbf{d}=\mathbf{B}\mathbf{x}+\mathbf{r},\mathbf{d}\ge \mathbf{0}$ represents the constraints in the problem (\ref{HP1-ambiguity-LP-FL})-(\ref{HP2-ambiguity-LP-FL}) after the linearization in a standardized form of linear program with $\mathbf{d}$ being the standardized decision variables, and $\mathbf{A},\mathbf{B}$ and $\b,\mathbf{r}$ being the matrix and vector coefficients or inputs with conformable dimensions. Then, by a duality argument, we can rewrite $R(\x)$ as
  2007. %\begin{equation}\label{algo-1}
  2008. %R(\x)=\min_{\mathbf{A}'\mathbf{g}\ge \mathbf{b}}\x'(\mathbf{B}'\mathbf{g})+ \mathbf{r}'\mathbf{g},
  2009. %\end{equation}
  2010. %where $\mathbf{g}$ are the dual variables. Hence the {\em relaxed master problem} of overall problem \eqref{HP1-ambiguity-X} can be expressed as
  2011. %\begin{equation}\label{algo-master}
  2012. %\begin{array}{rcll}
  2013. %& \min\limits_{\x} &  c(\x) \\[0.3 cm]
  2014. %& {\rm s.t.} & \x'(\mathbf{B}'\mathbf{g})+ \mathbf{r}'\mathbf{g} \ge R_0,& \forall \mathbf{g} \in \mathcal{G}\\
  2015. %&& L_{i} \le \sum\limits_{j\in \mathbf{M}_i}\sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\le  U_{i}, &\forall  i \in \mathbf{N} \\
  2016. %&& \x\in \{0,1\}^K,
  2017. %\end{array}
  2018. %\end{equation}
  2019. %where $\mathcal{G}\subset \mathcal{V}$ with $\mathcal{V}$ being the set of vertices of polyhedron $\{\g: \mathbf{A}'\mathbf{g}\ge \mathbf{b}\}$.  In particular, we have the following optimality condition:
  2020. %\begin{proposition}\label{optimality}
  2021. %Let $\x$ be a solution of the relaxed master problem \eqref{algo-master}; if $R(\x) \ge R_0$, then it is an optimal solution of \eqref{HP1-ambiguity-X}.
  2022. %\end{proposition}
  2023. %\begin{proof}
  2024. %Let $c^*$ be the optimal cost of the overall problem \eqref{HP1-ambiguity-X}. Since $\x$ is a solution of the relaxed master problem \eqref{algo-master}, we have $c(\x) \le c^*$. Now if $R(\x) \ge R_0$, then $\x$ is also a feasible solution of \eqref{HP1-ambiguity-X}, and hence it is optimal.
  2025. %\end{proof}
  2026. %
  2027. %In our algorithm, for a given design $\x$, the linear program \eqref{algo-1} is used to establish the separation problem and optimality condition, where its solution $\mathbf{g}^*$ is used to generate cuts. Since $|\mathcal{V}|$ is finite, the algorithm can achieve the optimality in finite steps. The procedure of the algorithm is summarized as follows.
  2028. %
  2029. %\noindent\rule{\textwidth}{0.1em}\vspace{-5pt}\\
  2030. %\noindent {\bf A Benders Decomposition Algorithm for \eqref{HP1-ambiguity-X}.} \vspace{-10pt}\\
  2031. %\noindent\rule{\textwidth}{0.05em} \\
  2032. %{\bf Initialization:} $\mathcal{G}=\emptyset$.
  2033. %
  2034. %\begin{enumerate}
  2035. %\item  Solve the relaxed master problem \eqref{algo-master}, and obtain solution $\x$.
  2036. %\item  Compute $R(\x)$ with linear program \eqref{algo-1}:
  2037. %\begin{enumerate}
  2038. %  \item If $R(\x) \ge R_0$, then STOP and let $\x^*=\x$.
  2039. %  \item Otherwise, obtain solution $\mathbf{g}^*$ and update $\mathcal{G}=\mathcal{G} \cup \{\mathbf{g}^*\}$. Go to STEP 1.
  2040. %\end{enumerate}
  2041. %\end{enumerate}
  2042. %\noindent{\bf Output:} The optimal redundancy design $\x^*$.  
  2043. %
  2044. %\vspace{-5pt}
  2045. %\noindent\rule{\textwidth}{0.1em}\vspace{-17pt}\\
  2046. %%decision vector $\mathbf{d}$ represents the variables $\halpha, \hbeta, \hlambda, \tau, \q, \s, \y^{\rm a}, \y^{\rm c}, \hphi, \hvarphi, \hpi, \hvarpi, \htheta, \hvartheta, \hvarsigma$ and $\hvartheta$ of the,  $\r$ is the input vector with entries of $1, 0$ and $M$.
  2047. %
  2048. %
  2049. %%\begin{remark}\color{blue}
  2050. %%If we impose the independence of the lifetimes, then the problem becomes
  2051. %%\begin{eqnarray*}
  2052. %%&&\max_{\x}\P\left[\min\limits_{i \in \mathbf{N}} \left(\max\limits_{j \in \mathbf{M}_i}  \max\limits_{k \in \mathbf{K}_{ij}}   \tilde{z}^{k}_{ij} x^{k}_{ij}\right)> \mathcal{T}_R \right]\\
  2053. %%&=&\max_{\x}\prod_{i \in \mathbf{N}}\left[1-\prod_{j \in \mathbf{M}_i}\prod_{k \in \mathbf{K}_{ij}} \P\Big[\tilde{z}^{k}_{ij} x^{k}_{ij}\le \mathcal{T}_R \Big]\right]
  2054. %%\end{eqnarray*}
  2055. %%which is non-convex with respect to $\P\Big[\tilde{z}^{k}_{ij} x^{k}_{ij}\le \mathcal{T}_R \Big]$.
  2056. %%\end{remark}
  2057. %
  2058. %
  2059. %
  2060. %
  2061. %
  2062. %
  2063. %
  2064. %
  2065. %
  2066. %
  2067. %
  2068. %
  2069. %
  2070. %\section{Extreme Lifetime Distributions}\label{sec:ELD}
  2071. %In this section, we develop another model that can recover efficiently the extreme joint probability distribution of component lifetimes $\bm{\tilde{z}}$ that achieves the worst-case probability level (\ref{Prob-1}) for a given system design $\x$.
  2072. %
  2073. %\begin{proposition}\label{extreme}
  2074. %Given a system design $\x$, the extreme probability distribution of the component lifetimes  $\bm{\tilde{z}}$ can be constructed by the following discrete distribution with $|\mathbf{N}|+1$ scenarios:
  2075. %\begin{equation}\label{equ:extreme1-1H}
  2076. %\P^*\!\!\left[\tilde{z}^{k}_{ij}=\frac{b^{*k}_{ij}}{p^*_i}, \forall j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}; \tilde{z}^{k}_{lj}=\frac{d^*_{iljk}}{p^*_i},\forall l\in \mathbf{N}\backslash \{i\}, j \in \mathbf{M}_l, k\in \mathbf{K}_{lj} \right]=p^*_i, \forall i \in \mathbf{N},
  2077. %\end{equation}
  2078. %and
  2079. %\begin{equation}\label{equ:extreme1-2H}
  2080. %\P^*\!\!\left[\tilde{z}^{k}_{ij}=\frac{s^{*k}_{ij}}{p^*_{|\mathbf{N}|+1}}, \forall i \in \mathbf{N}, j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}\right]=p^*_{|\mathbf{N}|+1},
  2081. %\end{equation}
  2082. %where $\b^*, \d^*, \s^*$ and $\p^*$ can be obtained by solving the following linear program:
  2083. %\begin{equation}\label{equ:extreme2H}\left.
  2084. % \begin{array}{rcll}
  2085. %& \max\limits_{\b,\d,\s,\p}  & \displaystyle \sum\limits_{i\in \mathbf{N}}p_i\\[0.2 cm]
  2086. %&{\rm s.t.} & \displaystyle \sum\limits_{i\in \mathbf{N}}p_i + p_{|\mathbf{N}|+1}=1 & \\[0.2 cm]
  2087. %&&\displaystyle b^{k}_{ij} x^{k}_{ij} \le p_i \mathcal{T}_R, \forall i \in \mathbf{N}, j\in \mathbf{M}^{\rm a}_{i}, k\in \mathbf{K}_{i j}   \\[0.2 cm]
  2088. %&&\displaystyle \sum_{j\in \mathbf{M}^{\rm c}_{i}}\sum_{k\in \mathbf{K}_{i j}}b^{k}_{ij} x^{k}_{ij} \le p_i \mathcal{T}_R,  \forall i \in \mathbf{N} \\[0.2 cm]
  2089. %&&\displaystyle \underline{\mu}_{ij}\le b^{k}_{ij}+\sum_{l\in \mathbf{N}\backslash \{i\}}d_{lijk} + s^{k}_{ij} \le \overline{\mu}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{i j}   \\[0.2 cm]
  2090. %&&\displaystyle \underline{z}_{ij}p_i\le b^{k}_{ij}\le  \overline{{z}}_{ij}p_i, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\[0.3 cm]
  2091. %&&\displaystyle \underline{z}_{lj}p_i\le d_{iljk}\le  \overline{{z}}_{lj}p_i,  \forall i \in \mathbf{N}, l\in \mathbf{N}\backslash \{i\}, j\in \mathbf{M}_l, k\in \mathbf{K}_{lj}\\[0.2 cm]
  2092. %&&\displaystyle  \underline{{z}}_{{ij}}p_{|\mathbf{N}|+1} \le s^{k}_{ij}\le \overline{{z}}_{{ij}}p_{|\mathbf{N}|+1},  \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}  \\[0.2 cm]
  2093. %%&&\displaystyle  \sum_{k \in \mathbf{K}_{ij}}\frac{|b^{k}_{ij}-\nu_{ij}p_i|}{|\mathbf{K}_{ij}||\sigma_{ij}|}+\\
  2094. %%&&\displaystyle\sum_{l\in \mathbf{N}\backslash \{i\}}\sum_{k \in \mathbf{K}_{ij}}\frac{|d_{lijk}-\nu_{ij}p_l|}{|\mathbf{K}_{ij}||\sigma_{ij}|}+&\\
  2095. %%&&\displaystyle\sum_{k \in \mathbf{K}_{ij}}\frac{|s^{k}_{ij}-\nu_{ij}p_{|\mathbf{N}|+1}|}{|\mathbf{K}_{ij}||\sigma_{ij}|}\le \epsilon_{ij}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i\\
  2096. %&&\displaystyle  \sum_{k \in \mathbf{K}_{ij}} \xi^{k}_{ij}+\sum_{l\in \mathbf{N}\backslash \{i\}}\sum_{k \in \mathbf{K}_{ij}} \eta_{lijk}+ \sum_{k \in \mathbf{K}_{ij}} \zeta^{k}_{ij}\le \epsilon_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}\\[0.2 cm]
  2097. %&& \displaystyle b^{k}_{ij}-\nu_{ij}p_i  -\xi^{k}_{ij}|\mathbf{K}_{ij}||\sigma_{ij}|\le 0,  \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{ij} \\[0.2 cm]
  2098. %&& \displaystyle \nu_{ij}p_i- b^{k}_{ij} -\xi^{k}_{ij}|\mathbf{K}_{ij}||\sigma_{ij}|\le 0,  \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{ij} \\[0.2 cm]
  2099. %&& \displaystyle  d_{lijk}-\nu_{ij}p_l -\eta_{lijk}|\mathbf{K}_{ij}||\sigma_{ij}|\le 0, \forall i \in \mathbf{N}, l\in \mathbf{N}\backslash \{i\},  j\in \mathbf{M}_{l}, k\in \mathbf{K}_{lj}\\[0.2 cm]
  2100. %&& \displaystyle \nu_{ij}p_l -d_{lijk} -\eta_{lijk}|\mathbf{K}_{ij}||\sigma_{ij}|\le 0, \forall i \in \mathbf{N}, l\in \mathbf{N}\backslash \{i\}, j\in \mathbf{M}_{l}, k\in \mathbf{K}_{lj}\\[0.2 cm]
  2101. %&& \displaystyle  s^{k}_{ij}-\nu_{ij}p_{|\mathbf{N}|+1}- \zeta^{k}_{ij}|\mathbf{K}_{ij}||\sigma_{ij}|\le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{ij}  \\[0.2 cm]
  2102. %&& \displaystyle  \nu_{ij}p_{|\mathbf{N}|+1}-s^{k}_{ij}-\zeta^{k}_{ij}|\mathbf{K}_{ij}||\sigma_{ij}| \le 0, \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{ij} \\[0.2 cm]
  2103. %&& \p\in \mathbb{R}^{|\mathbf{N}|+1}_+,  \b \in \mathbb{R}^{K}, \s \in \mathbb{R}^{K}, \d \in  \mathbb{R}^{K(|\mathbf{N}|+1)}, \hxi \in \mathbb{R}_+^{K}, \heta \in \mathbb{R}_+^{K(|\mathbf{N}|+1)}, \hzeta \in \mathbb{R}_+^{K},
  2104. %%&& \p\in \mathbb{R}^{|\mathbf{N}|+1}_+,  \b \in \mathbb{R}^{K}, \s \in \mathbb{R}^{K}, \d \in  \mathbb{R}^{K(|\mathbf{N}|+1)}.
  2105. %\end{array}
  2106. %\right\}
  2107. %\end{equation}
  2108. %where $\hxi,\heta$ and $\hzeta$ are auxiliary variables.
  2109. %\end{proposition}
  2110. %\begin{proof}
  2111. %For the given system design $\x$, recall that
  2112. %\begin{eqnarray*}
  2113. %\inf\limits_{\P \in \mathbb{F}}\P\Big[\tilde{\bm{z} }\in \mathcal{Z}_i(\x),\forall i \in \mathbf{N} \Big]=\P^*\!\Big[\tilde{\bm{z} }\in \mathcal{Z}_i(\x),\forall i \in \mathbf{N} \Big]
  2114. %&=&1-\P^*\!\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]\\
  2115. %&=&1-\sup\limits_{\P \in \mathbb{F}}\P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big].
  2116. %\end{eqnarray*}
  2117. %It suffices to prove that the probability distribution $\P^*$ obtained in (\ref{equ:extreme1-1H})-(\ref{equ:extreme1-2H}) is the extreme distribution for
  2118. %\begin{equation}\label{equ:sup-1H}
  2119. %\sup\limits_{\P \in \mathbb{F}}\P\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big].
  2120. %\end{equation}
  2121. %In fact, by Proposition~\ref{P-proposition1b}, the above supremum (\ref{equ:sup-1H}) is equivalent to the optimal objective of
  2122. %$$
  2123. %\min\left\{\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left(\alpha^{k}_{ij}\underline{\mu}_{{ij}}+ \beta^{k}_{ij}\overline{\mu}_{{ij}}\right)+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\epsilon_{ij}\lambda_{ij} + \tau: (\ref{HP1-ambiguity-LP-FL1})-(\ref{HP2-ambiguity-LP-FL}) \right\},
  2124. %$$
  2125. %which, by a duality argument, is also equivalent to the optimal objective of (\ref{equ:extreme2H}). Now we are left to show that the probability distribution (\ref{equ:extreme1-1H})-(\ref{equ:extreme1-2H}) formed by the solution $(\b^*, \d^*, \s^*,\p^*)$ of (\ref{equ:extreme2H}) is a qualified distribution in the ambiguity set (\ref{ambiguity-set}). This can be readily seen from the constraints in (\ref{equ:extreme2H}), by which the probability distribution $\mathbb{P}^*$ identified in (\ref{equ:extreme1-1H})-(\ref{equ:extreme1-2H}) satisfies the following constraints:
  2126. %$$
  2127. %\left\{ \begin{array}{rcll}
  2128. %&& \displaystyle \sum\limits_{i\in \mathbf{N}}p^*_i + p^*_{|\mathbf{N}|+1}=1 & \\[0.3 cm]
  2129. %%&&\displaystyle \left(\frac{b_{igjk}}{p_i}\right)x^g^{k}_{ij} \le \mathcal{T}_R, \forall i \in \mathbf{N}, g \in \mathbf{G}_i, j\in \mathbf{M}^{\rm a}_{ig}, k\in \mathbf{K}^g_{i j}   \\[0.3 cm]
  2130. %%&&\displaystyle \sum_{j\in \mathbf{M}^{\rm c}_{ig}}\sum_{k\in \mathbf{K}^g_{i j}}\left(\frac{b_{igjk}}{p_i }\right) x^g^{k}_{ij} \le \mathcal{T}_R,  \forall i \in \mathbf{N}, g \in \mathbf{G}_i \\[0.3 cm]
  2131. %&&\displaystyle \underline{\mu}_{ij}\le p^*_i\left[\frac{b^{*k}_{ij}}{p^*_i}\right]+\sum_{l\in \mathbf{N}\backslash \{i\}} p^*_l\left[\frac{d^*_{lijk}}{p^*_l}\right] + p^*_{|\mathbf{N}|+1}\left[\frac{s^{*k}_{ij}}{p^*_{|\mathbf{N}|+1}}\right] \le \overline{\mu}_{ij}, \forall i \in \mathbf{N},  j\in \mathbf{M}_{i}, k\in \mathbf{K}_{i j}   \\[0.3 cm]
  2132. %&&\displaystyle \underline{z}_{ij}\le \frac{b^{*k}_{ij}}{p^*_i}\le  \overline{{z}}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{ij} \\[0.3 cm]
  2133. %&&\displaystyle \underline{z}_{lj} \le \frac{d^*_{iljk}}{p^*_i}\le  \overline{{z}}_{lj},  \forall i \in \mathbf{N}, l\in \mathbf{N}\backslash \{i\}, j\in \mathbf{M}_{l}, k\in \mathbf{K}_{lj}\\[0.3 cm]
  2134. %&&\displaystyle  \underline{{z}}_{{ij}} \le \frac{s^{*k}_{ij}}{p^*_{|\mathbf{N}|+1}}\le \overline{{z}}_{{ij}},  \forall i \in \mathbf{N}, j\in \mathbf{M}_{i}, k\in \mathbf{K}_{ij}  \\[0.3 cm]
  2135. %&&\displaystyle  p^*_i\left[\sum_{k \in \mathbf{K}_{ij}}\frac{|b^{*k}_{ij}/p^*_i-\nu_{ij}|}{|\mathbf{K}_{ij}||\sigma_{ij}|}\right]+\sum_{l\in \mathbf{N}\backslash \{i\}}p^*_l\left[\sum_{k \in \mathbf{K}_{ij}}\frac{|d^*_{lijk}/p^*_l-\nu_{ij}|}{|\mathbf{K}_{ij}||\sigma_{ij}|}\right]+&\\
  2136. %&&\displaystyle p^*_{|\mathbf{N}|+1}\left[\sum_{k \in \mathbf{K}_{ij}}\frac{|s^{*k}_{ij}/p^*_{|\mathbf{N}|+1}-\nu_{ij}|}{|\mathbf{K}_{ij}||\sigma_{ij}|}\right]\le \epsilon_{ij}, \forall i \in \mathbf{N},  j\in \mathbf{M}_{i},
  2137. %\end{array}\right\}
  2138. %$$
  2139. %which is exactly a qualified distribution in the ambiguity set $\mathbb{F}$ in (\ref{ambiguity-set}).
  2140. %% of $\P^*\!\Big[\tilde{\bm{z} }\in \mathcal{Z}_i(\x),\forall i \in \mathbf{N} \Big]$, by the duality argument, we can have the extreme probability level of the complementary event
  2141. %%$$
  2142. %%\P^*\!\Big[\cup_{i\in \mathbf{N} }\{\bm{\tilde{z}} \in \overline{\mathcal{Z}}_i(\x)\} \Big]=\P^*\!\left[\bigcup_{i\in \mathbf{N} }\Big\{z^{k}_{ij}x^{k}_{ij}\le \mathcal{T}_R, \forall j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}\Big\} \right]
  2143. %%$$
  2144. %%solves the optimization problem (\ref{equ:extreme2}). It can be seen that the problem (\ref{equ:extreme2}) solves exactly for the discrete probability distribution of $\bm{\tilde{z}}$ in (\ref{equ:extreme1-1})-(\ref{equ:extreme1-2}) with $|\mathbf{N}|+1$ scenarios that achieves the highest (extreme) probability level of event
  2145. %%$$
  2146. %%\bigcup_{i\in \mathbf{N} }\Big\{z^{k}_{ij}x^{k}_{ij}\le \mathcal{T}_R, \forall j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}\Big\}
  2147. %%$$
  2148. %%and satisfies the distributional requirement in the ambiguity set (\ref{ambiguity-set}).
  2149. %\end{proof}
  2150. %
  2151. %
  2152. %
  2153. %\section{Computational Study}\label{CS3}
  2154. %
  2155. %In this section, we demonstrate the performance of the developed models by performing extensive numerical experiments and a case study. The computational study consists of four parts: (i) Evaluating the worst-case probability level, i.e., model (\ref{HP1-ambiguity-LP-FL})-(\ref{HP2-ambiguity-LP-FL}), for a given system design under different  parameter settings. (ii) Robust reliability optimization using model (\ref{HP1-ambiguity-MILP-FL1})-(\ref{HP1-ambiguity-MILP-FL2}). (iii) Performance comparison with probabilistic RAP model using out-of-sample testing. (iv) A real-life case of hybrid cold-standby and parallel high speed train system.
  2156. %
  2157. %
  2158. %
  2159. %All experiments were run on a PC with an Intel Core(TM) i7-4510U processor at 2.60 GHz. All MIP models were coded in Python and solved using the callback library of Mosek version 8.1.0.34.
  2160. %
  2161. %
  2162. %
  2163. %
  2164. %
  2165. %
  2166. %
  2167. %
  2168. %\subsection{Computing the reliability level}\label{sec:comp1}
  2169. %In the parts (i) and (ii) of numerical experiments (Sections~\ref{sec:comp1} and~\ref{sec:comp2}), we consider the following setting: a series-parallel system with 5 subsystems ($|\mathbf{N}|=5$), and each subsystem contains 3 types of components ($|\mathbf{M}_i|\equiv|\mathbf{M}|= 3, \forall i \in \mathbf{N}$) where each type is of 3 redundant components ($|\mathbf{K}_{ij}|\equiv 3, \forall i \in \mathbf{N}, j\in \mathbf{M}_i$). Furthermore, we set the distributional parameters of component lifetimes $([\underline{\hmu}, \overline{\hmu}], [\underline{\bm{z}}, \overline{\bm{z}}], \bm{\hsigma}, \hepsilon)$ in the ambiguity set $\mathbb{F}$  as follows.
  2170. %
  2171. %We utilize parameters $\chi \ge 0$ and $t \ge 0$ to adjust the gap level of $[\underline{\mu}_{ij}, \overline{\mu}_{ij}]$ and $[\underline{z}_{ij}, \overline{z}_{ij}]$, respectively, as follows
  2172. %$$
  2173. %\underline{\mu}_{ij}=\underline{\mu}^o_{ij}-\chi(\overline{\mu}^o_{ij}-\underline{\mu}^o_{ij}),\quad \overline{\mu}_{ij}=\overline{\mu}^o_{ij}+\chi(\overline{\mu}^o_{ij}-\underline{\mu}^o_{ij})
  2174. %$$
  2175. %where $\underline{\mu}^o_{ij} \in \mathbf{U}[10,13], \overline{\mu}^o_{ij} \in \mathbf{U}[15,18], \forall i \in [5], j \in [3]$, and
  2176. %$$
  2177. %\underline{z}_{ij}=\underline{z}^o_{ij}-t(\overline{z}^o_{ij}-\underline{z}^o_{ij}),\quad \overline{z}_{ij}=\overline{z}^o_{ij}+t(\overline{z}^o_{ij}-\underline{z}^o_{ij}),
  2178. %$$
  2179. %where $\underline{z}^o_{ij} \in \mathbf{U}[5,10], \overline{z}^o_{ij} \in \mathbf{U}[30,50], \forall i \in [5], j \in [3]$. Also we let
  2180. %$$\sigma_{ij}=\kappa \sigma^o_{ij}, \quad \epsilon_{ij}=\theta\epsilon^o_{ij},$$
  2181. %where $\sigma^o_{ij} \in \mathbf{U}[4,5], \epsilon^o_{ij} \in \mathbf{U}[0, 0.5], \forall i\in [5], j\in [3]$ and $\kappa, \theta \ge 0$ are the adjustable parameters. Therefore, by changing the values of $(\chi,t,\kappa,\theta)$, we can have different sets of parameters $([\underline{\hmu}, \overline{\hmu}], [\underline{\bm{z}}, \overline{\bm{z}}], \bm{\hsigma}, \hepsilon)$. Specifically, large values of $(\chi,t)$ correspond to the large gaps of $[\underline{\mu}_{ij}, \overline{\mu}_{ij}]$ and $[\underline{z}_{ij}, \overline{z}_{ij}], \forall i \in [5], j \in [3]$, while large values of $(\kappa,\theta)$ correspond to large values of $\sigma_{ij}$ and $\epsilon_{ij}, \forall i \in [5], j \in [3].$ Finally, we set the system lifetime requirement $\mathcal{T}_R \in [16, 19]$.
  2182. %
  2183. %
  2184. %In this first part of numerical experiments, we fix a system design $\x$, such that
  2185. %$$
  2186. %\left[\sum_{k \in \mathbf{K}_{ij}}x^{k}_{ij}\right]_{|\mathbf{N}||\mathbf{M}|}=\left[
  2187. %\begin{matrix}
  2188. %1 & 1 & 1  \\
  2189. %1 & 1 & 1 \\
  2190. %2 & 1 & 1 \\
  2191. %1 & 1 & 1  \\
  2192. %1 & 2 & 2  \\
  2193. %\end{matrix}
  2194. %\right],
  2195. %$$
  2196. %where each subsystem (out of five) is assigned with one or two components for each type (out of three). Using model (\ref{HP1-ambiguity-LP-FL})-(\ref{HP2-ambiguity-LP-FL}), we compute the worst-case probability level (\ref{Prob-1}) with different settings of  parameters $([\underline{\hmu}, \overline{\hmu}], [\underline{\bm{z}}, \overline{\bm{z}}], \bm{\hsigma}, \hepsilon)$ in the ambiguity set $\mathbb{F}$ and the parameter $\mathcal{T}_R$ of system lifetime requirement. The worst-case probability levels under different testing purposes are provided in the Figure~\ref{fig:exp-1}.
  2197. %
  2198. %
  2199. %
  2200. %
  2201. %\begin{figure}[h!]
  2202. %   \centering
  2203. %   \includegraphics[width=1\linewidth]{exp-set-1}
  2204. %   \caption{\footnotesize The worst-case probability (reliability) level of a given system design with different settings of distributional parameters $([\underline{\hmu}, \overline{\hmu}], [\underline{\bm{z}}, \overline{\bm{z}}], \bm{\hsigma}, \hepsilon)$ and system lifetime requirement level $\mathcal{T}_R$. (a) The probability level vs. gap of $[\underline{\hmu}, \overline{\hmu}]$ under different $\mathcal{T}_R$ levels, where the gap enlarges as the parameter $\chi$ (`chi') increases. (b) The probability level vs. gap of $[\underline{\bm{z}}, \overline{\bm{z}}]$ under different $\mathcal{T}_R$ levels, where the gap enlarges as the parameter $t$ (`psi') increases. (c) The probability level vs. values of $\hsigma$ under different $\mathcal{T}_R$ levels, where the values of $\hsigma$ increase as the parameter $\kappa$ (`kappa') increases. (d) The probability level vs. values of $\hepsilon$ under different $\mathcal{T}_R$ levels, where the values of $\hepsilon$ increase as the parameter $\theta$ (`theta') increases.}
  2205. %   \label{fig:exp-1}
  2206. %\end{figure}
  2207. %
  2208. %The observations of the experimental results are the following:
  2209. %(i) the worst-case probability (reliability) level decreases when the variation range $[\underline{\hmu}, \overline{\hmu}]$ of expected lifetimes increases (Figure~\ref{fig:exp-1}-(a)), the variation range $[\underline{\bm{z}}, \overline{\bm{z}}]$ of the lifetimes increases (Figure~\ref{fig:exp-1}-(b)), or the difference in average among the lifetimes of the same type of components enlarges (Figure~\ref{fig:exp-1}-(c) and (d)). Such a decreased probability is due to the enlarged ambiguity set $\mathbb{F}$ resulting from the changes of the above distributional parameters $([\underline{\hmu}, \overline{\hmu}], [\underline{\bm{z}}, \overline{\bm{z}}], \bm{\hsigma}, \hepsilon)$. (ii) Intuitively, the probability level also decreases if the system lifetime requirement $\mathcal{T}_R$ is increased.
  2210. %(iii) The results in Figure~\ref{fig:exp-1} also provide the variation limits for the distributional parameters for an aspirational probability level with the given system lifetime requirement $\mathcal{T}_R$. For instance, if the decision maker needs the (worst-case) probability higher than 0.8 for system lifetime above $\mathcal{T}_R=16.2$, then the components lifetime range $[\underline{\bm{z}}, \overline{\bm{z}}]$ cannot be larger than the case such that parameter $t \ge 0.01$.
  2211.  
  2212.  
  2213. %\subsubsection{Experiment on $\underline{z}$ and $\overline{z}$}
  2214. %In this subsection, we do experiments with different dispersion levels between the parameters $\bm{\underline{z}}$ and $\bm{\overline{z}}$. For simplicity, we also introduce an auxiliary variable $t\in[1,+\infty)$, and control the values of specified parameters as follows:
  2215. %\begin{eqnarray}
  2216. %\widetilde{\underline{z}}^{k}_{ij}' = \frac{1}{t} * \widetilde{\underline{z}}^{k}_{ij}&&\forall i\in I, j_i\in J_i, k_{ij}\in N_{ij}
  2217. %\end{eqnarray}
  2218. %\begin{eqnarray}
  2219. %\widetilde{\overline{z}}^{k}_{ij}' = t * \widetilde{\overline{z}}^{k}_{ij}&&\forall i\in I, j_i\in J_i, k_{ij}\in N_{ij}
  2220. %\end{eqnarray}
  2221. %where $\widetilde{\underline{z}}^{k}_{ij}'$ and $\widetilde{\overline{z}}^{k}_{ij}'$ are the experimental observation values of parameters $\underline{z}_{ij}$ and $\overline{z}_{ij}$, so the greater the value of $t$, the greater the dispersion level between $\underline{z}_{ij}$ and $\overline{z}_{ij}$. The comparison is shown in Figure~\ref{fig:xi}.
  2222. %
  2223. %\begin{figure}[h!]
  2224. %   \centering
  2225. %   \includegraphics[width=0.5\linewidth]{xi}
  2226. %   \caption[Obj value under $t$]{The objective values under different values of $t\in[1,2]$ for five $\mathcal{T}_R$ value cases ($\mathcal{T}_R\in\{15, 20, 25, 30, 35\}$); in each case, the result shows that the greater the dispersion level between $\bm{\underline{z}}$ and $\bm{\overline{z}}$, the smaller the objective value.}
  2227. %   \label{fig:xi}
  2228. %\end{figure}
  2229. %
  2230. %\subsubsection{Experiment on $\mathcal{T}_R$}
  2231. %In this subsection, we do experiments with different $\mathcal{T}_R$ in candidate set $\{0, 5, 10, ..., 80, 85, 90\}$, the result is shown in Figure~\ref{fig:dl}.
  2232. %
  2233. %\begin{figure}[h!]
  2234. %   \centering
  2235. %   \includegraphics[width=0.5\linewidth]{Dl}
  2236. %   \caption[Obj value under $\mathcal{T}_R$]{The objective values under different values of parameter $\mathcal{T}_R$: as the value of $\mathcal{T}_R$ increases from 0 to 90, the objective value decreases from 1 to 0.}
  2237. %   \label{fig:dl}
  2238. %\end{figure}
  2239. %
  2240. %\subsubsection{Experiment on $\hsigma$}
  2241. %In this subsection, we do experiments with different value of $\bm{\sigma}$. For simplicity of experiments, we introduce an
  2242. %auxiliary variable $\kappa\in(0,+\infty)$ to control the value of $\sigma_{ij}$, $\forall i\in I$, $j_i\in J_i$:
  2243. %\begin{eqnarray}
  2244. %\widetilde{\sigma}_{ij}' = \kappa * \widetilde{\sigma}_{ij}&&\forall i\in I, j_i\in J_i, k_{ij}\in N_{ij}
  2245. %\end{eqnarray}
  2246. %where $\widetilde{\sigma}_{ij}'$ is the experiment observation values of parameters $\sigma_{ij}$ and $\widetilde{\sigma}_{ij}$ is the initial setting value of $\sigma_{ij}$.
  2247. %The result is shown in Figure~\ref{fig:sigma}.
  2248. %\begin{figure}[h!]
  2249. %   \centering
  2250. %   \includegraphics[width=0.5\linewidth]{sigma}
  2251. %   \caption[Obj value under $\bm{\sigma}$]{The objective values under different value of $\kappa\in[0,2.5]$ of nine $\mathcal{T}_R$ value cases ($\mathcal{T}_R\in\{10, 20, 30, 40, 50, 60, 70, 80, 90\}$), in each case, the result shows that the greater value of $\bm{\sigma}$ corresponds to the smaller objective value. All objective values will be 1 if $\mathcal{T}_R<10$.}
  2252. %   \label{fig:sigma}
  2253. %\end{figure}
  2254. %
  2255. %\subsubsection{Experiment on $\epsilon$}
  2256. %In this section, we do experiments with different values of parameter $\bm{\epsilon}$. Here we set
  2257. %$\epsilon_{ij}\in\{0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5\}$, $\forall i\in I$, $j_i\in J_i$ for simplicity and clarity. The results are shown in Figure~\ref{fig:epsilon}.
  2258. %\begin{figure}[h!]
  2259. %   \centering
  2260. %   \includegraphics[width=0.5\linewidth]{epsilon}
  2261. %   \caption[Obj value under $\bm{\epsilon}$]{The objective values under different value of $\epsilon_{ij}$, $\forall i\in I$, $j_i\in J_i$ of nine $\mathcal{T}_R$ value cases ($\mathcal{T}_R\in\{10, 20, 30, 40, 50, 60, 70, 80, 90\}$), in each case, the result shows that the greater value of $\epsilon_{ij}$ corresponds to the smaller objective value.}
  2262. %   \label{fig:epsilon}
  2263. %\end{figure}
  2264. %%
  2265. %
  2266. %
  2267.  
  2268.  
  2269. %\subsection{Reliability Optimization}\label{sec:comp2}
  2270. %Next we perform the robust redundancy optimization using  model (\ref{HP1-ambiguity-MILP-FL1})-(\ref{HP1-ambiguity-MILP-FL2}) to find the optimal system design $\x^*$ with different combinations $(\mathcal{T}_R,R_0)$ of system lifetime requirement level and reliability requirement level. Specifically, we set the minimum number of components of each type as $L_{ij}=1, \forall i \in [5], j \in [3]$, and allow the $\mathcal{T}_R$ to vary from 16.5 to 19.0, and $R_0$ from 0.1 to 0.9.  The  optimal (minimal) cost is plotted in Figure~\ref{fig:exp-2}.
  2271. %\begin{figure}[h!]
  2272. %   \centering
  2273. %   \includegraphics[width=0.6\linewidth]{Figure_R0}
  2274. %   \caption{\footnotesize  Optimal cost with different system lifetime requirement $\mathcal{T}_R$ and reliability requirement $R_0$, where the vertical line implies the case of infeasibility.}
  2275. %   \label{fig:exp-2}
  2276. %\end{figure}
  2277. %
  2278. %From the results, the optimal cost and the number of components increase as $\mathcal{T}_R$ or $R_0$ increases. In particular, Figure~\ref{fig:exp-2} implies that, for each system lifetime requirement $\mathcal{T}_R$, there are several threshold levels of system reliability requirement $R_0$ beyond which no system design would be feasible. For instance, for $\mathcal{T}_R=18$, there is no feasible system design for the reliability level $R_0>0.7$, while when $\mathcal{T}_R$ is raised to $18.5$, no feasible system design can be found even for the reliability level $R_0>0.2$. In other words, Figure~\ref{fig:exp-2} provides the physical limits of both the lifetime and reliability requirements for the system to meet, under the lifetime distributional ambiguity of components.
  2279. %
  2280. %%On the other hand, the Table~\ref{table-design} provides with the system design information (and also the cost information) with different combinations of $(\mathcal{T}_R, R_0)$. The designer can then choose the most suitable system designs with these information.
  2281. %%\begin{table}[h]\scriptsize%\tiny%\footnotesize
  2282. %%\begin{center}
  2283. %%  \caption{\footnotesize The optimal system redundancy design $\x^*$: the total number of components (of all types) in each  subsystem along with different system lifetime requirement $\mathcal{T}_R$ and reliability requirement $R_0$, where `$-$' indicates a no-solution case due to the overly high $\mathcal{T}_R$ and/or $R_0$. }\label{table-design}
  2284. %%  \begin{tabular}{|c|c||cccccccc c|}
  2285. %%      \hline
  2286. %%  %   \rule{0pt}{12pt} & && & & & $R_0$ & & & & \\[0pt]       \hline
  2287. %%      \multirow{2}{*}{\scriptsize  System Lifetime $\mathcal{T}_R$} & \multirow{2}{*}{\scriptsize  Subsystem $i$} & \multicolumn{9}{c|}{$R_0$}\\
  2288. %%\cline{3-11}
  2289. %%        && 0.1 & 0.2 & 0.3 & 0.4 & 0.5 & 0.6 & 0.7 & 0.8 & 0.9 \\[0pt]       
  2290. %%\hline
  2291. %%      % DL = 16.5
  2292. %%                  &1& 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 4 \\[0pt]
  2293. %%                &2& 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 \\[0pt]
  2294. %%        $16.5$  &3& 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 \\[0pt]
  2295. %%                &4& 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 \\[0pt]
  2296. %%                &5& 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 \\
  2297. %%      \hline
  2298. %%        % \multicolumn{2}{|c|}{Cost} &&&&&&&&--&-- \\ \hline
  2299. %%      % DL = 17.0
  2300. %%                &1 & 3 & 3 & 4 & 3 & 4 & 5 & 4 & \multirow{5}{*}{--} & \multirow{5}{*}{--}  \\[0pt]
  2301. %%                &2 & 3 & 3 & 3 & 4 & 4 & 4 & 5 &   &   \\[0pt]
  2302. %%       $17.0$   &3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 &   &   \\[0pt]
  2303. %%                &4 & 3 & 3 & 3 & 3 & 3 & 3 & 3 &   &   \\[0pt]
  2304. %%                &5 & 3 & 3 & 3 & 3 & 3 & 3 & 4 &   &   \\
  2305. %%      \hline
  2306. %%      %\multicolumn{2}{|c|}{Cost} &&&&&&&&--&-- \\ \hline
  2307. %%                &1& 3 & 3 & 4 & 4 & 4 & 4 & 5 & \multirow{5}{*}{--} & \multirow{5}{*}{--} \\[0pt]
  2308. %%                &2& 3 & 3 & 3 & 3 & 4 & 4 & 5 &   &   \\[0pt]
  2309. %%        $17.5$  &3& 3 & 3 & 3 & 3 & 3 & 3 & 3 &   &   \\[0pt]
  2310. %%                &4& 3 & 3 & 3 & 3 & 3 & 3 & 3 &   &   \\[0pt]
  2311. %%                &5& 3 & 3 & 3 & 3 & 3 & 4 & 4 &   &   \\
  2312. %%      \hline
  2313. %%      %\multicolumn{2}{|c|}{Cost} &&&&&&&&--&-- \\ \hline
  2314. %%                &1& 3 & 4 & 3 & 4 & 5 & 5 & 5 & \multirow{5}{*}{--}  & \multirow{5}{*}{--} \\[0pt]
  2315. %%                &2& 3 & 3 & 4 & 4 & 4 & 4 & 5 &   &   \\[0pt]
  2316. %%       $18.0$   &3& 3 & 3 & 3 & 3 & 3 & 3 & 3 &   &   \\[0pt]
  2317. %%                &4& 3 & 3 & 3 & 3 & 3 & 3 & 3 &   &   \\[0pt]
  2318. %%                &5& 3 & 3 & 3 & 3 & 3 & 4 & 5 &   &   \\
  2319. %%      \hline
  2320. %%      %\multicolumn{2}{|c|}{Cost} &&&&&&&&--&-- \\ \hline
  2321. %%                &1& 4 & 5 & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} \\[0pt]
  2322. %%                &2& 5 & 5 &   &   &   &   &   &   &   \\[0pt]
  2323. %%       $18.5$   &3& 5 & 5 &   &   &   &   &   &   &   \\[0pt]
  2324. %%                &4& 4 & 5 &   &   &   &   &   &   &   \\[0pt]
  2325. %%                &5& 4 & 5 &   &   &   &   &   &   &   \\
  2326. %%      \hline
  2327. %%%\multicolumn{2}{|c|}{Cost} &&&&&&&&--&-- \\ \hline
  2328. %%                &1& 5 & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} & \multirow{5}{*}{--} \\[0pt]
  2329. %%                &2& 5 &   &   &   &   &   &   &   &   \\[0pt]
  2330. %%        $19.0$  &3& 5 &   &   &   &   &   &   &   &   \\[0pt]
  2331. %%                &4& 5 &   &   &   &   &   &   &   &   \\[0pt]
  2332. %%                &5& 5 &   &   &   &   &   &   &   &   \\
  2333. %%      \hline
  2334. %%%\multicolumn{2}{|c|}{Cost} &&&&&&&&--&-- \\ \hline
  2335. %%  \end{tabular}
  2336. %%\end{center}
  2337. %%\end{table}
  2338. %
  2339. %%\subsubsection{Performance comparison with probabilistic reliability model}
  2340. %%\begin{equation}\label{RO-Model}
  2341. %%\begin{array}{rcll}
  2342. %%& \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[ \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\right]c_{ij} \\[0.3 cm]
  2343. %%&{\rm s.t.} &  \min\limits_{i \in \mathbf{N}} \left(\max\limits_{j \in \mathbf{M}_i}  \max\limits_{k \in \mathbf{K}_{ij}}   z^{k}_{ij} x^{k}_{ij}\right)\ge \mathcal{T}_R   & \forall \bm{z} \in \mathcal{U}(\Gamma) \\[0.3 cm]
  2344. %%&&  \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{ij}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i\\
  2345. %%&& x^{k}_{ij} \in \{0,1\},  & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k \in \mathbf{K}_{ij}.
  2346. %%\end{array}
  2347. %%\end{equation}
  2348. %%The above budgeted robust convex (nonlinear) optimization model is in general intractable even when uncertainty set $\mathcal{U}(\Gamma)$ is a general polytope, by noting that the robust counterpart of convex (piecewise linear) constraints
  2349. %%$$
  2350. %%\max\limits_{j \in \mathbf{M}_i}  \max\limits_{k \in \mathbf{K}_{ij}}   z^{k}_{ij} x^{k}_{ij} \ge \mathcal{T}_R, \forall i \in \mathbf{N}
  2351. %%$$
  2352. %%cannot be tractably dualized.  Nevertheless, under the independence condition of component lifetimes $\tilde{z}^{k}_{ij}, \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k \in \mathbf{K}_{ij}$, if we define
  2353. %%$$
  2354. %%\mathcal{U}(\gamma):=\left\{\tilde{z}^{k}_{ij}\in \Big[\underline{{z}}_{ij}(\gamma), \overline{{z}}_{ij}(\gamma) \Big], \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k \in \mathbf{K}_{ij} \right\},
  2355. %%$$
  2356. %%then the above regular robust reliability model can be equivalently expressed as the following MIP problem:
  2357. %%\begin{equation}\label{RO-Model}
  2358. %%\begin{array}{rcll}
  2359. %%& \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[ \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\right]c_{ij} \\[0.3 cm]
  2360. %%&{\rm s.t.} &   \max\limits_{j \in \mathbf{M}_i}  \max\limits_{k \in \mathbf{K}_{ij}}  \underline{{z}}_{ij}(\gamma) x^{k}_{ij} \ge \mathcal{T}_R,   & \forall i \in \mathbf{N} \\[0.3 cm]
  2361. %%&&  \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{ij}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i\\
  2362. %%&& x^{k}_{ij} \in \{0,1\},  & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k \in \mathbf{K}_{ij}.
  2363. %%\end{array}
  2364. %%\end{equation}
  2365. %\subsection{Out-of-sample performance comparison with the probabilistic model}\label{sec:test-1}
  2366. %To further illustrate the performance of our robust reliability model, we compare the design obtained from the proposed robust redundancy optimization  model (\ref{HP1-ambiguity-MILP-FL1})-(\ref{HP1-ambiguity-MILP-FL2}), termed the {\em robust design}, with the design of the probabilistic redundancy optimization model, termed the {\em probabilistic design}. When the situation involves multiple types (i.e. $|\mathbf{M}_i|>1$), or both the cold-standby and active parallel redundant subsystems are considered, the probabilistic model in general becomes intractable (as mentioned in the sections of Introduction and Literature Review). Therefore, for a fair comparison, we consider a series-parallel system with $|\mathbf{M}_i|=1, \forall i \in [5]$, which preserves a linear MIP formulation. For a coherent exposition of the experimental study, we place the details of the probabilistic redundancy model as well as its MIP transformation in the Appendix.
  2367. %
  2368. %In particular, we first randomly generate lifetime samples (size=100), and then compute the probability levels $\P[\tilde{z}_{i}\le \mathcal{T}_R ], \forall i \in [5]$ and the parameters $([\underline{\hmu}, \overline{\hmu}], [\underline{\bm{z}}, \overline{\bm{z}}], \bm{\hsigma}, \hepsilon)$ from the generated lifetime samples  for parameter inputs of probabilistic and robust models, respectively, where $R_0=0.9$ and $\mathcal{T}_R=18,19$ and $20$. We obtain probabilistic and robust designs by solving the respective redundancy models. Then we perform 1000 sets of out-of-sample experiments where in each set, 1000 lifetime data points are generated using the sample mean and StD of the original lifetime samples. The out-of-sample system lifetimes as well as its mean values of both probabilistic and robust designs are compared and plotted in Figure~\ref{Fig:lifetime-compare1}, and the comparison of out-of-sample reliability levels is provided in Table~\ref{tab-compare2}.
  2369. %
  2370. %
  2371. %
  2372. %
  2373. %
  2374. %
  2375. %\begin{figure}[h*]
  2376. %  \centering
  2377. %  \subfigure[]{\includegraphics[scale=0.35]{lifetime-1}}\!\!\!\!\!\!\!\!
  2378. %  \subfigure[]{\includegraphics[scale=0.35]{lifetime-2}}\!\!\!\!\!\!\!\!
  2379. %  \subfigure[]{\includegraphics[scale=0.35]{lifetime-3}}\!\!\!\!\!\!\!\!
  2380. %  \subfigure[]{\includegraphics[scale=0.35]{mean-lifetime-1}}\!\!\!\!\!\!\!\!
  2381. %  \subfigure[]{\includegraphics[scale=0.35]{mean-lifetime-2}}\!\!\!\!\!\!\!\!
  2382. %  \subfigure[]{\includegraphics[scale=0.35]{mean-lifetime-3}}\!\!\!\!\!\!\!\!
  2383. %\caption{\footnotesize (a)-(c) The out-of-sample system lifetime scenarios (in 1 set of experiments with 1000 lifetime data) of probabilistic and robust design under $\mathcal{T}_R=18,19$ and $20$, respectively.
  2384. %(d)-(f) The (sample) mean values of out-of-sample system lifetimes (in 1000 sets of experiments) of probabilistic and robust design under $\mathcal{T}_R=18,19$ and $20$, respectively.}
  2385. %\label{Fig:lifetime-compare1}
  2386. %\end{figure}
  2387. %
  2388. %
  2389. %
  2390. %\begin{table}[h*]\footnotesize%\small%
  2391. %\caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability levels comparison ($R_0=0.9$), where the `Design' specifies the number of redundant components allocated in each of 5 subsystems, `P-Model' and `C-DRO-Model' refer to the probabilistic model and conditionally robust model, respectively. }
  2392. %\begin{center}
  2393. %\begin{tabular}{|c|c|| c| c |c |c   |}\hline
  2394. %    $\mathcal{T}_R$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  2395. %     \hline
  2396. %    \multirow{2}{*}{18}& P-Model & (1,1,1,1,1) & \multirow{2}{*}{$R_0=0.9$}& 0.617 & 0.015   \\
  2397. %     & C-DRO-Model & (2,1,2,2,2) &  &0.873 & 0.010  \\
  2398. %\hline
  2399. %    \multirow{2}{*}{19}& P-Model & (2,1,1,1,1)&\multirow{2}{*}{$R_0=0.9$} & 0.564 & 0.015   \\
  2400. %     & C-DRO-Model & (2,1,2,2,2) && 0.884 & 0.009  \\
  2401. %\hline
  2402. %    \multirow{2}{*}{20}& P-Model & (2,1,1,3,2) &\multirow{2}{*}{$R_0=0.9$}& 0.554 & 0.016   \\
  2403. %     & C-DRO-Model & (2,2,3,3,2) && 0.909 & 0.009  \\
  2404. %\hline
  2405. %\end{tabular}
  2406. %\end{center}
  2407. %\end{table}
  2408. %
  2409. %
  2410. %From the comparative results in Figure~\ref{Fig:lifetime-compare1} and Table~\ref{tab-compare2}, the robust designs in all the different system lifetime requirements ($\mathcal{T}_R=18,19$ and $20$)  provide more reliable performance profiles than the probabilistic designs, in protecting the reliability level $R_0$. Specifically, in each set of out-of-sample tests, compared with the probabilistic design, the robust design pulls the sample system lifetimes (Figure~\ref{Fig:lifetime-compare1}:(a)-(c)) and sample mean of the system lifetimes (Figure~\ref{Fig:lifetime-compare1}:(d)-(f)) further away from the lifetime requirement $\mathcal{T}_R$ to the right side, forming a wider safety gap which is able to absorb more lifetime uncertainty. Furthermore, from Table~\ref{tab-compare2}, the sample mean of reliability levels achieved by the robust design is $0.873, 0.884$ and $0.909$ under different $\mathcal{T}_R$ levels, which is very close to or exceeds the designed level $R_0=0.9$ of reliability requirement. In contrast, the probabilistic design can only achieve $0.617, 0.564$ and $0.554$ at the respective $\mathcal{T}_R$ levels, which fail to meet the designed reliability level. Also, reliability levels achieved by the robust design have a smaller variance than those of the probabilistic design.
  2411. %On the other hand, although it is noted from the specifications of the `Design' column in Table~\ref{tab-compare2} that the higher performance achieved by the robust design is at the cost of using more components, our distributional robust design is not {\em ad-hoc}, but is calibrated by the distributional information of component lifetimes as given in the ambiguity set. In other words, it provides a more informed decision of redundancy allocation that can ensure the designed system reliability level given the component lifetime distributions being consistent to the characteristics of the ambiguity set.
  2412. %
  2413. %
  2414. %
  2415. %The implications of the comparison is that when the component lifetimes are highly uncertain, it could be risky to implement the probabilistic design (from the redundancy model that assumes known component reliability levels), since the actual reliability level achieved could be far below the designed level. In this situation, the robust design, albeit with more conservative redundancy allocation, is able to protect (or at least be very close to) the designed reliability level. Hence, it effectively reduces the risk of the system being `unreliable'.
  2416. %%\begin{eqnarray*}
  2417. %%\begin{array}{rcll}
  2418. %%& \min\limits_{\y} &  \sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[y_{ij}\right]c_{ij} \\[0.3 cm]
  2419. %%& {\rm s.t.} &  \displaystyle \prod_{i \in \mathbf{N}}\left[1-\prod_{j \in \mathbf{M}_i}(r_{ij})^{y_{ij}}\right]\ge R_0  & \\[0.3 cm]
  2420. %%&& L_{ij}\le y_{ij}\le U_{ij}, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i\\
  2421. %%&& y_{ij} \in \mathbb{Z}_+,  & \forall i \in \mathbf{N}, j\in \mathbf{M}_i,
  2422. %%\end{array}
  2423. %%\end{eqnarray*}
  2424. %%where
  2425. %%$$
  2426. %%r_{ij}=\P\Big[\tilde{z}_{ij}\le \mathcal{T}_R \Big],
  2427. %%$$
  2428. %%which can be estimated from the data. By transforming the integer variable $y_{ij}$ with binaries $x^{k}_{ij}$:
  2429. %%$$
  2430. %%y_{ij}=L_{ij}+\sum_{k=0}^{U_{ij}-L_{ij}}kx^{k}_{ij},~\mbox{with}~\sum_{k=0}^{U_{ij}-L_{ij}}x^{k}_{ij}=1
  2431. %%$$
  2432. %%The above model can be readily linearized as the following MIP with binaries:
  2433. %%\begin{eqnarray}
  2434. %%\begin{array}{rcll}
  2435. %%& \min\limits_{\x} & \displaystyle \sum\limits_{i\in \mathbf{N}} \sum\limits_{j \in \mathbf{M}_i}\left[L_{ij}+\sum_{k=0}^{U_{ij}-L_{ij}}kx^{k}_{ij}\right]c_{ij} \\[0.3 cm]
  2436. %%& {\rm s.t.} &  \displaystyle \sum_{i \in \mathbf{N}} \sum_{k=0}^{U_{ij}-L_{ij}}x^{k}_{ij}\ln\left[1-\prod_{j \in \mathbf{M}_i} (r_{ij})^{L_{ij}+k} \right]\ge \ln R_0  & \\[0.3 cm]
  2437. %%&& \displaystyle \sum_{k=0}^{U_{ij}-L_{ij}}x^{k}_{ij}=1, & \forall i \in \mathbf{N}, j\in \mathbf{M}_i\\
  2438. %%&& x^{k}_{ij} \in \{0,1\},  & \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k \in \mathbf{K}_{ij}
  2439. %%\end{array}
  2440. %%\end{eqnarray}
  2441. %
  2442. %
  2443. %
  2444. %
  2445. %
  2446. %\subsection{A case of air brake system in high speed train}\label{sec:rlcs}
  2447. %A high speed train typically consists of several electric multiple units (EMUs) which are self-propelled carriages using electricity as the motive power. The air brake system is one critical subsystem of the EMU \cite{Hasegawa1999}. As shown in Figure \ref{figure-cs}, it mainly consists of one air compressor which generates compressed air, one check valve which ensures single direction of the airflow, one main air reservoir which stores the compressed air for braking operation, one relay valve which controls the air pressure according to the received signal and one boosting cylinder which transfers the air pressure to the amplified oil pressure to activate the basic braking mechanisms to slow down or stop the unit \cite{Hasegawa1999,Cheng2009}.
  2448. %
  2449. %\begin{figure}[h]
  2450. %\centering
  2451. %\includegraphics[scale=0.5]{figure-case-study.pdf}
  2452. %\caption{\small Diagram of the air brake system of an EMU}
  2453. %\label{figure-cs}
  2454. %\end{figure}
  2455. %
  2456. %The high speed train is highly reliability-demanding, since one small accident would result in a large number of passenger life losses given a high operational speed. To enhance the reliability of the air brake system, different types of redundancies, e.g. cold standby or hot standby, need to be introduced. For the two valves, cold standbys are used, whereas for the remaining components hot standbys are needed. The parameters of the various components are presented in Table \ref{tab-cs}, which are modified based on the real data from a railway company in Europe, due to a confidentiality agreement. Due to the space limit, the maximal number of components connected in parallel cannot exceed 4 and there must be at least one component in each subsystem. This system is required to operate without failure for 15--20 years at a probability of 0.95--0.99.
  2457. %
  2458. %\begin{table}[t*]\scriptsize%\footnotesize%\small%
  2459. %\caption{\label{tab-cs} \footnotesize  Lifetime parameters of the available components for air brake system.}
  2460. %\begin{center}
  2461. %\begin{tabular}{|c| c | c || c c c  c |}\hline
  2462. %     Subsystem & Redundancy Strategy & Component Type & $\underline{z}_{ij}$ (yrs) & $\overline{z}_{ij}$ (yrs) &$\mu_{ij}$ (yrs) &  $c_{ij}$ (k\$) \\
  2463. %     \hline
  2464. %                    &\multirow{5}{*}{Active redundant} & 1 & 10 & 23 &15 & 10.0  \\
  2465. %                    & &2 & 12 & 21 &16& 15.0 \\
  2466. %     Air compressor & &3 & 16 & 24 &20& 17.0   \\
  2467. %                    & &4 & 17 & 25 &22& 25.0  \\
  2468. %                    & &5 & 20 & 26 &23& 32.0 \\
  2469. %     \hline
  2470. %                    &\multirow{5}{*}{Cold Standby} &1 & 4 & 6 &10& 1.0   \\
  2471. %                    & &2 & 6 & 10.5 &8& 1.5 \\
  2472. %      Check valve   & &3 & 7.5 & 11 &9& 2.0   \\
  2473. %                    & &4 & 8.5 & 12 &10& 2.5 \\
  2474. %                    & &5 & 15 & 17.5 &16& 3.0   \\
  2475. %     \hline
  2476. %                    &\multirow{6}{*}{Cold Standby} &1 & 4.5 & 8 &6& 1.5  \\
  2477. %                    & &2 & 6 & 9 &7& 2.0   \\
  2478. %   Control valve    & &3 & 7.5 & 10 &9& 2.5  \\
  2479. %                    & &4 & 8 & 11 &10& 3.0  \\
  2480. %                    & &5 & 10 & 12 &10.5& 3.5  \\
  2481. %                    & &6 & 10.5 & 12.5 &12& 4.0  \\
  2482. %\hline
  2483. %                    &\multirow{5}{*}{Active redundant} &1  & 15  & 21 &18& 15.0 \\
  2484. %                    & &2  & 17  & 22 &20& 20.5 \\
  2485. % Main air reservoir && 3  & 18  & 23 &21& 23.5 \\
  2486. %                    & &4  & 19  & 26 &23& 25.5 \\
  2487. %                    & &5  & 20  & 27 &25& 30.0 \\
  2488. % \hline
  2489. %                    &\multirow{5}{*}{Active redundant} &1  & 14  & 20 &16& 20.0 \\
  2490. %                    & &2  & 16  & 21 &18& 22.5 \\
  2491. %  Boosting cylinder & &3  & 18  & 24 &22& 25.5 \\
  2492. %                    & &4  & 20  & 28 &24& 30.5 \\
  2493. %                    & &5  & 25  & 31 &28& 34.0 \\
  2494. %\hline
  2495. %\end{tabular}
  2496. %\end{center}
  2497. %\end{table}
  2498. %
  2499. %Implementing the proposed robust redundancy allocation model, we can obtain the design table (Table~\ref{d-table}) which provides with the system design information (and also the cost information) with different combinations of $(\mathcal{T}_R, R_0)$. The designer can then choose the most suitable system designs with these information. From Table~\ref{d-table}, we see that as the system lifetime requirement $\mathcal{T}_R$ or system reliability level $R_0$ increases, the design allocates more components (e.g. the number of components allocated to `Air compressor' is increased from 1 to 2 as $\mathcal{T}_R$ increases from 19 yrs to 20 yrs, under $R_0=0.95$) and/or shifts the components to more reliable ones (with longer expected lifetimes, e.g. the 1 component allocated to `Main air reservoir' is shifted from  Type 2 to Type 5, as $\mathcal{T}_R$ increases from 16 yrs to 19 yrs, under $R_0=0.99$).
  2500. %
  2501. %\begin{sidewaystable}[!htbp]\scriptsize%\footnotesize%\small%
  2502. %\caption{\label{tab-cs} \footnotesize  Design table for the air brake system: The number of components of each allocated in each subsystem for $R_0=0.95, 0.97$ and $0.99$, and $\mathcal{T}_R=15-20$ yrs, where no feasible design can be found for $R_0=0.99, \mathcal{T}_R=20$.}\label{d-table}
  2503. %\begin{center}
  2504. %\begin{tabular}{|c| c|| cccccc || cccccc || cccccc |}\hline
  2505. %\multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{6}{c||}{$R_0=0.95$,~$\mathcal{T}_R$ (yrs) }& \multicolumn{6}{c||}{$R_0=0.97$,~$\mathcal{T}_R$ (yrs)}& \multicolumn{6}{c|}{$R_0=0.99$,~$\mathcal{T}_R$ (yrs)} \\
  2506. % \cline{3-20}
  2507. %      &&15 & 16 & $17$ &  $18$& $19$& $20$&$15$ & $16$ & $17$ &  $18$& $19$& $20$&$15$ & $16$ & $17$ &  $18$& $19$ & 20 \\
  2508. %     \hline
  2509. %                    & 1 & 0 & 0 &0 & 0 & 0& 0   & 0 & 0 &0 & 0& 0 & 0   & 0 & 0 &0 & 0& 0 &\multirow{5}{*}{--} \\
  2510. %      Air           & 2 & 0 & 0 &0 & 0 & 0& 0   & 0 & 0 &0 & 0&0  & 0   & 0 & 0 &0& 0 &0 &\\
  2511. %     compressor     & 3 & 1 & 1 &1 & 1 & 0& 0   & 1 & 1 &0 & 0&0  & 0   & 1 & 0 &0& 0 &0 & \\
  2512. %                    & 4 & 0 & 0 &0 & 0 & 1& 2   & 0 & 0 &1 & 1&0  & 0   & 0 & 1 &0& 0 &0 &\\
  2513. %                    & 5 & 0 & 0 &0 & 0 & 0& 0   & 0 & 0 &0 & 0&1  & 2   & 0 & 0 &1& 1 &1 &\\
  2514. %     \hline
  2515. %                    & 1 & 1 & 1 &1& 1 & 0 & 0   & 1 & 1 &1& 1 & 0 & 0   & 1 & 1 &1& 1 & 0 &\multirow{5}{*}{--} \\
  2516. %       Check        & 2 & 0 & 2 &0& 0 & 1 & 1   & 0 & 0 &0& 0 & 1& 1    & 0 & 0 &0& 0 & 1 &\\
  2517. %      valve         & 3 & 0 & 0 &0& 0 & 0 & 0   & 0 & 0 &0& 0 & 0 & 1   & 0 & 0 &0& 0 & 0 & \\
  2518. %                    & 4 & 0 & 0 &0& 0 & 0 & 0   & 0 & 0 &0& 0 & 0& 0    & 0 & 0 &0& 0 & 0 &\\
  2519. %                    & 5 & 1 & 0 &1& 1 & 1 & 1   & 1 & 1 &1& 1 & 1 & 1   & 1 & 1 &1& 1 & 1 &\\
  2520. %     \hline
  2521. %                    & 1 & 0 & 0 &1& 1 & 1& 1   & 0 & 3 &0& 0 & 1& 1    & 0 & 1 &0& 1 & 1 &\multirow{5}{*}{--}\\
  2522. %       Control      & 2 & 0 & 0 &1& 0 & 0 & 1   & 0 & 0 &0& 2 & 0& 1    & 0 & 2 &0& 1 & 0 &\\
  2523. %   valve            & 3 & 1 & 1 &1& 2 & 2& 0    & 1 & 1 &1& 1 & 2& 0    & 1 & 0 &1& 0 & 2 &\\
  2524. %                    & 4 & 1 & 0 &0& 0 & 0& 0    & 1 & 0 &0& 0 & 0& 0    & 1 & 0 &0& 1 & 0 &\\
  2525. %                    & 5 & 0 & 0 &0& 0 & 0& 1    & 0 & 0 &1& 0 & 0& 1    & 0 & 0 &1& 0 & 0 &\\
  2526. %                    & 6 & 0 & 0 &0& 0 & 0& 0    & 0 & 0 &0& 0 & 0& 0    & 0 & 0 &0& 0 & 0 &\\
  2527. %\hline
  2528. %                    & 1  & 1  & 0 &0& 0 & 0& 0  & 0 & 0 &0& 0 & 0& 0    & 0 & 0 &0& 0 & 0&\multirow{5}{*}{--}\\
  2529. % Main               & 2  & 0  & 1 &0& 0 & 0& 0  & 1 & 1 &0& 0 & 0& 0    & 1 & 1 &0& 0 & 0&\\
  2530. % air reservoir      & 3  & 0  & 0 &1& 0 & 0& 0  & 0 & 0 &1& 0 & 0& 0    & 0 & 0 &1& 0 & 0&\\
  2531. %                    & 4  & 0  & 0 &0& 1 & 0& 0  & 0 & 0 &0& 1 & 1& 0    & 0 & 0 &0& 1 & 0&\\
  2532. %                    & 5  & 0  & 0 &0& 0 & 1& 1  & 0 & 0 &0& 0 & 0& 2    & 0 & 0 &0& 0 & 1&\\
  2533. % \hline
  2534. %                    & 1  & 0  & 0 &0& 0 & 0& 0     & 0 & 0 &0& 0 & 0& 0    & 0 & 0 &0& 0 & 0&\multirow{5}{*}{--}\\
  2535. % Boosting           & 2  & 1  & 0 &0& 0 & 0& 0    & 1 & 0 &0& 0 & 0& 0    & 1 & 0 &0& 0 & 0&\\
  2536. % cylinder           & 3  & 0  & 1 &1& 0 & 0& 0    & 0 & 1 &1& 0 & 0& 0    & 0 & 1 &1& 0 & 0&\\
  2537. %                    & 4  & 0  & 0 &0& 1 & 1& 0    & 0 & 0 &0& 1 & 1& 0    & 0 & 0 &0& 1 & 1&\\
  2538. %                    & 5  & 0  & 0 &0& 0 & 0& 1    & 0 & 0 &0& 0 & 0& 1    & 0 & 0 &0& 0 & 0&\\
  2539. %\hline
  2540. %\multicolumn{2}{|c||}{Design cost (k\$)} & 64& 72.5 &76&\!\! 83.5 &\!\! 96.5&\!\! 125.5    & 69.5 & 72.5 &84& 91.5 & 99& 169.5    & 69.5& \!\! 80.5\!\! &91\!\!& 98.5\!\! & 103.5\!\!&--\\
  2541. %\hline
  2542. %\end{tabular}
  2543. %\end{center}
  2544. %\end{sidewaystable}
  2545.  
  2546. \section{Computational Study}
  2547. In this section we present numerical experiments of our model, as well as a case study. The computational study consists of six parts: (i) visualizing the results of dimension reduction and clustering; (ii) testing how the design changes when parameters vary, including $K$ and other hyperparameters; (iii) choosing the best parameter $K$ by cross validation; (iv) experimenting on the value of side information; (v) comparing our design with a baseline probabilistic model; and (vi) a real-life case of maintenance in the high speed train industry. The distribution used in experiments (i)--(v) is generated from a real data set reported in (Wang~et~al.~2019).
  2548. All computational experiments were run on a PC with an Intel(R) Core(TM) i7-7500U CPU at 2.7~GHz, coded in Python. The MIP models were solved by the Gurobi solver, version 8.1.1.
  2549.  
  2550. \subsection{Visualizing clusters}
  2551. In this section we present a visualization of dimension reduction and clustering. We first apply the t-SNE algorithm, a state-of-the-art algorithm for dimension reduction and visualization, \iffalse (refer to Maaten L, Hinton G. Visualizing data using t-SNE[J]. Journal of machine learning research, 2008, 9(Nov): 2579-2605.)\fi to reduce the lifetime data from 28 dimensions to 2 dimensions, and then perform K-means clustering. We present figures for different choices of the number of clusters, $K=2$ and $K=5$, respectively. As we can see in the figure, the original multi-dimensional data can be well clustered after dimension reduction. The clustering results, including results from other choices of $K$, are used in the following experiments.
  2552.  
  2553. \begin{figure}[H]
  2554. \centering
  2555. %\includegraphics{TSNE_1.png}
  2556. \includegraphics[width=\columnwidth]{2D_tsne.jpg}
  2557. \caption{\footnotesize Visualization of dimension reduction by the t-SNE algorithm and clustering by the K-means algorithm, into 2 clusters on the left and 5 clusters on the right. Note that the two figures are produced from different data.}
  2558. \label{figure_TSNE_1}
  2559. \end{figure}
  2560.  
  2561. \subsection{System design variation with different parameters}
  2562.  
  2563. \begin{figure}[H]
  2564. \centering
  2565. \includegraphics[scale=0.8]{KVARIATION11.pdf}
  2566. \caption{ The multi-type series-parallel system we experiment with in this section. It consists of 3 subsystems, with 1 type of active-parallel components and 2 types of cold-standby components in each subsystem. Each type of component can have up to 3 redundancies.}
  2567. \label{figure1}
  2568. \end{figure}
  2569.  
  2570. In this subsection we experiment on adjusting the parameters of the model and observe how the system design $x$ changes accordingly. We consider the following setting: a series-parallel system with 3 subsystems ($|\mathbf{N}|=3$), with each subsystem containing 3 types of components ($|\mathbf{M}_i|= 3, \forall i \in \mathbf{N}$), among which 1 type is active-parallel and 2 types are cold-standby, and each type has up to 3 redundant components ($|\mathbf{T}_{ij}|=3, \forall i \in \mathbf{N}, j\in \mathbf{M}_i$). The lifetime requirement is $\mathcal{T}_S = 29$, and $\epsilon_{ij} = 0.05,\forall i \in [3], j \in [3]$. The parameters we adjust include $K \in \{1, 3, 5, 8, 10\}$, $R_0 \in \{0.95, 0.97, 0.99\}$, and the radius $\mathcal{R}$ that controls $\underline{\hmu}$ and $\overline{\hmu}$:
  2571. $$
  2572. \underline{\mu}^{k}_{ij}={\nu}^{k}_{ij}-\mathcal{R},
  2573. \quad \overline{\mu}^{k}_{ij}={\nu}^{k}_{ij}+\mathcal{R} ,
  2574. $$
  2575. where $\mathcal{R}\in \{0.025, 0.05, 0.075, 0.1\}$.  Therefore, by changing the value of $\mathcal{R}$, we can obtain different sets of parameters $\underline{\hmu}$ and $\overline{\hmu}$. Specifically, larger values of $\mathcal{R}$ correspond to larger gaps between $\underline{\hmu}$ and $\overline{\hmu}$.
  2576.  
  2577. \begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%\begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%
  2578. \caption{ \footnotesize  The design table for different $K$ under $\mathcal{T}_S = 29$}
  2579. \begin{center}
  2580. \begin{adjustbox}{angle=270}
  2581. \scalebox{1}{
  2582. \begin{tabular}{|c|c|c| c|| ccccc|| ccccc|| ccccc||}\hline
  2583. \multirow{3}{*}{$\epsilon$} & \multirow{3}{*}{$\mathcal{R}$} & \multirow{3}{*}{Subsystem} & \multirow{3}{*}{Type} & \multicolumn{5}{c||}{$R_0 = 0.95$} & \multicolumn{5}{c||}{$R_0 = 0.97$} & \multicolumn{5}{c||}{$R_0 = 0.99$}\\
  2584. \cline{5-19}
  2585.  &&  &  & \multicolumn{5}{c||}{$ K$} & \multicolumn{5}{c||}{$K$} & \multicolumn{5}{c||}{$ K$}\\
  2586.      &&&&1 & 3 & 5 &  8 &  10 &1 & 3 & 5 &  8 &  10 & 1 & 3 & 5 &  8 &  10  \\
  2587.     \hline
  2588.                    &&& AP & 1 & 1 & 1 &1 & 1 & 0 &0 & 0 & 0 & 1 & 1 & 1 & 1 &1 & 1\\
  2589.        &&1         & CS-I & 1 & 1 & 1 &1 & 1 & 2 & 2 & 2 & 2& 1 & 2 & 2 & 2 &2 & 2\\
  2590.                    &&& CS-II & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0\\
  2591.     \cline{3-19}
  2592.                    &&& AP & 1 & 1 & 1 &1 & 1 & 1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1 & 1\\
  2593.        &0.025&2    & CS-I & 0& 0 & 0 & 0 & 1 & 0 & 0 & 0 &0& 0& 0 & 0 & 0 &0 & 0\\
  2594.                    &&& CS-II & 2 & 2 & 2 &2 & 2 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  2595.     \cline{3-19}
  2596.                    &&& AP &0 & 0 & 1& 1 & 1 & 0 &1 & 1 &1 & 1 & 1 &0 &0 &0 & 0\\
  2597.        &&3         & CS-I & 1 & 1& 0& 0 & 0 & 1 & 0 & 0& 0 & 1 & 1 & 1 & 1 &1 & 1\\
  2598.                    &&& CS-II & 3 & 3 & 3 &3 & 3&  3  & 3 & 3& 3 & 3& 3 & 3 & 3 &3 & 3\\
  2599. \cline{2-19}
  2600. \multicolumn{1}{|c|}{} & \multicolumn{3}{c||}{Design cost (k\$)} & 39.5& 39.5&\!\! 39    &39& 39 & 44.5&44&44&\!\!44 & 41 &55 & 54.5 &\!\! 54.5 &54.5& 54.5 \\
  2601.     \cline{2-19}
  2602.                    &&& AP& 1 & 1 & 1 &1 & 1 & 0 & 0 & 0 &0 & 0 & 1& 1 & 1 & 1 &1\\
  2603.        &&1         & CS-I& 1 & 1 & 1 &1 & 1 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2 \\
  2604.                    &&& CS-II & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0& 0\\
  2605.     \cline{3-19}
  2606.                    &&& AP & 1 & 1 & 1 &1 & 1 & 1 & 1 & 1 &1 & 1 & 1 & 1 & 1 &1 & 1 \\
  2607.        &0.05&2     & CS-I &1 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0\\
  2608.                    &&& CS-II & 2 & 2 & 2 &2 & 2 & 2 & 2 &2 &2 &2& 2 & 2 & 2 &2 &2\\
  2609.     \cline{3-19}
  2610.                    &&& AP & 1 & 1 & 1 &1 & 0 & 1& 1 & 0& 0 & 0 & 1 &1 & 1 &1 & 1\\
  2611.        &&3         & CS-I & 3 & 3 & 3 &3 & 1 & 3 & 3 & 1& 1 & 1 &  3 & 3 & 3 & 3 & 3\\
  2612.                    &&& CS-II & 1 & 1 & 1 &1 & 3 & 1 & 1& 3& 3 & 3 & 1 & 1 & 1 &1 & 1\\
  2613. \cline{2-19}
  2614. \multicolumn{1}{|c|}{0.05} & \multicolumn{3}{c||}{Design cost (k\$)} & 40&40 &\!\! 40  &40& 39.5 & 45&45&\!\! 44.5 & 44.5 & 44.5 & 55&55&\!\! 55  & 55 & 55\\
  2615.     \cline{2-19}
  2616.                     &&& AP& 1& 1 & 1 & 1 &1 & 0 & 0 & 0 &0 & 0 & 1& 1 & 1 & 1 &1\\
  2617.        &&1           &CS-I2 & 1& 1 & 1 & 1 &1 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  2618.                    &&& CS-II & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0 \\
  2619.     \cline{3-19}
  2620.                    &&& AP & 1& 1 & 1 & 1 &1 & 1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1  \\
  2621.        &0.075&2    & CS-I & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0 \\
  2622.                    &&& CS-II & 2 & 2 & 2 &2 & 2 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2 \\
  2623.     \cline{3-19}
  2624.                    &&& AP & 1& 1 & 1 & 1 &1  & 1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1\\
  2625.        &&3           & CS-I & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 &3 & 3& 3 & 3 & 3 &3 & 3\\
  2626.                    &&& CS-II & 1& 1 & 1 & 1 &1  & 1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1\\
  2627. \cline{2-19}
  2628. \multicolumn{1}{|c|}{} & \multicolumn{3}{c||}{Design cost (k\$)} & 40&40&\!\! 40 & 40 & 40 & 45&45&\!\! 45& 45&45 & 55&55 &\!\! 55    & 55& 55\\
  2629.     \cline{2-19}
  2630.                    &&& AP& 1& 1 & 1 & 1 &1 & 0 & 0 & 0 &0 & 0 & 1& 1 & 1 & 1 &1\\
  2631.        &&1         & CS-I & 1& 1 & 1 & 1 &1 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  2632.                    &&&CS-II3 & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0\\
  2633.     \cline{3-19}
  2634.                    &&& AP & 1& 1 & 1 & 1 &1  &  1 &1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1 \\
  2635.        &0.1&2          & CS-I & 0 & 0 & 0 &0 & 0 & 0 & 0 & 0 &0 & 0& 0 & 0 & 0 &0 & 0 \\
  2636.                    &&& CS-II & 2 & 2 & 2 &2 & 2 & 2 & 2 & 2 &2 & 2& 2 & 2 & 2 &2 & 2\\
  2637.     \cline{3-19}
  2638.                    &&& AP &1 & 1& 1 & 1 & 1 & 1  & 1 & 1 & 1 &1 & 1& 1 & 1 & 1 &1 \\
  2639.        &&3         & CS-I & 3 & 3 & 3 &3 & 3 & 3 & 3 & 3 &3 & 3& 3 & 3 & 3 &3 & 3\\
  2640.                    &&& CS-II& 1& 1 & 1 & 1 &1  & 1 & 1 & 1 & 1 &1 & 1& 1 & 1 & 1 & 1 \\
  2641. \cline{2-19}
  2642. \multicolumn{1}{|c|}{} & \multicolumn{3}{c||}{Design cost (k\$)} & 40&\!\! 40   & 40&40 & 40 & 45&45&\!\! 45 &45& 45 & 55&55&\!\! 55 & 55 & 55 \\
  2643. \hline
  2644. \end{tabular}}
  2645. \end{adjustbox}
  2646. \end{center}
  2647. \end{table}
  2648.  
  2649. %The resulting system design in the case of $\epsilon_{ij} = 0.05$ are shown in the table. We leave the rest of experiment results in the Electronic Companion.
  2650.  
  2651. Note that when $K = 1$, the model reduces to the robust model without clustering of Wang et al.
  2652.  
  2653. The observations from the experimental results are the following:
  2654. (i) the cost increases when the variation range $[\underline{\hmu}, \overline{\hmu}]$ of expected lifetimes widens, or when the dispersion parameter $\epsilon$ of the lifetimes of the components enlarges. Such increased cost is due to the enlarged ambiguity set $\mathbb{F}_K$ resulting from the change of the above distributional parameters $[\underline{\hmu}, \overline{\hmu}]$ and $\epsilon$. (ii) Intuitively, the cost also increases if the required reliability level $R_0$ is increased.
  2655. (iii) The cost decreases as the number of clusters $K$ increases. This is due to the fact that, since the mean and dispersion information of each cluster are included in the ambiguity set, more clusters imply more information and therefore a smaller ambiguity set.
  2656.  
  2657. We then perform out-of-sample tests on the designs for $K = 1, 5, 10$. To test the robustness of the designs, we generate testing data with smaller lifetime means and larger standard deviations. In particular, we let $\hmu_{test} = (1 - \Delta_{M})\hmu_{train}$, and $\hsigma_{test} = (1 + \Delta_{S})\hsigma_{train}$. $\Delta_M$ is set to 8\%, and $\Delta_S$ is set to 10\%, 20\% and 30\%, respectively.
  2658.  
  2659. \begin{table}[htp]\footnotesize%\small%
  2660. \caption{\label{tab-compare2} \footnotesize The out-of-sample reliability of the designs generated by the $K = 1, 5, 10$ models under different $\Delta_{S}$. $R_0$ is set to $0.95$. In the design columns, the three columns correspond to the different subsystems. The three numbers in each column indicate the number of redundancies used that are active-parallel, cold-standby type I, or cold-standby type II, respectively. }
  2661. \begin{center}
  2662. \begin{tabular}{|c||c|c|| c|c|c| c |  c |c   |}\hline
  2663. \multirow{2}{*}{$\mathcal{T}_S$}  & \multirow{2}{*}{$(\Delta_{M}, \Delta_{S})$} & \multirow{2}{*}{Model} & \multicolumn{3}{c|}{Design} & \multirow{2}{*}{cost} & \multirow{2}{*}{Mean of out-of-sample reliability level} & \multirow{2}{*}{StD} \\
  2664. \cline{4-6}
  2665. &&& AP & CS-I & CS-II &&&\\
  2666. \hline
  2667. \multirow{9}{*}{29} & \multirow{3}{*}{(8\%, 10\%)} &  K=1 Model & (0,2,0) & (1,3,0) & (0,1,3)& 45.5 &0.976 & 0.152   \\
  2668.     & & K=5 Model & (0,2,0) & (1,2,0) & (1,2,2)& 45.0 &0.959 & 0.198 \\
  2669.     & & K=10 Model & (1,1,0) & (1,3,0) & (1,1,3)& 42.0 &0.607 & 0.489  \\
  2670.    \cline{2-9}
  2671.     &\multirow{3}{*}{(8\%, 20\%)}  &  K=1 Model & (0,2,0) & (1,3,0) & (0,1,3)& 45.5  &  0.972 & 0.167   \\
  2672.     & & K=5 Model &  (0,2,0) & (1,2,0) & (1,2,2)& 45.0&0.953 & 0.211 \\
  2673.     & & K=10 Model &  (1,1,0) & (1,3,0) & (1,1,3)& 42.0  &0.620 & 0.485  \\
  2674.    \cline{2-9}
  2675.     &\multirow{3}{*}{(8\%, 30\%)}  &  K=1 Model & (0,2,0) & (1,3,0) & (0,1,3)& 45.5 &  0.964 & 0.186   \\
  2676.     & & K=5 Model & (0,2,0) & (1,2,0) & (1,2,2)& 45.0 &0.947 & 0.223 \\
  2677.     & & K=10 Model &  (1,1,0) & (1,3,0) & (1,1,3)& 42.0 &0.614 & 0.487  \\
  2678. \hline
  2679.  
  2680. \end{tabular}
  2681. \end{center}
  2682. \end{table}
  2683.  
  2684. \begin{figure}[H]
  2685. \centering
  2686. \includegraphics[width=\columnwidth]{Out_of_sample_K.png}
  2687. \caption{\footnotesize Figures (a), (b) and (c) show the out-of-sample reliability of the designs generated by the $K = 1, 5, 10$ models under different $\Delta_{S}$, respectively. The vertical beam represents $\mathcal{T}_S$. The fraction of the lifetime histogram on the right side of the beam represents the out-of-sample reliability level.}
  2688. \label{figureK}
  2689. \end{figure}
  2690.  
  2691. From the results, we can observe that with a moderate number of clusters ($K = 5$), we can obtain designs with lower costs than the designs generated by the robust model with no clustering ($K = 1$), while retaining robustness even when there is a significant shrinkage in mean lifetime and a much larger standard deviation. This shows that, by incorporating clustering, our framework can produce designs that are both sufficiently robust and cheaper. However, if the number of clusters becomes too large ($K = 10$), the out-of-sample reliability drops significantly. A possible reason is that when $K$ is too high, the model has to split natural clusters into smaller ones, which means that it learns unnecessary information and overfits. Thus, it is crucial to choose the optimal $K$. We present an experiment on choosing $K$ by cross validation in the next subsection.
  2692.  
  2693. \subsection{Choosing $K$ by cross validation}
  2694. In this subsection, we present a cross-validation experiment by applying Algorithm 2. In particular, we choose $m = 10$ and perform a 10-fold cross validation. The number of instances of constraint violation, as well as the costs of the designs, are plotted in the figure below on the left.  \iffalse We also offers a combined metric for cross validation. First, the cost and number of validation are both normalized to the range $[0, 1]$. Then, compute $(1-\lambda)cost(K) + \lambda{{\#}violation(K)}$, where $\lambda \in [0,1]$. By assigning different $\lambda$, we can adapt to scenarios in of different cost-violation tradeoffs. In particular, high $\lambda$ means that robustness of the design is more valued than the cost; low $\lambda$ indicates the contrary. \fi The combined metric under different $\lambda$ is plotted in the figure below on the right. Observe that when $\lambda$ is low, large values of $K$ such as 7 and 9 are preferred; when $\lambda$ is high, a moderate $K$ such as 5 is better. Since large $K$ generally corresponds to lower cost, this result matches the intuition that one is willing to pay a higher cost when robustness is more valued.
  2695.  %least constraint violation occurs, so $K = 5$ is the ideal parameter to cluster this data set. $K = 5$ will be used in the following subsections.
  2696. \iffalse
  2697. \begin{figure}[H]
  2698. \begin{subfigure}{0.5\textwidth}
  2699. \includegraphics[scale=0.65]{cross_validation.png}
  2700. \caption{\footnotesize}
  2701. \label{figure4-1}
  2702. \end{subfigure}
  2703. \begin{subfigure}{0.5\textwidth}
  2704. \centering
  2705. \includegraphics[scale=0.37]{cross_validation_lambda.png}
  2706. \caption{\footnotesize }
  2707. \label{figure4-2}
  2708. \end{subfigure}
  2709. \caption{\footnotesize (a) The number of violations and costs with different $K$. (b) Costs penalized by $\lambda$ with different $K$, with each line associated with a different $\lambda$.}
  2710. \end{figure}
  2711. \fi
  2712.  
  2713. \begin{figure}[H]
  2714. \centering
  2715. \includegraphics[width=\columnwidth]{cv.jpg}
  2716. \caption{\footnotesize (a) The number of violations and costs with different $K$. (b) Costs penalized by $\lambda$ with different $K$, with each line associated with a different $\lambda$.}
  2717. \label{figure4-2}
  2718. \end{figure}
  2719.  
  2720.  
  2721.  
  2722.  
  2723. \iffalse
  2724. \begin{table}[h*]\footnotesize%\small%
  2725. \caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.85$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  2726. \begin{center}
  2727. \begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  2728.    $L_S$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  2729.     \hline
  2730.    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.834 & 0.372   \\
  2731.     && C-DRO-Model & (1,1,3) &  &0.997 & 0.053  \\
  2732.        \cline{2-7}
  2733.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.724 & 0.447   \\
  2734.     && C-DRO-Model & (1,1,3) &  &0.982 & 0.132  \\
  2735.        \cline{2-7}
  2736.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.673 & 0.469   \\
  2737.     && C-DRO-Model & (1,1,3) &  &0.971 & 0.168  \\
  2738. \hline
  2739.    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.781 & 0.414   \\
  2740.     && C-DRO-Model & (1,1,4) &  &0.999 & 0.028  \\
  2741.                 \cline{2-7}
  2742.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.690 & 0.462   \\
  2743.     && C-DRO-Model & (1,1,4) &  &0.991 & 0.095  \\
  2744.                  \cline{2-7}
  2745.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.637 & 0.481   \\
  2746.     && C-DRO-Model & (1,1,4) &  &0.989 & 0.105  \\
  2747. \hline
  2748.    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.716 & 0.451   \\
  2749.     && C-DRO-Model & (1,1,7) &  &1.000 & 0.000  \\
  2750.               \cline{2-7}
  2751.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.640 & 0.480   \\
  2752.     && C-DRO-Model & (1,1,7) &  &0.998 & 0.040  \\
  2753.               \cline{2-7}
  2754.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.85$}& 0.602 & 0.490   \\
  2755.     && C-DRO-Model & (1,1,7) &  &0.999 & 0.035  \\
  2756. \hline
  2757. \end{tabular}
  2758. \end{center}
  2759. \end{table}
  2760.  
  2761. \begin{table}[h*]\footnotesize%\small%
  2762. \caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.90$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  2763. \begin{center}
  2764. \begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  2765.    $L_S$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  2766.     \hline
  2767.    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.618 & 0.486   \\
  2768.     && C-DRO-Model & (1,1,3) &  &0.998 & 0.047  \\
  2769.        \cline{2-7}
  2770.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.576 & 0.494   \\
  2771.     && C-DRO-Model & (1,1,3) &  &0.989 & 0.103   \\
  2772.        \cline{2-7}
  2773.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.560 & 0.496   \\
  2774.     && C-DRO-Model & (1,1,3) &  &0.988 & 0.111   \\
  2775. \hline
  2776.    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.528 & 0.499   \\
  2777.     && C-DRO-Model & (1,1,4) &  &0.999 & 0.037   \\
  2778.                 \cline{2-7}
  2779.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.538 & 0.499   \\
  2780.     && C-DRO-Model & (1,1,4) &  &0.996 & 0.063   \\
  2781.                  \cline{2-7}
  2782.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.530 & 0.499   \\
  2783.     && C-DRO-Model & (1,1,4) &  &0.994 & 0.080   \\
  2784. \hline
  2785.    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.458 & 0.498   \\
  2786.     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060  \\
  2787.               \cline{2-7}
  2788.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.481 & 0.500   \\
  2789.     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060   \\
  2790.               \cline{2-7}
  2791.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.496 & 0.500   \\
  2792.     && C-DRO-Model & (1,1,7) &  &0.995 & 0.069  \\
  2793. \hline
  2794. \end{tabular}
  2795. \end{center}
  2796. \end{table}
  2797.  
  2798. \begin{table}[h*]\footnotesize%\small%
  2799. \caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.90$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  2800. \begin{center}
  2801. \begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  2802.    $L_S$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  2803.     \hline
  2804.    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.143 & 0.350   \\
  2805.     && C-DRO-Model & (1,1,3) &  &0.653 & 0.476  \\
  2806.        \cline{2-7}
  2807.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.576 & 0.494   \\
  2808.     && C-DRO-Model & (1,1,3) &  &0.989 & 0.103   \\
  2809.        \cline{2-7}
  2810.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.560 & 0.496   \\
  2811.     && C-DRO-Model & (1,1,3) &  &0.988 & 0.111   \\
  2812. \hline
  2813.    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.528 & 0.499   \\
  2814.     && C-DRO-Model & (1,1,4) &  &0.999 & 0.037   \\
  2815.                 \cline{2-7}
  2816.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.538 & 0.499   \\
  2817.     && C-DRO-Model & (1,1,4) &  &0.996 & 0.063   \\
  2818.                  \cline{2-7}
  2819.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.530 & 0.499   \\
  2820.     && C-DRO-Model & (1,1,4) &  &0.994 & 0.080   \\
  2821. \hline
  2822.    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.458 & 0.498   \\
  2823.     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060  \\
  2824.               \cline{2-7}
  2825.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.481 & 0.500   \\
  2826.     && C-DRO-Model & (1,1,7) &  &0.996 & 0.060   \\
  2827.               \cline{2-7}
  2828.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.496 & 0.500   \\
  2829.     && C-DRO-Model & (1,1,7) &  &0.995 & 0.069  \\
  2830. \hline
  2831. \end{tabular}
  2832. \end{center}
  2833. \end{table}
  2834.  
  2835. \begin{table}[h*]\footnotesize%\small%
  2836. \caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.90$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  2837. \begin{center}
  2838. \begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  2839.    $L_S$ & Out of sample $\sigma$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  2840.     \hline
  2841.    \multirow{6}{*}{7.625} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.348 & 0.476   \\
  2842.     && C-DRO-Model & (1,1,3) &  &0.930 & 0.256  \\
  2843.        \cline{2-7}
  2844.     & \multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.427 & 0.495   \\
  2845.     && C-DRO-Model & (1,1,3) &  &0.948 & 0.221  \\
  2846.        \cline{2-7}
  2847.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.445 & 0.497   \\
  2848.     && C-DRO-Model & (1,1,3) &  &0.957 & 0.203   \\
  2849. \hline
  2850.    \multirow{6}{*}{7.75} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.278 & 0.448   \\
  2851.     && C-DRO-Model & (1,1,4) &  &0.933 & 0.250   \\
  2852.                 \cline{2-7}
  2853.     &\multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.381 & 0.486   \\
  2854.     && C-DRO-Model & (1,1,4) &  &0.974 & 0.159   \\
  2855.                 \cline{2-7}
  2856.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.409 & 0.492   \\
  2857.     && C-DRO-Model & (1,1,4) &  &0.979 & 0.143   \\
  2858. \hline
  2859.    \multirow{6}{*}{7.875} & \multirow{2}{*}{2}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.214 & 0.410   \\
  2860.     && C-DRO-Model & (1,1,7) &  &0.909 & 0.287  \\
  2861.               \cline{2-7}
  2862.     & \multirow{2}{*}{6}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.345 & 0.475   \\
  2863.     && C-DRO-Model & (1,1,7) &  &0.973 & 0.163  \\
  2864.               \cline{2-7}
  2865.     &\multirow{2}{*}{10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.90$}& 0.383 & 0.486   \\
  2866.     && C-DRO-Model & (1,1,7) &  &0.985 & 0.122   \\
  2867. \hline
  2868. \end{tabular}
  2869. \end{center}
  2870. \end{table}
  2871. \fi
  2872.  
  2873. \iffalse
  2874. From the experiment result, we can observe that despite a smaller ambiguity set, our design ($x^{(2)}$) can achieve robustness level that is comparable to the design without clustering $x^{(1)}$, and are far better than the baseline probabilistic model ($x^{(3)}$).
  2875. \fi
  2876.  
  2877.  
  2878. \subsection{Value of side information}
In this subsection we experiment with clustering according to side information. The system we are studying is the same as the one in Section 5.2. We choose $K = 5$, corresponding to $\lambda = 0.8$ in the cross validation section. When generating samples from distributions, we also obtain the side information of which of the 5 distributions the sample is drawn from. We then cluster the data set into $K = 5$ clusters, based solely on the side information, and compute the parameters $(\hnu, [\underline{\hmu}, \overline{\hmu}], \bm{\hsigma}, \p)$ of the model from it. We obtain the design ($x^{(2)}$) from this model, and compare it with the design ($x^{(1)}$) obtained from the model in which $K$ is also 5, but the data are clustered directly based on lifetime information instead of the side information.
  2880.  
\begin{table}[H]\scriptsize%\footnotesize%\small%
\caption{ \footnotesize  The design table for $K = 5$ model with and without side information (S.I)}\label{d-table}
  2883. \begin{center}
  2884. \begin{tabular}{|c|c| c|| ccccc || ccccc || ccccc |}\hline
  2885. \multirow{3}{*}{S.I} & \multirow{3}{*}{Subsystem} & \multirow{3}{*}{Type} & \multicolumn{5}{c||}{$R_0=0.95$ }& \multicolumn{5}{c||}{$R_0=0.97$}& \multicolumn{5}{c|}{$R_0=0.99$} \\
  2886.  \cline{4-18}
&&&\multicolumn{5}{c||}{$\mathcal{T}_S $ (yrs) }& \multicolumn{5}{c||}{$\mathcal{T}_S $ (yrs)}& \multicolumn{5}{c|}{$\mathcal{T}_S $ (yrs)}\\
  2888.  
  2889.      &&&28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30  \\
  2890.     \cline{1-18}
  2891.                    && AP & 0 & 0 &0 & 0 & 0  & 0 & 0 &1 & 1 & 1    & 1 & 1 & 0 & 1 & 1 \\                       &
  2892.        1            & CS-l & 2 & 2 &2 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 3 & 3 & 3 & 3 & 3 \\
  2893.                    &&CS-ll & 3 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  2894.     \cline{2-18}
  2895.                    && AP  & 2 & 2 &2 & 1 & 3  & 2& 1 &2 & 2 & 2    & 1 & 1 & 1 & 1 & 1 \\ With  &
  2896.        2            & CS-l & 3 & 3 &0 & 3 & 0  &0 & 2 &3 & 0 & 0   & 3 & 3 & 3& 3& 3 \\ S.I &
  2897.                     & CS-ll & 0 & 0 &2 & 0 & 3  & 3 & 3 &0 & 3 & 3    & 3 & 3& 3 & 3 & 3 \\
  2898.     \cline{2-18}
  2899.                    && AP & 1 & 1 &1 & 1 & 1    & 1 & 0 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\                       &
  2900.        3            & CS-l & 0 & 0 &1 & 1 & 1   & 2 & 3 &1 & 1 & 1    & 3 & 3 & 3 & 3 & 3 \\
  2901.                    && CS-ll & 3 & 3 &3 & 3 & 3   & 3 & 3 &3 & 3 & 3    & 3 & 3 & 3 & 3 & 3 \\
  2902. \hline
  2903. \multicolumn{3}{|c||}{Design cost (k\$)} & 45.5& 45.5 &47&\!\! 46.5 &\!\! 50    & 51 & 54.5 &57.5& 59 & 59   & 81.5 & \!\! 81.5\!\! &81.5\!\!& 81.5\!\! &81.5 \\
  2904. \hline
  2905. \multicolumn{3}{|c||}{Cost saved (k\$)} & 1 & 1 &1.5&\!\! 4 &\!\! 4.5    & 3.5 & 4.5 &2& 0.5 & 2    & 0 & \!\! 0\!\! &25\!\!& 15\!\! &15 \\
  2906. \cline{1-18}
  2907.     \cline{2-18}
  2908.                    && AP & 0 & 0 &0 & 0 & 0  & 0 & 1&1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\                          &
  2909.        1            & CS-l & 2 & 2 &2 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 3 & 3 & 3 & 3 & 3 \\
  2910.                    && CS-ll & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 1 & 1 & 1 \\
  2911.     \cline{2-18}
  2912.                    && AP & 1 & 1 &3 & 3 & 1 & 1 & 2 &2 & 2 & 2    & 1 & 1 & 1 & 1 & 1 \\ Without &
  2913.        2            & CS-l & 3 & 3 &3 & 3 & 3  & 3 & 0 &3 & 3 & 0    & 3 & 3& 3 & 3 & 3 \\ S.I &
  2914.                     & CS-ll & 0 & 0 &0 & 0 & 3  & 3 & 3 &0 & 0 & 3    & 3 & 3 & 3 & 3 & 3 \\
  2915.     \cline{2-18}
  2916.                    && AP  & 1 & 1 &1 & 1 & 1  & 1 & 1 &1  & 1    & 1  & 1& 1 & 1 & 1 & 1 \\                          &
  2917.       3             & CS-l & 1 & 1 &1 & 1 & 2  & 2 & 1 &2  & 2    & 2  & 3& 3 & 2 & 2 & 2 \\
  2918.                    && CS-ll& 3 & 3 &3 & 3   &3  & 3 & 3 &3 & 3    & 3  & 3& 3 & 3 & 3 & 3 \\
  2919. \hline
  2920. \multicolumn{3}{|c||}{Design cost (k\$)} & 46.5& 46.5 &48.5&\!\!50.5 &\!\! 54.5 &54.5     & 59 & 59.5 &59.5& 61 & 81.5    & 81.5\!\! & \!\! 96.5\!\! &96.5\!\!& 96.5\\
  2921. \hline
  2922. \end{tabular}
  2923. \end{center}
  2924. \end{table}
  2925.  
  2926.  
  2927.  
  2928. \begin{table}[htp]\footnotesize%\small%
\caption{\label{tab-compare3} \footnotesize The out-of-sample results of designs obtained at $\mathcal{T}_S  = 29$. $R_0$ is set to $0.95$. The 3 design columns correspond to the different subsystems. The three numbers in each column indicate the numbers of redundancies used that are active parallel, cold-standby type I, and cold-standby type II, respectively. }
  2930. \begin{center}
  2931. \begin{tabular}{|c|c|c||c|c|c|| c|c|c|}\hline
  2932. \multirow{2}{*}{($\Delta_{M}, \Delta_{S}$)} &\multirow{2}{*}{Model}  &  Reliability & \multirow{2}{*}{($\Delta_{M}, \Delta_{S}$)} &\multirow{2}{*}{Model}  &  Reliability& \multirow{2}{*}{($\Delta_{M}, \Delta_{S}$)} &\multirow{2}{*}{Model}  &  Reliability \\
  2933. && level &&& level &&& level \\
  2934. \hline
  2935.  
  2936.       \multirow{2}{*}{(5\%, 10\%)}&  With S.I   &0.9999& \multirow{2}{*}{(8\%, 10\%)}&  With S.I& 0.994& \multirow{2}{*}{(10\%, 10\%)} &  With S.I &  0.9626   \\
  2937.      & Without S.I &1.0& & Without S.I &  0.9989&& Without S.I &0.9877  \\
  2938.    \hline
  2939.     \multirow{2}{*}{(5\%, 15\%)}  & With S.I& 0.9999 &\multirow{2}{*}{(8\%, 15\%)}  &  With S.I  & 0.9939&\multirow{2}{*}{(10\%, 15\%)}  &  With S.I & 0.9557  \\
  2940.     & Without S.I & 1.0& & Without S.I & 0.9985  & & Without S.I & 0.9883\\
  2941.    \hline
  2942.      \multirow{2}{*}{(5\%, 20\%)}& With S.I  & 1.0 &\multirow{2}{*}{(8\%, 20\%)}  &  With S.I & 0.994  &\multirow{2}{*}{(10\%, 20\%)}  &  With S.I & 0.9588 \\
  2943.      & Without S.I &1.0 & & Without S.I &  0.9986& & Without S.I & 0.9853\\
  2944.    \hline
  2945.      \multirow{2}{*}{(5\%, 30\%)}  &  With S.I& 0.9996 &\multirow{2}{*}{(8\%, 30\%)}  &  With S.I & 0.9919  &\multirow{2}{*}{(10\%, 30\%)}  &  With S.I &  0.9519 \\
  2946.     & Without S.I  &1.0  & & Without S.I  &0.9972 & & Without S.I & 0.9814 \\
  2947.  
  2948.  
  2949. \hline
  2950. \end{tabular}
  2951.  
  2952. \end{center}
  2953. \end{table}
  2954.  
  2955.  
  2956.  
  2957.  
  2958.  
  2959.  
  2960. \iffalse
  2961. \begin{tabular}{|c|c||c|c|| c|c||c| c||}\hline
  2962. \multirow{2}{*}{$L_S$} & \multirow{2}{*}{($\Delta_{M}, \Delta_{S}$)} &\multirow{2}{*}{Model} & \multicolumn{3}{c|}{Design} & \multirow{2}{*}{cost}  & \multirow{2}{*}{Mean of out-of-sample reliability level} & \multirow{2}{*}{StD} \\
  2963. \cline{4-6}
  2964. &&& AP & CS-I & CS-II &&&\\
  2965. \hline
  2966. \multirow{25}{*}{29} & \multirow{2}{*}{(5\%, 10\%)} &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49 &0.9999 & 0.01   \\
  2967.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51  &1.0 & 0.0 \\
  2968.    \cline{2-9}
  2969.     &\multirow{2}{*}{(5\%, 15\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49 & 0.9999 & 0.01   \\
  2970.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51    &1.0 & 0.0 \\
  2971.    \cline{2-9}
  2972.     &\multirow{2}{*}{(5\%, 20\%)}  & With S.I &(0,2,0) & (2,3,0) & (0,2,3)& 49 & 1.0 & 0.0   \\
  2973.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51    &1.0 & 0.0 \\
  2974.    \cline{2-9}
  2975.     &\multirow{2}{*}{(5\%, 30\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49 & 0.9996 & 0.02   \\
  2976.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51   &1.0 & 0.0 \\
  2977.    \clineB{2-9}{2}
  2978. & \multirow{2}{*}{(8\%, 10\%)} &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49  &0.994 & 0.0772   \\
  2979.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51    &0.9997 & 0.0173 \\
  2980.    \cline{2-9}
  2981.     &\multirow{2}{*}{(8\%, 15\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49  & 0.9939 & 0.0779   \\
  2982.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51  &0.9998 & 0.0141 \\
  2983.    \cline{2-9}
  2984.     &\multirow{2}{*}{(8\%, 20\%)}  &  With S.I &(0,2,0) & (2,3,0) & (0,2,3)& 49  & 0.994 & 0.0772   \\
  2985.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51   &0.9998 & 0.0264 \\
  2986.    \cline{2-9}
  2987.     &\multirow{2}{*}{(8\%, 30\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49 & 0.9919 & 0.0869   \\
  2988.     & & Without S.I &(0,2,0) & (2,3,0) & (0,3,3)& 51   &0.9988 & 0.0346 \\
  2989.     \cline{2-9}
  2990.     \clineB{2-9}{2}
  2991. & \multirow{2}{*}{(10\%, 10\%)} &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49  &0.9633 & 0.1880   \\
  2992.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51    &0.9953 & 0.0683 \\
  2993.    \cline{2-9}
  2994.     &\multirow{2}{*}{(10\%, 15\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49  & 0.9563 & 0.2044   \\
  2995.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51    &0.9963& 0.0607 \\
  2996.    \cline{2-9}
  2997.     &\multirow{2}{*}{(10\%, 20\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49  & 0.9589 & 0.1985   \\
  2998.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51   &0.9942 & 0.0759 \\
  2999.    \cline{2-9}
  3000.     &\multirow{2}{*}{(10\%, 30\%)}  &  With S.I & (0,2,0) & (2,3,0) & (0,2,3)& 49  & 0.9523 & 0.2131   \\
  3001.     & & Without S.I & (0,2,0) & (2,3,0) & (0,3,3)& 51    &0.991 & 0.0944 \\
  3002. \hline
  3003. \end{tabular}
  3004.  
  3005. \end{center}
  3006. \end{table}
  3007. \fi
The results in Table~\ref{d-table} show that when side information is incorporated, we can achieve a design with a much lower cost. We then choose $\mathcal{T}_S  = 29$ and perform out-of-sample tests in a similar way as in the previous experiments. We can observe that, even with these significant cost savings, the design obtained by clustering on side information still performs well under significant $\Delta_M$ and $\Delta_S$.
  3009. \begin{figure}[H]
  3010. \centering
  3011. \includegraphics[scale=0.2]{Out_of_sample_side.png}
  3012. \caption{\footnotesize The out-of-sample test result of designs obtained at $\mathcal{T}_S = 29$, with and without S.I.  The vertical beam represents $\mathcal{T}_S $. The fraction of the lifetime histogram on the right side of beam represents the out-of-sample reliability level.}
  3013. \label{figure4}
  3014. \end{figure}
  3015.  
  3016.  
  3017.  
  3018.  
  3019. \iffalse
  3020. \begin{table}[H]\footnotesize%\small%
  3021. \caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.95$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  3022. \begin{center}
  3023. \begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  3024.    $L_S$ & $\Delta_{m}, \Delta_{s}$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  3025.     \hline
  3026.    \multirow{12}{*}{7.5} & \multirow{2}{*}{1, 5}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.952 & 0.214   \\
  3027.     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  3028.        \cline{2-7}
  3029.     &\multirow{2}{*}{1, 10}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.886 & 0.318   \\
  3030.     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  3031.        \cline{2-7}
  3032.     &\multirow{2}{*}{1, 20}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.800 & 0.400   \\
  3033.     && C-DRO-Model & (1,2,8) &  &1.000 & 0.000  \\
  3034. \hline
  3035. \end{tabular}
  3036. \end{center}
  3037. \end{table}
  3038.  
  3039. \begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%
  3040. \caption{ \footnotesize  The design table for $K = 10$ model with side information}
  3041. \begin{center}
  3042. \begin{tabular}{|c| c|| ccccc || ccccc || ccccc |}\hline
  3043. \multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{5}{c||}{$R_0=0.95$,~$L_S$ (yrs) }& \multicolumn{5}{c||}{$R_0=0.97$,~$L_S$ (yrs)}& \multicolumn{5}{c|}{$R_0=0.99$,~$L_S$ (yrs)} \\
  3044. \cline{3-17}
  3045.      &&28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30  \\
  3046.     \hline
  3047.                    & 1 & 1 & 1 &1 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 1 & 1 & 1 & 1 & 1 \\
  3048.        1           & 2 & 1 & 1 &1 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 2 & 2 & 2 & 2 & 2 \\
  3049.                    & 3 & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  3050.     \hline
  3051.                    & 1 & 1 & 1 &1 & 1 & 1  & 1 & 1 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\
  3052.        2           & 2 & 1 & 0 &0 & 0 & 0  & 1 & 0 &0 & 0 & 3    & 1 & 0 & 0 & 0 & 3 \\
  3053.                    & 3 & 1 & 2 &2 & 2 & 2  & 1 & 2 &2 & 2 & 0    & 1 & 2 & 2 & 2 & 0 \\
  3054.     \hline
  3055.                    & 1 & 1 & 0 &1 & 1 & 0  & 1 & 1 &0 & 1 & 1    & 0 & 0 & 0 & 1 & 1 \\
  3056.       3            & 2 & 0 & 1 &3 & 0 & 1  & 0 & 0 &1 & 3 & 2    & 1 & 1 & 1 & 3 & 2 \\
  3057.                    & 3 & 3 & 3 &1 & 3 & 3  & 3 & 3 &3 & 1 & 2    & 3 & 3 & 3 & 1 & 2 \\
  3058. \hline
  3059. \multicolumn{2}{|c||}{Design cost (k\$)} & 38.5& 39.5 &40&\!\! 44 &\!\! 44.5    & 43.5 & 44 &44.5& 45 & 46    & 54 & \!\! 54.5\!\! &54.5\!\!& 55\!\! &56\\
  3060. \hline
  3061. \end{tabular}
  3062. \end{center}
  3063. \end{table}
  3064.  
  3065.  
  3066. \begin{table}[!htbp]\label{d-table}\scriptsize%\footnotesize%\small%
  3067. \caption{ \footnotesize  The design table for $K = 10$ model without side information}
  3068. \begin{center}
  3069. \begin{tabular}{|c| c|| ccccc || ccccc || ccccc |}\hline
  3070. \multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{5}{c||}{$R_0=0.95$,~$L_S$ (yrs) }& \multicolumn{5}{c||}{$R_0=0.97$,~$L_S$ (yrs)}& \multicolumn{5}{c|}{$R_0=0.99$,~$L_S$ (yrs)} \\
  3071. \cline{4-18}
  3072.      &&28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30 &28 & 28.5 & 29 &  29.5 & 30  \\
  3073.     \hline
  3074.                    & 1 & 1 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 1 & 1 & 1 & 1 & 1 \\
  3075.        1           & 2 & 1 & 2 &2 & 2 & 2  & 2 & 2 &2 & 2 & 2    & 2 & 2 & 2 & 2 & 2 \\
  3076.                    & 3 & 0 & 0 &0 & 0 & 0  & 0 & 0 &0 & 0 & 0    & 0 & 0 & 0 & 0 & 0 \\
  3077.     \hline
  3078.                    & 1 & 1 & 1 &1 & 1 & 1  & 1 & 1 &1 & 1 & 1    & 1 & 1 & 1 & 1 & 1 \\
  3079.        2           & 2 & 1 & 2 &0 & 0 & 3  & 1 & 0 &0 & 2 & 2    & 1 & 0 & 0 & 0 & 3 \\
  3080.                    & 3 & 1 & 0 &2 & 2 & 0  & 1 & 2 &2 & 1 & 1    & 1 & 2 & 2 & 2 & 0 \\
  3081.     \hline
  3082.                    & 1 & 1 & 1 &1 & 1 & 1  & 0 & 1 &1 & 1 & 0    & 0 & 1 & 1 & 1 & 1 \\
  3083.       3            & 2 & 3 & 3 &0 & 0 & 3  & 1 & 3 &3 & 3 & 2    & 1 & 3 & 3 & 3 & 2 \\
  3084.                    & 3 & 1 & 1 &3 & 3 & 1  & 3 & 1 &1 & 1 & 3    & 3 & 1 & 1 & 1 & 2 \\
  3085. \hline
  3086. \multicolumn{2}{|c||}{Design cost (k\$)} & 39.5& 44 &44&\!\! 44 &\!\! 45.5    & 44 & 45 &45& 46 & 47.5    & 54 & \!\! 55\!\! &55\!\!& 55\!\! &56\\
  3087. \hline
  3088. \end{tabular}
  3089. \end{center}
  3090. \end{table}
  3091. \fi
  3092. \iffalse
  3093. Data creating process finised
  3094. Academic license - for non-commercial use only
  3095. cost is 38.5003857478587 , L_S is 26, R_0 is 0.95
  3096. (((1.0, -0.0, -0.0), (1.0, -0.0, -0.0), (7.563683503866358e-06, 7.563683503866358e-06, 7.563683503866358e-06)), ((-0.0, -0.0, 1.0), (-0.0, 1.0, -0.0), (1.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, -0.0, 1.0), (1.0, 1.0, -0.0)))
  3097. cost is 43.0 , L_S is 26, R_0 is 0.97
  3098. (((-0.0, -0.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, 1.0, -0.0), (1.0, -0.0, 1.0)))
  3099. cost is 53.0 , L_S is 26, R_0 is 0.99
  3100. (((-0.0, 1.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, 1.0, -0.0), (-0.0, 1.0, 1.0)))
  3101. cost is 44.0 , L_S is 27, R_0 is 0.95
  3102. (((-0.0, -0.0, -0.0), (1.0, -0.0, 1.0), (0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (1.0, -0.0, -0.0), (1.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, -0.0, 1.0), (1.0, 1.0, 1.0)))
  3103. cost is 44.0 , L_S is 27, R_0 is 0.97
  3104. (((-0.0, -0.0, -0.0), (1.0, 1.0, -0.0), (0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, 1.0, -0.0), (0.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, 1.0, -0.0), (1.0, 1.0, 1.0)))
  3105. cost is 54.0 , L_S is 27, R_0 is 0.99
  3106. (((-0.0, 1.0, -0.0), (-0.0, 1.0, 1.0), (0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (1.0, -0.0, -0.0), (-0.0, -0.0, 1.0)), ((-0.0, -0.0, -0.0), (-0.0, 1.0, -0.0), (1.0, 1.0, 1.0)))
  3107. cost is 44.0 , L_S is 28, R_0 is 0.95
  3108. (((-0.0, -0.0, -0.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, -0.0, 1.0), (1.0, 1.0, 1.0)))
  3109. cost is 45.00017123897305 , L_S is 28, R_0 is 0.97
  3110. (((5.707965768311441e-06, 5.707965768311441e-06, 5.707965768311441e-06), (1.0, 1.0, -0.0), (0.0, 0.0, -0.0)), ((0.0, 1.0, -0.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 0.0), (-0.0, 1.0, -0.0), (1.0, 1.0, 1.0)))
  3111. cost is 54.0 , L_S is 28, R_0 is 0.99
  3112. (((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0), (-0.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (-0.0, -0.0, 1.0), (1.0, 1.0, 1.0)))
  3113. cost is 45.0 , L_S is 29, R_0 is 0.95
  3114. (((-0.0, -0.0, -0.0), (1.0, -0.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, -0.0, -0.0), (1.0, -0.0, 1.0)), ((-0.0, 1.0, -0.0), (1.0, 1.0, 1.0), (-0.0, -0.0, 1.0)))
  3115. cost is 54.5 , L_S is 29, R_0 is 0.97
  3116. (((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((1.0, -0.0, -0.0), (-0.0, -0.0, -0.0), (1.0, 1.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, -0.0, -0.0), (1.0, 1.0, 1.0)))
  3117. cost is 55.0 , L_S is 29, R_0 is 0.99
  3118. (((-0.0, -0.0, 1.0), (1.0, 1.0, -0.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (-0.0, -0.0, -0.0), (1.0, -0.0, 1.0)), ((-0.0, 1.0, -0.0), (1.0, 1.0, 1.0), (-0.0, 1.0, -0.0)))
  3119. cost is 55.0 , L_S is 30, R_0 is 0.95
  3120. (((-0.0, -0.0, 1.0), (1.0, 1.0, -0.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, -0.0), (1.0, -0.0, -0.0), (1.0, 1.0, 1.0)))
  3121. cost is 55.5 , L_S is 30, R_0 is 0.97
  3122. (((-0.0, -0.0, 1.0), (-0.0, 1.0, 1.0), (0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, 1.0, -0.0)))
  3123. cost is 56.0 , L_S is 30, R_0 is 0.99
  3124. (((1.0, -0.0, -0.0), (1.0, 1.0, -0.0), (-0.0, -0.0, -0.0)), ((-0.0, -0.0, 1.0), (1.0, 1.0, 1.0), (-0.0, -0.0, -0.0)), ((1.0, 0.0, -0.0), (1.0, 1.0, -0.0), (-0.0, 1.0, 1.0)))
  3125. \fi
  3126. \subsection{Comparison with a baseline probabilistic model}
  3127. \begin{figure}[H]
  3128. \centering
  3129. \includegraphics[scale=0.8]{KVARIATION55.pdf}
\caption{\footnotesize The series-parallel system we study in this section consists of a single type of component, with the active parallel strategy only. This simplicity is due to the limitations of the baseline probabilistic model.}
  3131. \label{figure1}
  3132. \end{figure}
To illustrate the performance of our robust reliability model, we compare the design ($x^{(1)}$) obtained from the proposed robust redundancy optimization model with the design ($x^{(2)}$) obtained from a probabilistic redundancy optimization model. We choose $K = 5$, corresponding to $\lambda = 0.8$ in the previous subsection. As mentioned in the Introduction and Literature Review, when the situation involves multiple types ({\it i.e.,} $|\mathbf{M}_i|>1$), or both the cold-standby and active parallel redundant subsystems are considered, the probabilistic model generally becomes intractable. Therefore, for a fair comparison, we consider a series-parallel system with $|\mathbf{N}| = 3$ and $|\mathbf{M}_i|=1, \forall i \in [3]$, which preserves a linear MIP formulation for the probabilistic model. For a coherent exposition of the experimental study, we place the details of the probabilistic redundancy model as well as its MIP transformation in the Electronic Companion.
  3134. \iffalse
In particular, we first randomly generate lifetime samples (size=2500) and then compute the probability levels $\P[\tilde{z}_{i}\le L_S ], \forall i \in [3]$ and the parameters $(\hnu, \underline{\hmu}, \overline{\hmu}, \bm{\hsigma}, \p)$ from the generated lifetime samples for parameter inputs of probabilistic and robust models, respectively, where $R_0=0.95$ and $L_S=7.5$. We obtain the designs by solving the perspective redundancy models. \fi We perform out-of-sample experiments in a similar way as the out-of-sample test performed in Section 5.2. We increase the out-of-sample standard deviation ($\Delta_S = 10\%, 15\%, 20\%$, respectively), and shrink the lifetime mean ($\Delta_M = 5\%, 8\%, 10\%$) at the same time. The out-of-sample system lifetimes are compared and plotted in Figure~\ref{figure_out_of_sample_test_def}, and the comparison of out-of-sample reliability levels is provided in Table~\ref{tab-compare2}.
  3136. \begin{table}[H]\footnotesize%\small%
  3137. \caption{\label{tab-compare2} \footnotesize  Out-of-sample reliability level comparison ($R_0=0.95$), where `Design' specifies the number of components allocated in each of 3 subsystems, and `P-Model' and `C-DRO-Model' refer to the probabilistic model and robust model, respectively. }
  3138. \begin{center}
  3139. \begin{tabular}{|c||c|c|| c| c |c |c   |}\hline
  3140.    $\mathcal{T}_S $ & $(\Delta_{M}, \Delta_{S})$ &Model & Design &   Designed reliability level &Mean of out-of-sample reliability level & StD \\
  3141.     \hline
  3142.    \multirow{19}{*}{7} & \multirow{2}{*}{(5\%, 10\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.914 & 0.280   \\
  3143.     && C-DRO-Model & (1,1,2) &  &0.993 & 0.084  \\
  3144.        \cline{2-7}
  3145.     &\multirow{2}{*}{(5\%, 15\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.913 & 0.282   \\
  3146.     && C-DRO-Model & (1,1,2) &  &0.990 & 0.097  \\
  3147.        \cline{2-7}
  3148.     &\multirow{2}{*}{(5\%, 20\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.908 & 0.289   \\
  3149.     && C-DRO-Model & (1,1,2) &  &0.994 & 0.075  \\
  3150.  
  3151.  
  3152.  
  3153.     \clineB{2-7}{2}
  3154.  
  3155.     & \multirow{2}{*}{(8\%, 10\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.835 & 0.371   \\
  3156.     && C-DRO-Model & (1,1,2) &  &0.971 & 0.167  \\
  3157.        \cline{2-7}
  3158.     &\multirow{2}{*}{(8\%, 15\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.830 & 0.37   \\
  3159.     && C-DRO-Model & (1,1,2) &  &0.968 & 0.177  \\
  3160.        \cline{2-7}
  3161.     &\multirow{2}{*}{(8\%, 20\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.825 & 0.380   \\
  3162.     && C-DRO-Model & (1,1,2) &  &0.968 & 0.177  \\
  3163.  
  3164.     \clineB{2-7}{2}
  3165.     & \multirow{2}{*}{(10\%, 10\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.745 & 0.436   \\
  3166.     && C-DRO-Model & (1,1,2) &  &0.932 & 0.252  \\
  3167.        \cline{2-7}
  3168.     &\multirow{2}{*}{(10\%, 15\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.754 & 0.431   \\
  3169.     && C-DRO-Model & (1,1,2) &  &0.930 & 0.256  \\
  3170.        \cline{2-7}
  3171.     &\multirow{2}{*}{(10\%, 20\%)}& P-Model & (1,1,1) & \multirow{2}{*}{$R_0=0.95$}& 0.742 & 0.438   \\
  3172.     && C-DRO-Model & (1,1,2) &  &0.946 & 0.227  \\
  3173. \hline
  3174. \end{tabular}
  3175. \end{center}
  3176. \end{table}
  3177.  
  3178. \begin{figure}[H]
  3179. \centering
  3180. \includegraphics[width=\columnwidth]{Out_of_sample_def.png}
  3181. \caption{\footnotesize The out-of-sample system lifetime scenarios of robust model with clustering and probabilistic model under different $\Delta_{M}$ and $\Delta_{S}$. The vertical beam represents $\mathcal{T}_S $. The fraction of the lifetime histogram on the right side of beam represents the out-of-sample reliability level.}
  3182. \label{figure_out_of_sample_test_def}
  3183. \end{figure}
  3184.  
We can observe that the out-of-sample reliability of the design from the baseline model becomes unsatisfactory, while the design from the robust model can still keep the reliability level intact. The robust model outperforms the baseline model more as $\Delta_M$ and $\Delta_S$ increase. This illustrates that our model outperforms the baseline model in that it is significantly more robust, especially under extreme circumstances. In addition, as mentioned before, our model is tractable for multi-type mixed strategy systems, while the probabilistic model becomes intractable. Thus our model is superior in both robustness and computational tractability.
  3186.  
  3187. \subsection{Case study}
  3188.  
In this section we present a case study of the braking system on a high speed train, to provide an example of applying our model in practice. The braking system is a critical subsystem on the train, as the failure of the system can cost hundreds of lives. Therefore, it is a common practice to install multiple redundant braking mechanisms on the train, either cold-standby or active-parallel. In this study we model the problem as a system consisting of a single subsystem, with 3 types of braking components:  MTB, pneumatically brake stop cock, and emergency brake valve. MTB components work in active-parallel, while the other two are cold-standby redundancies. Up to 2 MTBs, 8 pneumatically brake stop cocks, and 2 emergency brake valves may be installed.
  3190.  
  3191. \begin{figure}[H]
  3192. \centering
  3193. \includegraphics[scale=0.6]{case-description-edited.pdf}
\caption{ The braking system in this case. It consists of 3 types of components: pneumatically brake stop cock, MTB, and emergency brake valve. MTB components work in active-parallel, while the other two are cold-standby redundancies. Only two redundant components are shown in the figure for each type, for simplicity.}
  3195. \label{figure1}
  3196. \end{figure}
  3197.  
  3198. A subset of the dataset is presented below.
  3199.  
\begin{table}[H]%\scriptsize%\footnotesize%\small%
\caption{ \footnotesize  A sample of the dataset.}\label{without side information-table}
  3202. \begin{center}
  3203. \begin{tabular}{|c|c| c|c|}\hline
  3204. Type & {Speed (km/s)} & {Emergency brake valve status} & {Mileage at failure(kms)} \\
  3205. \hline
  3206. \multirow{5}{*} {Pneumatically brake stop cock} & 0 & 1 & 1205489.233 \\
  3207. &3.191489362 &0&1210654.084\\
  3208. &68.08510638&0&1212826.491\\
  3209. &100&1&1214834.413\\
  3210. &\vdots&\vdots&\vdots\\
  3211. \hline
  3212. \multirow{5}{*} {MTB} & 0 & 1 & 1209897.965\\
  3213. & 25.53191489 & 0 & 1213133.429\\
  3214. & 3.191489362   &1 & 1213997.540\\
  3215. & 58.5106383 & 0 & 1215082.013\\
  3216. &\vdots&\vdots&\vdots\\
  3217.  
  3218. \hline
  3219. \multirow{5}{*} {Emergency brake valve} & 3.191489362 & 0 & 1210654.084\\
  3220. & 4.255319149 & 1 & 1212022.818\\
  3221. & 68.08510638   &0 & 1212826.491\\
  3222. & 35.10638298 & 0 & 1217498.174\\
  3223. &\vdots&\vdots&\vdots\\
  3224. \hline
  3225.  
  3226. \end{tabular}
  3227. \end{center}
  3228. \end{table}
  3229.  
In this case, lifetime is measured by the distance the train has run while the component is in service, instead of time, since the failure rate is more closely related to the former. We can compute the lifetime by subtracting the mileage at which the component was installed from the mileage at which the component fails.
  3231.  
The speed and the binary internal status of the emergency brake valve at the point of failure are suspected to be correlated with the failure rate. High speed may overload a braking system that has already accumulated much attrition. The internal status, although it does not directly tell us whether the component fails, might contain useful information about the braking system as a whole at the point of failure, like the flight recorder (``black box'') on aeroplanes. Therefore, we incorporated them as side information in the clustering process.
  3233.  
First, we apply the cross validation algorithm to search for an ideal $K$. We choose $\lambda = 0.8$, since robustness is critical for high speed trains. The result is shown in the figure below.
  3235.  
  3236. \begin{figure}[H]
  3237. \centering
  3238. \includegraphics[width=\columnwidth]{real_case_cv.jpg}
  3239. \caption{\footnotesize (a) The number of violations and costs with different $K$. (b) Costs penalized by $\lambda$ with different $K$, with each line associated with a different $\lambda$.}
  3240. \label{figure4-2}
  3241. \end{figure}
  3242.  
  3243.  
The results show that $K = 3, 4, 5$ are ideal. We subsequently obtain the design in the table below with $K=5$, using side information in the clustering process. In particular, we divide the speed into 3 cases: stationary (under 20 km/s), low speed (20-50 km/s) and high speed (over 50 km/s). The valve openness is a Boolean value and is naturally divided into 2 cases. Therefore, we can divide the dataset into 6 clusters based on speed and valve status. However, in our dataset there is no instance in the class of low speed and valve status 1. This leaves us with 5 clusters ($K = 5$). We then collect distributional information from each of the clusters, and obtain the design.
  3245.  
  3246. \iffalse
  3247. \begin{table}[H]\label{without side information-table}\scriptsize%\footnotesize%\small%
  3248. \caption{ \footnotesize  The design table for optimal clusters $K = 5$ of real case model. Cock, MTB and Emergency represents the pneumatically brake stop cocks, MTB and emergency brake valves, respectively.}
  3249. \begin{center}
  3250. \begin{tabular}{|c|c| c|| ccc || ccc || ccc || ccc |}\hline
  3251. \multirow{2}{*}{$\epsilon$} & \multirow{2}{*}{Subsystem} & \multirow{2}{*}{Type} & \multicolumn{3}{c||}{$R_0=0.95$,~$L_S$ (kms) }& \multicolumn{3}{c||}{$R_0=0.97$,~$L_S$ (kms)}& \multicolumn{3}{c||}{$R_0=0.98$,~$L_S$ (kms)}& \multicolumn{3}{c|}{$R_0=0.99$,~$L_S$ (kms)} \\
  3252. \cline{4-15}
  3253.      &&&2000 & 2250 & 2500 &  2000 & 2250 &2500 & 2000 & 2250 &  2500 & 2000 & 2250 &  2500  \\
  3254.     \cline{1-15}
  3255.                  && Cock & 5 & 6 &7 & 6 & 6  & 7 & 6 &7& 7 & 6 &7& 8 \\
  3256. $\epsilon=0.5$      & 1 & MTB & 1 & 1 &1 & 0 & 1  & 1 & 0 &0 & 1& 1 &1 & 1 \\
  3257.                 &&Emergency  & 2 & 2 &2 & 2 & 2 &2  & 2 & 2 &2  & 1 & 1 &1  \\
  3258.  
  3259. \hline
  3260. \multicolumn{3}{|c||}{Design cost (k\$)} & 27& 31 &35&\!\! 28 &\!\! 31    &35 & 28 &32& 35 & 29    & 33 & \!\! 37 \\
  3261. \hline
  3262. \end{tabular}
  3263. \end{center}
  3264. \end{table}
  3265. \fi
  3266.  
  3267.  
  3268. \begin{table}[H]\label{new-table}\scriptsize%\footnotesize%\small%
  3269. \caption{ \footnotesize  The design table for the $K = 5$ real case model with side information (S.I.)}
  3270. \begin{center}
  3271. \begin{tabular}{|c|c| c|| ccc || ccc || ccc || ccc |}\hline
  3272. \multirow{3}{*}{With S.I,$\epsilon$} & \multirow{3}{*}{Subsystem} & \multirow{3}{*}{Type} &\multicolumn{3}{c||}{$R_0=0.95$ }& \multicolumn{3}{c||}{$R_0=0.97$}& \multicolumn{3}{c||}{$R_0=0.98$}& \multicolumn{3}{c|}{$R_0=0.99$} \\
  3273. \cline{4-15}
  3274. &&&\multicolumn{3}{c||}{~$\mathcal{T}_S $ (kms) }& \multicolumn{3}{c||}{~$\mathcal{T}_S $ (kms)}& \multicolumn{3}{c||}{~$\mathcal{T}_S $ (kms)}& \multicolumn{3}{c|}{~$\mathcal{T}_S $ (kms)}\\
  3275.  
  3276.      &&&2000 & 2250 & 2500 &  2000 & 2250 &2500 & 2000 & 2250 &  2500 & 2000 & 2250 &  2500  \\
  3277.     \cline{1-15}
  3278.                  && Cock & 5 & 6 &7 & 5 & 6  & 7 & 6 &6& 7 & 6 &6& 7 \\
  3279.        $\epsilon=0.025$& 1           & MTB& 1 & 1 &1 & 1 & 1  & 1 & 0 &1 & 1& 1 &1 & 1  \\
  3280.                  &&Emergency & 2 & 2 &2 & 2 & 2 &2  & 2 & 2 &2  & 1 & 2 &2  \\
  3281.  
  3282. \hline
  3283. \multicolumn{3}{|c||}{Design cost (k\$)} & 27& 31 &35&\!\! 27 &\!\! 31    &35 & 28 &31& 35 & 29    & 31 & \!\! 35 \\
  3284.     \cline{1-15}
  3285.                 && Cock& 5 & 6 &7 & 6 & 6  & 7 & 6 &7& 7 & 6 &7& 8  \\
  3286.         $\epsilon=0.05$& 1           & MTB & 1 & 1 &1 & 0 & 1  & 1 & 0 &1 & 1& 1 &1 & 0 \\
  3287.                 &&Emergency & 2 & 2 &2 & 2 & 2 &2  & 2 & 2 &2  & 1 & 2 &2  \\
  3288. \hline
  3289. \multicolumn{3}{|c||}{Design cost (k\$)} & 27& 31 &35&\!\! 28 &\!\! 31    &35 & 28 &32& 35 & 29    & 33 & \!\! 36 \\
  3290.  
  3291.     \cline{1-15}
  3292.                  && Cock& 6 & 6 &7 & 6 & 7 & 7 & 6 &7& 8 & 7 &7& 8 \\
  3293.        $\epsilon=0.075$& 1           & MTB  & 0 & 1 &1 & 1 & 0  & 1 & 1 &1 & 0& 0 &1 & 1 \\
  3294.                 &&Emergency & 2 & 2 &2 & 1 & 2 &2  & 1 & 1 &2  & 1 & 1 &1  \\
  3295.  
  3296. \hline
  3297. \multicolumn{3}{|c||}{Design cost (k\$)} & 28& 31 &35&\!\! 29 &\!\! 32    &35 & 29 &33& 36 & 30    & 33 & \!\! 37 \\
  3298. \hline
  3299. \end{tabular}
  3300. \end{center}
  3301. \end{table}
  3302. We can observe that as the lifetime requirement $\mathcal{T}_S$, the level of robustness $R_0$, or the dispersion parameter $\epsilon$ increases, the cost of the design generally increases. This is due to either extra components being installed (for example, when $\mathcal{T}_S$ increases from 2000 to 2250 under $R_0 = 0.95$, $\epsilon = 0.05$, an extra cock is installed), or a cheaper but less robust component being replaced by a more expensive but more robust one (for example, when $R_0$ increases from 0.95 to 0.97 under $\mathcal{T}_S = 2000$, $\epsilon = 0.05$, the single MTB is replaced by a cock).
  3303.  
  3304. \section{Conclusions}{\color{red}
  3305. The redundancy allocation problem (RAP) is a critical problem in the field of reliability engineering. Distributionally robust optimization (DRO) models have been employed to achieve high robustness in RAP. We present a framework that combines clustering and dimension reduction in DRO models to achieve a smaller ambiguity set, which helps alleviate the problem that DRO models can sometimes be over-conservative. This framework can incorporate side information to improve the ambiguity set, or, in the absence of side information, use clustering to discover and utilize underlying structures of the data. We develop a mixed integer linear program (MILP) reformulation of the model without adding additional integer variables, thus keeping the model tractable. We also devise a supergradient-based algorithm to decompose the MILP formulation, so that the model can be solved by multiple computers in parallel, which makes the model practically viable even with a large number of clusters.
  3306. }
  3307.  
  3308. \begin{thebibliography}{}
  3309.  
  3310. \bibitem{Ardakan2014}Ardakan M. A., A. Z. Hamadani, M. Alinaghian. Optimizing bi-objective redundancy allocation problem with a mixed redundancy strategy. {\em ISA transactions}~{\bf 55} :116--128.
  3311.  
  3312. \bibitem{Bertsimas2011}Bertsimas D., M. Sim. 2004. The price of robustness. {\em Operations Research}~{\bf 52}~(1):35--53.
  3313.  
  3314. \bibitem{Bhunia2010} Bhunia, A. K., L. Sahoo,  D. Roy. 2010. Reliability stochastic optimization
  3315. for a series system with interval component reliability via genetic
  3316. algorithm.  {\em Appl.Math. Computat.}~{\bf 216}~(3): 929--939.
  3317.  
  3318.  
  3319. \bibitem{Chern1992}Chern, M.S. 1992. On the computational complexity of reliability redundancy allocation in a series system. {\em Operations research letters}~{\bf 11}~(5):309--315.
  3320.  
  3321. \bibitem{Cheng2009}Cheng, Z., X. Wang, C. Tian, F. Wang. 2009. Mission reliability simulation of High-speed EMU service braking system. {\em Proceedings of the 8th International Conference on Reliability}, Maintainability and Safety (ICRMS 2009), 253--256.
  3322.  
  3323. \bibitem{Coit1998}Coit, D. W., A.E. Smith. 1998. Redundancy allocation to maximize a lower percentile of the system time-to-failure distribution. {\em IEEE Trans. Rel.}~ {\bf 47}~(1):79--87.
  3324.  
  3325. \bibitem{Coit2003}Coit, D. W. 2003. Maximization of system reliability with a choice of redundancy strategies. {\em IIE Transactions}~{\bf 35}~(6):535-543.
  3326.  
  3327. \bibitem{Coit2004} Coit, D.W.,  T. Jin,  N. Wattanapongsakorn. 2004. System optimization
  3328. with component reliability estimation uncertainty: A multi-criteria approach. {\em IEEE Trans. Rel.}~ {\bf 53}~(3) :  369--380, 2004.
  3329.  
  3330.  
  3331.  
  3332.  
  3333. \bibitem{Elsayed2012}Elsayed, E A. 2012. {\em Reliability Engineering}. 2nd Edition. Wiley.
  3334.  
  3335.  
  3336. \bibitem{Govindan2017}Govindan, K., A. Jafarian, M.E. Azbari, T.M. Choi. 2017. Optimal bi-objective redundancy allocation for systems reliability and risk management. {\em IEEE Transactions on Cybernetics}~{\bf 46}~(8):1735--1748.
  3337.  
  3338.  
  3339.  
  3340. %\bibitem{Lam2012}Lam SW, T.S. Ng, and M. Sim. (2012). Multiple objectives satisficing under uncertainty. To appear in Operations Research, 2012.
  3341.  
  3342. %\bibitem{Lin2011}Lin J, Muthuraman K, Lawley M (2011) Optimal and approximate algorithms for sequential clinical scheduling with no-shows. {\it IIE Transactions on Healthcare Systems Engineering} 1:20--36.
  3343.  
  3344.  %\bibitem{McCarthy2000} McCarthy K, McGee HM, O'Boyle CA. 2000. Outpatient clinic waiting times and non-attendance as indicators of quality. {\it Psychology, Health and Medicine} 5: 287--293.
  3345.  
  3346. \bibitem{Grani2017} Hanasusanto, G. A., V. Roitch, D. Kuhn, W. Wiesemann. 2017. Ambiguous joint chance constraints under mean and dispersion information. {\it Operations Research}~{\bf 65}~(3):715--767.
  3347.  
  3348.  
  3349.  
  3350.  
  3351.  
  3352. \bibitem{Elegbede2003}Elegbede, A.C., C. Chu, K.H. Adjallah, F. Yalaoui. 2003. Reliability allocation through cost minimization. {\em IEEE Transactions on reliability}~{\bf 52}~(1):106--111.
  3353.  
  3354. \bibitem{Feizollahi2012} Feizollahi, M.J., M. Modarres. 2012. The robust deviation redundancy allocation problem with interval component reliabilities. {\em IEEE Transactions on reliability}~{\bf 61}~(4):957--965.
  3355.  
  3356.  
  3357.  
  3358.  
  3359.  
  3360. \bibitem{Feizollahi2014}Feizollahi, M.J., S. Ahmed, M. Modarres. 2014. The robust redundancy allocation problem in series-parallel systems with budgeted uncertainty.  {\em IEEE Transactions on reliability}~{\bf 63}~(1):239--250.
  3361.  
  3362. \bibitem{Feizollahi2015} Feizollahi, M.J., R. Soltan, H. Feyzollahi. 2015. The robust cold standby redundancy allocation in series-parallel systems with budgeted uncertainty. {\em IEEE Transactions on reliability}~{\bf 64}~(2):799--806.
  3363.  
  3364. \bibitem{Friedman2001} Friedman, J., T. Hastie, R. Tibshirani. 2001. {\em The elements of statistical learning.} Springer series in statistics. New York.
  3365.  
  3366. \bibitem{Hasegawa1999}Hasegawa, I., Uchida, S. 1999. Braking systems. {\em Japan Railway and Transport Review}~{\bf 20}:52-59.
  3367.  
  3368. \bibitem{James2013} James, G., D. Witten, T. Hastie, R. Tibshirani. 2013. {\em An introduction to statistical learning.} Springer series in statistics. New York.
  3369.  
  3370. \bibitem{Ketchen1996}Ketchen, Jr. D.J., C.L. Shook, 1996. The application of cluster analysis in Strategic Management Research: An analysis and critique. {\em Strategic Management Journal}~{\bf 17}~(6):441-458.
  3371.  
  3372. \bibitem{Kuo2001}Kuo, W., V.R. Prasad, F.A. Tillman, C.L. Hwang. 2001. {\em Optimal Reliability Design: Fundamentals and Applications.} Cambridge university press. Cambridge.
  3373.  
  3374. \bibitem{Kuo2007}Kuo, W., R. Wan. 2007. Recent advances in optimal reliability allocation. {\em IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans}~{\bf 37}~(2):143-156.
  3375.  
  3376. \bibitem{Li2014}Li, Y.F., Y. Ding, E. Zio. 2014. Random fuzzy extension of the universal generating function approach for the
  3377. reliability assessment of multi-state systems under aleatory and epistemic uncertainties. {\em IEEE Transactions on Reliability}~{\bf 63}~(1):13--25.
  3378.  
  3379. \bibitem{Li2011}Li, C.Y., X. Chen, X.S. Yi, J.Y. Tao. 2011. Interval-valued reliability analysis of multi-state systems. {\em IEEE Transactions on Reliability}~{\bf 60}~(1):323--330.
  3380.  
  3381. \bibitem{Li2008} Li, X.,  X. Hu. 2008. Some new stochastic comparisons for redundancy
  3382. allocations in series and parallel systems.~{\em  Statist. Probabil. Lett.}~{\bf 78}~(18): 3388--3394.
  3383.  
  3384.  
  3385.  
  3386. \bibitem{Liao2014}Liao, L., F. K\"{o}ttig. 2014. Review of hybrid prognostics approaches for remaining useful life prediction of engineered systems, and an application to battery life prediction. {\em IEEE Transactions on Reliability}. {\bf 63}~(1):191--207.
  3387.  
  3388.  
  3389.  
  3390.  
  3391. \bibitem{MacQueen1967} MacQueen, J. 1967. Some methods for classification and analysis of multivariate observations. {\em Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability} {\bf 1}~(Statistics):281--297.
  3392.  
  3393.  
  3394.  
  3395. \bibitem{Marseguerra2005} Marseguerra, M., E. Zio, L. Podofillini, D. W. Coit. 2005. Optimal design of reliable network systems in presence of uncertainty. {\em IEEE Trans Rel.}~{\bf 54}~(2):243--253.
  3396.  
  3397. \bibitem{Military1992}Military, U.S. 1992. Reliability prediction of electronic equipment. MIL-HDBK-217F Notice 1.
  3398.  
  3399. \bibitem{Ng2002} Ng, A. Y, M. I. Jordan, Y. Weiss. 2002. On spectral clustering: Analysis and an algorithm. {\em Advances in neural information processing systems}:849-856.
  3400.  
  3401. \bibitem{Ng2014} Ng, S. Y., Y. Xing, K. L. Tsui. 2014. A naive Bayes model for robust remaining useful life prediction of lithium-ion battery. {\em Applied Energy}~{\bf 118}: 114-123.
  3402.  
  3403.  
  3404.  
  3405.  
  3406. \bibitem{Prasad2001}Prasad, V. R., W. Kuo, K. O. Kim. 2001. Maximization of a percentile life of a series system through component redundancy allocation. {\em IIE Transactions}~{\bf 33}~(12):1071--1079.
  3407.  
  3408. \bibitem{Pecht2008} Pecht, M. 2008. {\em Prognostics and Health Management of Electronics.} John Wiley \& Sons, Ltd.
  3409.  
  3410. \bibitem{Quinlan1986}Quinlan, J. R. 1986. Induction of decision trees. {\em Machine Learning}~{\bf 1}: 81--106.
  3411.  
  3412.  
  3413. \bibitem{Shapiro2001}Shapiro, A. 2001. On duality theory of conic linear problems. In {\em Semi-Infinite Programming}, chapter 7, 135--165. Kluwer Academic Publishers.
  3414.  
  3415. \bibitem{Sibson1973}Sibson R. 1973. SLINK: an optimally efficient algorithm for the single-link cluster method. {\em The Computer Journal. British Computer Society}~{\bf 16}~(1):30-34.
  3416.  
  3417. \bibitem{Soltani2015}Soltani R., J. Safari, S.J. Sadjadi. 2015. Robust counterpart optimization for the redundancy allocation problem in series-parallel systems with component mixing under uncertainty. {\em Applied Mathematics \& Computation}~{\bf 271}~(C): 80--88.
  3418.  
  3419.  
  3420.  
  3421.  
  3422. \bibitem{Sun2017} Sun, M. X., Y. F. Li, E. Zio. 2017. On the optimal redundancy allocation for multi-state series-parallel systems under epistemic uncertainty. {\em Reliability Engineering \& System Safety}. Accepted.
  3423.  
  3424. \bibitem{Tang2014}Tang, S., C. Yu, X. Wang, X. Guo, X. Si. 2014. Remaining useful life prediction of lithium-ion batteries based on the wiener process with measurement error. {\em Energies}~{\bf 7}~(2):520--547.
  3425.  
  3426. \bibitem{Tekiner-Mogulkoc2011}Tekiner-Mogulkoc, H., D. W. Coit. 2011. System reliability optimization
  3427. considering uncertainty: Minimization of the coefficient of variation
  3428. for series-parallel systems.~{\em  IEEE Trans. Rel.}~{\bf 60}~(30): 667--674, 2011.
  3429.  
  3430. \bibitem{Thorndike1953}Thorndike R. L. 1953. Who belongs in the family? {\em Psychometrika}~{\bf 18}~(4):267-276.
  3431.  
  3432. \bibitem{Wang2012} Wang, Y., L. Li, S. Huang, Q. Chang. 2012. Reliability and covariance estimation of weighted k-out-of-n multi-state Systems. {\em European Journal of Operational Research}~{\bf 221}:~138--147.
  3433.    
  3434. {\color{red} \bibitem{Wang2019} Wang, S.,...}
  3435.  
  3436. \bibitem{Wisemann2014} Wiesemann, W.,  D. Kuhn, M. Sim. 2014. Distributionally robust convex optimization. {\it Operations Research}~{\bf 62} ~(6)~ 1358--1376.
  3437.  
  3438. \bibitem{xie2017} Xie, W., Ahmed, S. 2017. Distributionally robust chance constrained optimal power flow with renewables: A conic reformulation. {\em IEEE Transactions on Power Systems.} Accepted.
  3439.  
  3440.  
  3441. \bibitem{Yalaoui2005}Yalaoui, A., E.  Chatelet, C. Chu. 2005. A new dynamic programming method for reliability redundancy allocation in a parallel-series system. {\em IEEE transactions on reliability}.~{\bf 54}~(2):254--261.
  3442.  
  3443.  
  3444. \bibitem{Zaretalab2015}Zaretalab, A., V. Hajipour, M. Sharifi, M. R. Shahriari. 2015. A knowledge-based archive multi-objective simulated annealing algorithm to optimize series-parallel system with choice of redundancy strategies. {\em Computers \& Industrial Engineering}~{\bf 80}:33-44.
  3445.  
  3446. \bibitem{Zhao2003} Zhao, R., B. Liu. 2003. Stochastic programming models for general redundancy-optimization problems.~{\em IEEE Trans. Rel.}~{\bf 52}~(2): 181--191, 2003.
  3447.  
  3448. \bibitem{Zhao2011} Zhao, P., P.S. Chan, H.K.T. Ng. 2011. Optimal allocation of redundancies in series systems. {\em European Journal of Operational Research}~{\bf 220}~(3):673--683.
  3449.  
  3450. \bibitem{Kuhn2013}Zymler, S., D. Kuhn, B. Rustem. 2013. Distributionally robust joint chance constraints with second-order moment information, {\em Mathematical Programming}~{\bf 137}~(1-2):167--198.
  3451. \end{thebibliography}
  3452. \newpage
  3453. \section*{Appendix: A benchmark probabilistic reliability model}
  3454. As a benchmark for fair comparison, we consider the following probabilistic reliability model
  3455. \begin{eqnarray*}
  3456. \begin{array}{rcll}
  3457. & \min\limits_{\y} &  \sum\limits_{i\in \mathbf{N}}y_{i}c_{i} \\[0.3 cm]
  3458. & {\rm s.t.} &  \displaystyle \prod_{i \in \mathbf{N}}\left(1-r_{i}^{y_{i}}\right)\ge R_0  & \\[0.3 cm]
  3459. && L_{i}\le y_{i}\le U_{i}, & \forall i \in \mathbf{N}\\
  3460. && y_{i} \in \mathbb{Z}_+,  & \forall i \in \mathbf{N},
  3461. \end{array}
  3462. \end{eqnarray*}
  3463. where each subsystem is equipped with one type of components (i.e., $|\mathbf{M}_i|\equiv 1, \forall i \in \mathbf{N}$ ), and
  3464. $$
  3465. r_{i}=\P\Big[\tilde{z}_{i}\le \mathcal{T}_R \Big],
  3466. $$
  3467. which can be estimated from the data. By transforming the integer variable $y_{i}$ with binaries $x_{ik}$:
  3468. $$
  3469. y_{i}=L_{i}+\sum_{k=0}^{U_{i}-L_{i}}kx_{ik},~\mbox{with}~\sum_{k=0}^{U_{i}-L_{i}}x_{ik}=1,
  3470. $$
  3471. the above model can be linearized as the following MIP with binaries (Feizollahi and Modarres~2012):
  3472. \begin{eqnarray}
  3473. \begin{array}{rcll}
  3474. & \min\limits_{\x} & \displaystyle \sum\limits_{i\in \mathbf{N}} \left[L_{i}+\sum_{k=0}^{U_{i}-L_{i}}kx_{ik}\right]c_{i} \\[0.3 cm]
  3475. & {\rm s.t.} &  \displaystyle \sum_{i \in \mathbf{N}} \sum_{k=0}^{U_{i}-L_{i}}x_{ik}\ln\left[1-r_{i}^{L_{i}+k} \right]\ge \ln R_0  & \\[0.3 cm]
  3476. && \displaystyle \sum_{k=0}^{U_{i}-L_{i}}x_{ik}=1, & \forall i \in \mathbf{N}\\
  3477. && x_{ik} \in \{0,1\},  & \forall i \in \mathbf{N}, k \in [0; U_{i}-L_{i}],
  3478. \end{array}
  3479. \end{eqnarray}
  3480. which can be solved by off-the-shelf MIP solvers. Nevertheless, the above linear MIP transformation holds only for regular series-parallel redundant systems with a single type of components; when multiple types are involved (i.e., $|\mathbf{M}_i|>1$ for some $i \in \mathbf{N}$) or cold-standby subsystems are considered, the probabilistic model in general becomes intractable.
  3481.  
  3482. %\section*{Appendix II}
  3483. %\begin{proposition}\label{P-proposition1b}
  3484. %Given a system design $\x$, the worst-case probabilistic chance function (\ref{Prob-1}) solves the following linear program (LP):
  3485. %\begin{eqnarray}
  3486. %&\!\!\!\!\!\! \max  &  1-  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left(\alpha^{k}_{ij}\underline{\mu}_{{ij}}+ \beta^{k}_{ij}\overline{\mu}_{{ij}}\right)-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\epsilon_{ij}\lambda_{ij} - \tau \label{HP1-ambiguity-LP-FLaa} \\
  3487. % &\!\!\!\!\!\!{\rm s.t.} & \sum\limits_{j\in \mathbf{M}^{\rm p}_{l}}\sum\limits_{k\in \mathbf{K}_{lj}} q_{ljk}\mathcal{T}_R\nonumber\\
  3488. % &&+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[ \phi^{l }^{k}_{ij}\underline{z}_{ij}+ \varphi^{l}^{k}_{ij}\overline{z}_{ij}  + {\nu_{ij}\left(\pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} \right)} \right]+\tau \ge 1,~\forall {l  \in [3;5]}  \label{HP1-ambiguity-LP-FL1aa}\\
  3489. % &&p_{l}\mathcal{T}_R+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[ \phi^{l }^{k}_{ij}\underline{z}_{ij}+ \varphi^{l}^{k}_{ij}\overline{z}_{ij}  + {\nu_{ij}\left(\pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} \right)} \right]+\tau \ge 1,~\forall {l  \in [1;2]}\\
  3490. %  && \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[ \varsigma^{k}_{ij}\underline{z}_{ij}+ \vartheta^{k}_{ij}\overline{z}_{ij}  +  {\nu_{ij}\left(\gamma^{k}_{ij}-\theta^{k}_{ij} \right)}\right]+\tau \ge 0\label{HP1-ambiguity-LP-FL1-2aa}\\
  3491. % && q_{l jk}x_{l jk} +\phi^{l }_{l jk}+\varphi^{l }_{l jk}+ { \pi^{l }_{l jk}-\varpi^{l }_{l jk} }  = \alpha_{l jk}+\beta_{l jk},~\forall {l \in [3;5]}, j \in \mathbf{M}^{\rm p}_l, k\in \mathbf{K}_{l j}   \\
  3492. %&& p_{l}x_{l jk} +\phi^{l }_{l jk}+\varphi^{l }_{l jk}+ { \pi^{l }_{l jk}-\varpi^{l }_{l jk} }  = \alpha_{l jk}+\beta_{l jk},~\forall {l \in [1;2]}, j \in \mathbf{M}^{\rm c}_l, k\in \mathbf{K}_{l j}   \\
  3493. %&&\phi^{l }^{k}_{ij}+\varphi^{l }^{k}_{ij}+ { \pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} }= \alpha^{k}_{ij}+\beta^{k}_{ij}, ~ \forall {l  \in \mathbf{N}}, i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}  \label{HP1-ambiguity-LP-FL2aa}\\
  3494. %&&{|\mathbf{K}_{ij} |\sigma_{ij}} (\pi^{l }^{k}_{ij}+\varpi^{l }^{k}_{ij}) =\lambda_{ij},  ~ \forall {l  \in \mathbf{N}}, i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  3495. %&& \varsigma^{k}_{ij}+ \vartheta^{k}_{ij} + { \gamma^{k}_{ij}-\theta^{k}_{ij} }= \alpha^{k}_{ij}+\beta^{k}_{ij}, ~ \forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  3496. %&&  {|\mathbf{K}_{ij} |\sigma_{ij}}(\gamma^{k}_{ij}+\theta^{k}_{ij}) = \lambda_{{ij}}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  3497. %%&&  q_{l jk}\le y_{{l jk}}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  3498. %%&& y_{{l jk}} \ge M x_{l jk}, ~\forall l  \in \mathbf{N}, j \in \mathbf{M}_{l}, k\in \mathcal{N}(l,j) \\[0.3 cm]
  3499. %%&& y_{{l jk}} \le  q_{l jk }+(x_{l jk}-1)M, ~\forall l  \in \mathbf{N}, j \in \mathcal{J}({l}), k \in \mathcal{N}(l,j)\\[0.3 cm]
  3500. %%&&  \sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{ij}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i \\
  3501. %&& \alpha^{k}_{ij}\le 0, \beta^{k}_{ij}\ge 0, \lambda_{{ij}} \ge 0, \tau \in \mathbb{R}, ~\forall i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  3502. %&&q_{l jk}\le 0, ~\forall {l  \in [3;5]}, j\in \mathbf{M}^{\rm p}_{l}, k \in \mathbf{K}_{lj}\\
  3503. %&&p_{l}\le 0, ~\forall {l  \in [1;2]}\\
  3504. %&& \phi^{l }^{k}_{ij} \ge 0,  \varphi^{l }^{k}_{ij} \le 0,  \pi^{l }^{k}_{ij}\ge 0,\varpi^{l }^{k}_{ij}\ge 0, ~\forall {l  \in \mathbf{N}},  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  3505. %&& \theta^{k}_{ij}\ge 0, \gamma^{k}_{ij}\ge 0, \varsigma^{k}_{ij}\ge 0, \vartheta^{k}_{ij} \le 0, ~\forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}, \label{HP2-ambiguity-LP-FLaa}
  3506. %\end{eqnarray}
  3507. %where $\halpha, \hbeta, \hlambda, \tau, \q, \s, \hphi, \hvarphi, \hpi, \hvarpi, \htheta, \hgamma, \hvarsigma$ and $\hvartheta$ are auxiliary variables.
  3508. %\end{proposition}
  3509. %
  3510. %\begin{proposition}\label{proposition1b}
  3511. %The robust system reliability redundancy allocation problem (\ref{HP1-ambiguity-X}) can be cast into the following mixed integer linear program (MILP):
  3512. %\begin{eqnarray}
  3513. % & \min\limits_{\x} &  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \left[\sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\right]c_{ij} \label{HP1-ambiguity-MILP-FL1aa}\\
  3514. % &{\rm s.t.} & 1-  \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left(\alpha^{k}_{ij}\underline{\mu}_{{ij}}+ \beta^{k}_{ij}\overline{\mu}_{{ij}}\right)-\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i}\epsilon_{ij}\lambda_{ij} - \tau\ge R_{0}   \\
  3515. % &&  \sum_{j\in \mathbf{M}_i}\sum\limits_{k\in \mathbf{K}_{ij}} x^{k}_{ij}\ge  L_{i}, ~  \forall  i \in \mathbf{N} \\
  3516. %%&& \sum\limits_{j\in \mathbf{M}_{l}} \sum\limits_{k\in \mathbf{K}_{l j}}q_{l jk}\mathcal{T}_R\nonumber\\
  3517. %% &&+\sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[ \phi^{l }^{k}_{ij}\underline{z}_{ij}+ \varphi^{\varsigma}^{k}_{ij}\overline{z}_{ij}  + {\nu_{ij}\left(\pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} \right)} \right]+\tau \ge 1,~\forall {l  \in \mathbf{N}}  \\
  3518. %&& y^{\rm p}_{l jk} +\phi^{l }_{l jk}+\varphi^{l }_{l jk}+ { \pi^{l }_{l jk}-\varpi^{l }_{l jk} }  = \alpha_{l jk}+\beta_{l jk},~\forall {l \in [3;5]}, j \in \mathbf{M}^{\rm p}_{l}, k\in \mathbf{K}_{l j}    \\
  3519. %&& y^{\rm c}_{l jk} +\phi^{l }_{l jk}+\varphi^{l }_{l jk}+ { \pi^{l }_{l jk}-\varpi^{l }_{l jk} }  = \alpha_{l jk}+\beta_{l jk},~\forall {l \in [1;2]}, j \in \mathbf{M}^{\rm c}_{l}, k\in \mathbf{K}_{l j}    \\
  3520. % && (\ref{HP1-ambiguity-LP-FL1aa}-\ref{HP1-ambiguity-LP-FL1-2aa}); (\ref{HP1-ambiguity-LP-FL2aa})-(\ref{HP2-ambiguity-LP-FLaa})\\
  3521. %%&&\phi^{l }^{k}_{ij}+\varphi^{l }^{k}_{ij}+ { \pi^{l }^{k}_{ij}-\varpi^{l }^{k}_{ij} }= \alpha^{k}_{ij}+\beta^{k}_{ij}, ~ \forall {l  \in \mathbf{N}}, i \in \mathbf{N}\setminus\{l \}, j \in \mathbf{M}_i, k\in \mathbf{K}_{ij}  \\
  3522. %%&&{|\mathbf{K}_{ij} |\sigma_{ij}} (\pi^{l }^{k}_{ij}+\varpi^{l }^{k}_{ij}) =\lambda_{ij},  ~ \forall {l  \in \mathbf{N}}, i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  3523. %%&& \sum\limits_{i\in \mathbf{N}}\sum\limits_{j\in \mathbf{M}_i} \sum\limits_{k\in \mathbf{K}_{ij}} \left[\left(\varsigma^{k}_{ij}\underline{z}_{ij}+ \vartheta^{k}_{ij}\overline{z}_{ij} \right) +  {\nu_{ij}\left(\gamma^{k}_{ij}-\theta^{k}_{ij} \right)}\right]+\tau \ge 0\\
  3524. %%&& \varsigma^{k}_{ij}+ \vartheta^{k}_{ij} + { \gamma^{k}_{ij}-\theta^{k}_{ij} }= \alpha^{k}_{ij}+\beta^{k}_{ij}, ~ \forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  3525. %%&&  {|\mathbf{K}_{ij} |\sigma_{ij}}(\gamma^{k}_{ij}+\theta^{k}_{ij}) = \lambda_{{ij}}, ~  \forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij} \\
  3526. %&&  q_{l jk}\le y^{\rm p}_{{l jk}}, ~\forall l  \in [3;5], j \in \mathbf{M}^{\rm p}_{l}, k\in \mathbf{K}_{l j} \\
  3527. %&& y^{\rm p}_{{l jk}} \ge M_1 x_{l jk}, ~\forall l  \in [3;5], j \in \mathbf{M}^{\rm p}_{l}, k\in \mathbf{K}_{l j} \\
  3528. %&& y^{\rm p}_{{l jk}} \le  q_{l jk}+(x_{l jk}-1)M_1, ~\forall l  \in [3;5], j \in \mathbf{M}^{\rm p}_{l}, k \in \mathbf{K}_{l j}\\
  3529. %&&  p_{l}\le y^{\rm c}_{{l jk}}, ~\forall l  \in [1;2], j \in \mathbf{M}^{\rm c}_{l}, k\in \mathbf{K}_{l j} \\
  3530. %&& y^{\rm c}_{{l jk}} \ge M_2 x_{l jk}, ~\forall l  \in [1;2], j \in \mathbf{M}^{\rm c}_{l}, k\in \mathbf{K}_{l j} \\
  3531. %&& y^{\rm c}_{{l jk}} \le  p_{l }+(x_{l jk}-1)M_2, ~\forall l  \in [1;2], j \in \mathbf{M}^{\rm c}_{l}, k \in \mathbf{K}_{l j}\\
  3532. %%&& \alpha^{k}_{ij}\le 0, \beta^{k}_{ij}\ge 0, \lambda_{{ij}} \ge 0, \tau \in \Re, ~\forall i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  3533. %%&&q_{l jk}\le 0, y_{l jk} \le 0, ~\forall {l  \in \mathbf{N}}, j\in \mathbf{M}_{l}, k\in \mathbf{K}_{l j} \\
  3534. %%&& \phi^{l }^{k}_{ij} \ge 0,  \varphi^{l }^{k}_{ij} \le 0,  \pi^{l }^{k}_{ij}\ge 0,\varpi^{l }^{k}_{ij}\ge 0, ~\forall {l  \in \mathbf{N}},  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  3535. %%&& \theta^{k}_{ij}\ge 0, \gamma^{k}_{ij}\ge 0, \varsigma^{k}_{ij}\ge 0, \vartheta^{k}_{ij} \le 0, ~\forall  i \in \mathbf{N}, j\in \mathbf{M}_i, k\in \mathbf{K}_{ij}\\
  3536. %&& y^{\rm p}_{{l jk}}\le 0,  ~ \forall l \in [3;5], j\in \mathbf{M}^{\rm a}_i, k \in \mathbf{K}_{l j}\\
  3537. %&& y^{\rm c}_{{l jk}}\le 0,  ~ \forall l \in [1;2], j\in \mathbf{M}^{\rm c}_i, k \in \mathbf{K}_{l j}\\
  3538. %&&  x^{k}_{ij} \in \{0,1\},  ~ \forall i \in \mathbf{N}, j\in \mathbf{M}_i, k \in \mathbf{K}_{ij}, \label{HP1-ambiguity-MILP-FL2aa}
  3539. %\end{eqnarray}
  3540. %where $\halpha, \hbeta, \hlambda, \tau, \q, \s, \y^{\rm p}, \y^{\rm c}, \hphi, \hvarphi, \hpi, \hvarpi, \htheta, \hvartheta, \hvarsigma$ and $\hvartheta$ are auxilary variables and $M$ is a sufficiently small negative number.
  3541. %\end{proposition}
  3542.  
  3543. \end{document}
  3544. %%
  3545. %% E    nd of file `elsarticle-template-num.tex'.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement