Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scala.collection.mutable.HashMap
- import scala.collection.mutable.ArrayBuffer
- sealed trait Symbol[C] extends Ordered[Symbol[C]] {
- def isTerminal(): Boolean
- }
- case class CharacterSymbol[C: Ordering](val character: C) extends Symbol[C] {
- def isTerminal(): Boolean = false
- def compare(that: CharacterSymbol[C]): Int = Ordering[C].compare(this.character, that.character)
- def compare(that: TerminalSymbol[C]): Int = -1
- override def compare(that: Symbol[C]): Int = that match {
- case that: CharacterSymbol[C] => compare(that)
- case that: TerminalSymbol[C] => compare(that)
- }
- override def toString(): String = character.toString()
- }
- case class TerminalSymbol[C: Ordering](val index: Int) extends Symbol[C] {
- def isTerminal(): Boolean = true
- def compare(that: TerminalSymbol[C]): Int = Ordering[Int].compare(this.index, that.index)
- def compare(that: CharacterSymbol[C]): Int = 1
- override def compare(that: Symbol[C]): Int = that match {
- case that: CharacterSymbol[C] => compare(that)
- case that: TerminalSymbol[C] => compare(that)
- }
- override def toString(): String = "$" + index.toString()
- }
- object Symbols {
- def fromString[C: Ordering](string: Seq[C], index: Int): Seq[Symbol[C]] = string.map(c => CharacterSymbol(c)) :+ TerminalSymbol[C](index)
- }
- class State[C: Ordering] { self =>
- private val transitionMap: HashMap[C, Transition[C]] = HashMap.empty[C, Transition[C]]
- def transitions: Map[C, Transition[C]] = transitionMap.toMap
- def getTransitions: Map[C, Transition[C]] = transitions
- def getTransition(c: C): Option[Transition[C]] = transitions.get(c)
- def addTransition(c: C, transition: Transition[C]) = transitionMap.put(c, transition)
- private var sourceTransitionOption: Option[Transition[C]] = None
- def sourceTransition: Option[Transition[C]] = sourceTransitionOption
- def getSourceTransition: Option[Transition[C]] = sourceTransition
- def setSourceTransition(transition: Transition[C]) = {
- sourceTransitionOption = Some(transition)
- }
- private var suffixLinkOption: Option[State[C]] = None
- def suffixLink: Option[State[C]] = suffixLinkOption
- def getSuffixLink: Option[State[C]] = suffixLink
- def setSuffixLink(suffixLink: State[C]) = {
- suffixLinkOption = Some(suffixLink)
- }
- def parent: Option[State[C]] = sourceTransition.map(_.source)
- def children: Seq[State[C]] = transitions.values.map(_.dest).toSeq
- def isLeaf: Boolean = transitions.isEmpty
- def inInternal: Boolean = !transitions.isEmpty
- def isRoot: Boolean = false
- def isPerp: Boolean = false
- private def statesUptoRoot: Iterator[State[C]] = new BufferedIterator[State[C]] {
- private var cursor: State[C] = self
- override def head: State[C] = cursor
- override def hasNext: Boolean = cursor != null
- override def next(): State[C] = {
- val currentCursor = cursor
- cursor = cursor.sourceTransition.map(_.source).getOrElse(null)
- currentCursor
- }
- }
- def depth: Int = statesUptoRoot.size - 1
- def prefix: Seq[C] = sourceTransition.map(_.prefix).getOrElse(Seq.empty[C])
- def toState: State[C] = this
- def toDebugString: String = {
- val builder = new StringBuilder("root\n")
- def addEntryLines(state: State[C], prefix: String): Unit = {
- if (!state.transitions.isEmpty) {
- for ((first, transition) <- state.transitions.toSeq.sortBy(_._1)) {
- builder ++= prefix + "|-" + transition.label + "\n"
- addEntryLines(transition.dest, prefix + "| ")
- }
- }
- }
- addEntryLines(this, "")
- builder.toString()
- }
- def printDebugString: Unit = {
- println(toDebugString)
- }
- }
- class RootState[C: Ordering] extends State[C] {
- override def toString: String = "root"
- override def isRoot: Boolean = true
- }
- class PerpState[C: Ordering](val root: RootState[C]) extends State[C] {
- private val transitionToRoot = new Transition(Seq.empty[C], this, 0, 0, root)
- override def getTransition(c: C): Option[Transition[C]] = Some(transitionToRoot)
- override def toString: String = "perp"
- override def isPerp: Boolean = true
- }
- class SlicedSeq[A](val original: Seq[A], val from: Int, val until: Int) extends Seq[A] {
- def sliced: Seq[A] = original.slice(from, until)
- override def apply(idx: Int): A = sliced.apply(idx)
- override def iterator: Iterator[A] = sliced.iterator
- override def length: Int = sliced.length
- }
- class Transition[C](val sequence: Seq[C], val source: State[C], val start: Int, val end: Int, val dest: State[C])
- extends SlicedSeq(sequence, start - 1, end) {
- def subsequence: Seq[C] = sequence.slice(start - 1, end)
- def prefix: Seq[C] = sequence.slice(0, end)
- def label: String = subsequence.map(_.toString).mkString
- }
- class GeneralizedSuffixTree[C: Ordering] {
- private val sequenceBuffer: ArrayBuffer[Seq[C]] = ArrayBuffer.empty[Seq[C]]
- def sequences: Seq[Seq[C]] = sequenceBuffer
- val root = new RootState[Symbol[C]]()
- val perp = new PerpState[Symbol[C]](root)
- root.setSuffixLink(perp)
- def addSequence(sequence: Seq[C]): Unit = {
- val sequenceIndex = sequences.length
- sequenceBuffer.append(sequence)
- val symbols = Symbols.fromString(sequence, sequenceIndex)
- val terminalSymbol = symbols.last
- def inf: Int = symbols.length
- def t_(i: Int): Symbol[C] = {
- symbols(i - 1)
- }
- def createTransition(sequence: Seq[Symbol[C]], from: State[Symbol[C]], startAndEnd: (Int, Int), to: State[Symbol[C]]): Unit = {
- val start = startAndEnd._1
- val end = startAndEnd._2
- val transition = new Transition(sequence, from, start, end, to)
- from.addTransition(transition.head, transition)
- to.setSourceTransition(transition)
- }
- def createSuffixLink(arg: State[Symbol[C]], res: State[Symbol[C]]): Unit = {
- arg.setSuffixLink(res)
- }
- def findTransition(t: Symbol[C], from: State[Symbol[C]]): Option[Transition[Symbol[C]]] = {
- from.getTransition(t)
- }
- def splitTransition(gprime: Transition[Symbol[C]], s: State[Symbol[C]], kp: (Int, Int)): State[Symbol[C]] = {
- val k = kp._1
- val p = kp._2
- val pprime = gprime.end
- val kprime = gprime.start
- val sprime = gprime.dest
- val r = new State[Symbol[C]]()
- // s -> sprime => s -> r -> sprime
- createTransition(gprime.sequence, r, (kprime + p - k + 1, pprime), sprime)
- createTransition(gprime.sequence, s, (kprime, kprime + p - k), r)
- r
- }
- def fprime(s: State[Symbol[C]]): State[Symbol[C]] = {
- s.getSuffixLink.get
- }
- def update(state: State[Symbol[C]], ki: (Int, Int)): (State[Symbol[C]], Int) = {
- var s = state
- var k = ki._1
- var i = ki._2
- var sk = (s, k)
- var oldr = root.toState
- var endPointAndR = testAndSplit(s, (k, i - 1), t_(i))
- var endPoint = endPointAndR._1
- var r = endPointAndR._2
- while (!endPoint) {
- val rprime = new State[Symbol[C]]()
- createTransition(symbols, r, (i, inf), rprime)
- if (oldr != root) {
- createSuffixLink(oldr, r)
- }
- oldr = r
- sk = canonize(fprime(s), (k, i - 1))
- s = sk._1
- k = sk._2
- endPointAndR = testAndSplit(s, (k, i - 1), t_(i))
- endPoint = endPointAndR._1
- r = endPointAndR._2
- }
- if (oldr != root) {
- createSuffixLink(oldr, s)
- }
- (s, k)
- }
- def testAndSplit(state: State[Symbol[C]], kp: (Int, Int), t: Symbol[C]): (Boolean, State[Symbol[C]]) = {
- var s = state
- var k = kp._1
- var p = kp._2
- if (k <= p) {
- var gprime = findTransition(t_(k), s).get
- var pprime = gprime.end
- var kprime = gprime.start
- var sprime = gprime.to
- //if (t == t_(kprime + p - k + 1)) {
- if (t == gprime(p - k + 1)) {
- (true, s)
- } else { // split the transition with new state r in between
- val r = splitTransition(gprime, s, kp)
- (false, r)
- }
- } else {
- val trans = findTransition(t, s)
- if (!trans.isDefined) {
- (false, s)
- } else {
- (true, s)
- }
- }
- }
- def canonize(state: State[Symbol[C]], kp: (Int, Int)): (State[Symbol[C]], Int) = {
- var s = state
- var k = kp._1
- var p = kp._2
- if (p < k) {
- (s, k)
- } else {
- var gprime = findTransition(t_(k), s).get
- var pprime = gprime.end
- var kprime = gprime.start
- var sprime = gprime.dest
- while ((pprime - kprime) <= (p - k)) { // while length of transition is leq length of remains
- k = k + pprime - kprime + 1 // proceed k to the length of transition
- s = sprime // push state to the target of transition
- if (k <= p) { // if remains are still left, find the next transition
- gprime = findTransition(t_(k), s).get
- pprime = gprime.end
- kprime = gprime.start
- sprime = gprime.dest
- }
- }
- (s, k)
- }
- }
- var s = root.toState
- var k = 1
- var i = 0
- var sk = (s, k)
- while (t_(i + 1) != terminalSymbol) {
- i = i + 1
- sk = update(s, (k, i))
- s = sk._1
- k = sk._2
- sk = canonize(s, (k, i))
- s = sk._1
- k = sk._2
- }
- // add terminal symbol
- i = i + 1
- sk = update(s, (k, i))
- s = sk._1
- k = sk._2
- sk = canonize(s, (k, i))
- s = sk._1
- k = sk._2
- }
- def addSequences(sequences: Seq[C]*): Unit = {
- for (sequence <- sequences) {
- addSequence(sequence)
- }
- }
- }
- object Application {
- def main(args: Array[String]): Unit = {
- {
- val sequence = "mississippi"
- val tree = new GeneralizedSuffixTree[Char]()
- tree.addSequence(sequence)
- tree.root.printDebugString
- val debugString =
- """
- root
- |-i
- | |-ppi$0
- | |-ssi
- | | |-ppi$0
- | | |-ssippi$0
- | |-$0
- |-mississippi$0
- |-p
- | |-i$0
- | |-pi$0
- |-s
- | |-i
- | | |-ppi$0
- | | |-ssippi$0
- | |-si
- | | |-ppi$0
- | | |-ssippi$0
- |-$0
- """
- println(debugString.trim == tree.root.toDebugString.trim)
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement