Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Sub LayoutPrep()
- '
- ' Macro written by DSpider, around mid 2012.
- '
- ' What this does, is it breaks down a Word document into a squeaky clean document,
- ' with its basic formatting intact, ready for re-applying styles or quick styles
- ' in Word, or in Adobe InDesign.
- '
- ' - Bold text
- ' - Italic text
- ' - Underlined text
- ' - Subscripts
- ' - Superscripts
- ' - Paragraph ending marks
- ' - Line Breaks
- ' - Page Breaks
- '
- '
- ' Set the desktop variable and the working directory for everything else.
- '
- Dim WshShell As Object
- Dim SpecialPath As String
- Set WshShell = CreateObject("WScript.Shell")
- DESKTOP = WshShell.SpecialFolders("Desktop")
- ChangeFileOpenDirectory DESKTOP
- '
- ' 1. Convert numbered and bulleted lists to regular text. Honestly, they're more
- ' of a nuisance with scanned text and OCR. Without this they would not make it
- ' to the second part because Word treats them (along with footnotes) more like
- ' objects.
- '
- ActiveDocument.ConvertNumbersToText
- '
- '
- ' 2. Convert footnotes to regular text so that they too make it intact.
- '
- Dim afootnote As Footnote
- Dim NumberOfFootnotes As Integer
- Dim i As Integer
- Dim aFootnoteReference As String
- Dim aFootnoteRefTag As String
- NumberOfFootnotes = ActiveDocument.Footnotes.Count
- For i = NumberOfFootnotes To 1 Step -1
- Set afootnote = ActiveDocument.Footnotes(i)
- afootnote.Range.Select
- Selection.MoveStartWhile Cset:=" " & Chr(9)
- Selection.Cut
- aFootnoteReference = afootnote.Reference.Text
- Select Case aFootnoteReference
- Case Chr(2)
- aFootnoteRefTag = "num"
- Case "*"
- aFootnoteRefTag = "star"
- Case Else
- aFootnoteRefTag = "symbol" _
- & aFootnoteReference & "/FNRef"
- End Select
- afootnote.Reference.Select
- If afootnote.Reference.Text = Chr(40) Then
- With Dialogs(wdDialogInsertSymbol)
- aFootnoteRefTag = _
- "FNSym," & .Font & "," _
- & .CharNum & ""
- End With
- End If
- afootnote.Delete
- Selection.InsertBefore ChrW(9616) _
- & aFootnoteRefTag
- Selection.Collapse (wdCollapseEnd)
- Selection.Paste
- Selection.InsertAfter ChrW(9612)
- Next i
- '
- '
- ' 3. Remove all the tab characters by replacing them with the space character.
- ' This is because FineReader sometimes adds multiple tabs when you only need
- ' one. Easier to spot too when you're assigning styles.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find
- .Text = "^t"
- .Replacement.Text = " "
- .Forward = True
- .Wrap = wdFindContinue
- .Format = False
- .MatchCase = False
- .MatchWholeWord = False
- .MatchWildcards = False
- .MatchSoundsLike = False
- .MatchAllWordForms = False
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 4. Paragraph-level formatting: Delete the ruler tabs.
- '
- Selection.WholeStory
- With Selection.ParagraphFormat
- .SpaceBeforeAuto = False
- .SpaceAfterAuto = False
- End With
- Selection.ParagraphFormat.TabStops.ClearAll
- ActiveDocument.DefaultTabStop = InchesToPoints(0.5)
- '
- '
- ' 5. Paragraph-level formatting: Align everything to the left.
- '
- With Selection.ParagraphFormat
- .LeftIndent = InchesToPoints(0)
- .RightIndent = InchesToPoints(0)
- .SpaceBefore = 0
- .SpaceBeforeAuto = False
- .SpaceAfter = 0
- .SpaceAfterAuto = False
- .LineSpacingRule = wdLineSpaceSingle
- .Alignment = wdAlignParagraphLeft
- .FirstLineIndent = InchesToPoints(0)
- .OutlineLevel = wdOutlineLevelBodyText
- .CharacterUnitLeftIndent = 0
- .CharacterUnitRightIndent = 0
- .CharacterUnitFirstLineIndent = 0
- .LineUnitBefore = 0
- .LineUnitAfter = 0
- End With
- '
- '
- ' 6. Replace the line break character (but keep the paragraph ending marks or
- ' else the process would get too slow), then replace the Page Break and
- ' Section Break.
- '
- With ActiveDocument.Content.Find
- .ClearFormatting
- .Replacement.ClearFormatting
- .Forward = True
- .Wrap = wdFindContinue
- .Format = False
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = False
- .Text = "^l"
- .Replacement.Text = ChrW(9668)
- .Execute Replace:=wdReplaceAll
- .Text = "^m"
- .Replacement.Text = ChrW(9618)
- .Execute Replace:=wdReplaceAll
- .Text = "^b"
- .Replacement.Text = ChrW(9618)
- .Execute Replace:=wdReplaceAll
- End With
- '
- '
- ' 7. Italic characters.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Font.Italic = True
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find
- .Text = "(?)"
- .Replacement.Text = ChrW(9500) & "\1" & ChrW(9508)
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 8. Bold characters.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Font.Bold = True
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find
- .Text = "(?)"
- .Replacement.Text = ChrW(9568) & "\1" & ChrW(9571)
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 9. Underlined characters.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Font.Underline = wdUnderlineSingle
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find
- .Text = "(?)"
- .Replacement.Text = ChrW(9556) & "\1" & ChrW(9559)
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 10. Superscripts.
- '
- Selection.Find.ClearFormatting
- With Selection.Find.Font
- .Superscript = True
- .Subscript = False
- End With
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find
- .Text = "(?)"
- .Replacement.Text = ChrW(9560) & "\1" & ChrW(9563)
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 11. Subscripts.
- '
- Selection.Find.ClearFormatting
- With Selection.Find.Font
- .Superscript = False
- .Subscript = True
- End With
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find
- .Text = "(?)"
- .Replacement.Text = ChrW(9554) & "\1" & ChrW(9557)
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 12. Save to the desktop as 'Plain Text.txt' and close the file.
- '
- ActiveDocument.SaveAs2 FileName:="Plain Text.txt", FileFormat:= _
- wdFormatText, LockComments:=False, Password:="", AddToRecentFiles:=True, _
- WritePassword:="", ReadOnlyRecommended:=False, EmbedTrueTypeFonts:=False, _
- SaveNativePictureFormat:=False, SaveFormsData:=False, SaveAsAOCELetter:= _
- False, Encoding:=1200, InsertLineBreaks:=False, AllowSubstitutions:=False _
- , LineEnding:=wdCRLF, CompatibilityMode:=0
- ActiveDocument.Close
- '
- '
- ' --------------------------
- ' PART II
- ' --------------------------
- '
- ' Open the broken down text, to be able to restore the formatting.
- '
- Documents.Open FileName:="Plain Text.txt", ConfirmConversions:=False, _
- ReadOnly:=False, AddToRecentFiles:=False, PasswordDocument:="", _
- PasswordTemplate:="", Revert:=False, WritePasswordDocument:="", _
- WritePasswordTemplate:="", Format:=wdOpenFormatAuto, XMLTransform:="", _
- Encoding:=1200
- '
- '
- ' 1. Delete the useless "Plain Text" style. There's no such thing as plain text
- ' in Word. Everything has some kind of formatting applied to it.
- '
- ActiveDocument.Styles("Plain Text").Delete
- '
- '
- ' 2. Remove all quick styles from the document; default Word templates are
- ' unaffected.
- '
- Dim s As Style
- For Each s In ActiveDocument.Styles
- If s.Type = wdStyleTypeCharacter Or _
- s.Type = wdStyleTypeParagraph Or _
- s.Type = wdStyleTypeLinked Then
- s.QuickStyle = False
- End If
- Next s
- '
- '
- ' 3. Create a "Text" style and apply it to the document because "Normal" is too
- ' mainstream. For various adjustments you can add it to the Quick Styles menu.
- '
- ActiveDocument.Styles.Add Name:="Text", Type:=wdStyleTypeParagraph
- ActiveDocument.Content.Style = ActiveDocument.Styles("Text")
- With ActiveDocument.Styles("Text").Font
- .Name = "Times New Roman"
- .Size = 10
- .Bold = False
- .Italic = False
- .Underline = wdUnderlineNone
- .UnderlineColor = wdColorAutomatic
- .StrikeThrough = False
- .DoubleStrikeThrough = False
- .Outline = False
- .Emboss = False
- .Shadow = False
- .Hidden = False
- .SmallCaps = False
- .AllCaps = False
- .Color = wdColorAutomatic
- .Engrave = False
- .Superscript = False
- .Subscript = False
- .Scaling = 100
- .Kerning = 0
- .Animation = wdAnimationNone
- .Ligatures = wdLigaturesNone
- .NumberSpacing = wdNumberSpacingDefault
- .NumberForm = wdNumberFormDefault
- .StylisticSet = wdStylisticSetDefault
- .ContextualAlternates = 0
- End With
- With ActiveDocument.Styles("Text")
- .AutomaticallyUpdate = False
- .BaseStyle = ""
- .NextParagraphStyle = "Text"
- End With
- '
- '
- ' 4. Restore subscripts.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find.Replacement.Font
- .Superscript = False
- .Subscript = True
- End With
- With Selection.Find
- .Text = ChrW(9554) & "(?)" & ChrW(9557)
- .Replacement.Text = "\1"
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 5. Restore superscripts.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Replacement.ClearFormatting
- With Selection.Find.Replacement.Font
- .Superscript = True
- .Subscript = False
- End With
- With Selection.Find
- .Text = ChrW(9560) & "(?)" & ChrW(9563)
- .Replacement.Text = "\1"
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 6. Replace paragraph endings temporarily so that they too can receive bold and
- ' italic attributes. It's a little difficult to explain but just know that it's
- ' needed... For example if you have a block of text with italic (or bold)
- ' attributes, the paragraph ending marks (ΒΆ) will not receive the attribute and
- ' every line will have separate tags instead of treating it like a whole.
- '
- ' This is the reason the "decoding" process takes such a long time. Because you
- ' basically search and replace throughout a SINGLE 1+ MB paragraph (depending
- ' on the complexity of the book.
- '
- With ActiveDocument.Content.Find
- .ClearFormatting
- .Replacement.ClearFormatting
- .Forward = True
- .Wrap = wdFindContinue
- .Format = False
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = False
- .Text = "^p"
- .Replacement.Text = ChrW(9608)
- .Execute Replace:=wdReplaceAll
- End With
- '
- '
- ' 7. Restore underlined characters.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Replacement.ClearFormatting
- Selection.Find.Replacement.Font.Underline = wdUnderlineSingle
- With Selection.Find
- .Text = ChrW(9556) & "(?)" & ChrW(9559)
- .Replacement.Text = "\1"
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 8. Restore bold characters.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Replacement.ClearFormatting
- Selection.Find.Replacement.Font.Bold = True
- With Selection.Find
- .Text = ChrW(9568) & "(?)" & ChrW(9571)
- .Replacement.Text = "\1"
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 9. Restore italic characters.
- '
- Selection.Find.ClearFormatting
- Selection.Find.Replacement.ClearFormatting
- Selection.Find.Replacement.Font.Italic = True
- With Selection.Find
- .Text = ChrW(9500) & "(?)" & ChrW(9508)
- .Replacement.Text = "\1"
- .Forward = True
- .Wrap = wdFindContinue
- .Format = True
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = True
- End With
- Selection.Find.Execute Replace:=wdReplaceAll
- '
- '
- ' 10. Restore paragraph ending marks, line breaks and page breaks.
- '
- With ActiveDocument.Content.Find
- .ClearFormatting
- .Replacement.ClearFormatting
- .Forward = True
- .Wrap = wdFindContinue
- .Format = False
- .MatchCase = False
- .MatchWholeWord = False
- .MatchAllWordForms = False
- .MatchSoundsLike = False
- .MatchWildcards = False
- .Text = ChrW(9608)
- .Replacement.Text = "^p"
- .Execute Replace:=wdReplaceAll
- .Text = ChrW(9668)
- .Replacement.Text = "^l"
- .Execute Replace:=wdReplaceAll
- .Text = ChrW(9618)
- .Replacement.Text = "^m"
- .Execute Replace:=wdReplaceAll
- End With
- '
- '
- ' 11. Save to the desktop as 'Formatted Text.rtf'. RTF chosen because it has the
- ' highest compatibility with InDesign, and possibly other word processors.
- '
- ActiveDocument.SaveAs2 FileName:="Formatted Text.rtf", FileFormat:= _
- wdFormatRTF, LockComments:=False, Password:="", AddToRecentFiles:=True, _
- WritePassword:="", ReadOnlyRecommended:=False, EmbedTrueTypeFonts:=False, _
- SaveNativePictureFormat:=False, SaveFormsData:=False, SaveAsAOCELetter:= _
- False, CompatibilityMode:=0
- ActiveDocument.Close
- '
- '
- ' 12. Clean-up, and a simple prompt message when it's done.
- '
- Kill "Plain Text.txt"
- Documents.Open FileName:="Formatted Text.rtf"
- MsgBox ("Done!")
- End Sub
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement