View difference between Paste ID: 9TLg4UWG and jpJQUpDK
SHOW: | | - or go back to the newest paste.
1
Sub LayoutPrep()
2
'
3
'  Macro written by DSpider, around mid 2012.
4
'
5
'  What this does, is it breaks down a Word document into a squeaky clean document,
6
'  with its basic formatting intact, ready for re-applying styles or quick styles
7
'  in Word, or in Adobe InDesign.
8
'
9
'    - Bold text
10
'    - Italic text
11
'    - Underlined text
12
'    - Subscripts
13
'    - Superscripts
14
'    - Paragraph ending marks
15
'    - Line Breaks
16
'    - Page Breaks
17
'
18
'
19
'  Set the desktop variable and the working directory for everything else.
20
'
21
    Dim WshShell As Object
22
    Dim SpecialPath As String
23
    Set WshShell = CreateObject("WScript.Shell")
24
    DESKTOP = WshShell.SpecialFolders("Desktop")
25
    ChangeFileOpenDirectory DESKTOP
26
'
27
'  1. Convert numbered and bulleted lists to regular text. Honestly, they're more
28
'     of a nuisance with scanned text and OCR. Without this they would not make it
29
'     to the second part because Word treats them (along with footnotes) more like
30
'     objects.
31
'
32
    ActiveDocument.ConvertNumbersToText
33
'
34
'
35
'  2. Convert footnotes to regular text so that they too make it intact.
36
'
37
   Dim afootnote As Footnote
38
   Dim NumberOfFootnotes As Integer
39
   Dim i As Integer
40
   Dim aFootnoteReference As String
41
   Dim aFootnoteRefTag As String
42
 
43
 NumberOfFootnotes = ActiveDocument.Footnotes.Count
44
 For i = NumberOfFootnotes To 1 Step -1
45
    Set afootnote = ActiveDocument.Footnotes(i)
46
    afootnote.Range.Select
47
    Selection.MoveStartWhile Cset:=" " & Chr(9)
48
    Selection.Cut
49
    aFootnoteReference = afootnote.Reference.Text
50
    Select Case aFootnoteReference
51
    Case Chr(2)
52
    aFootnoteRefTag = "num"
53
    Case "*"
54
    aFootnoteRefTag = "star"
55
    Case Else
56
    aFootnoteRefTag = "symbol" _
57
    & aFootnoteReference & "/FNRef"
58
 End Select
59
 afootnote.Reference.Select
60
 If afootnote.Reference.Text = Chr(40) Then
61
    With Dialogs(wdDialogInsertSymbol)
62
        aFootnoteRefTag = _
63
        "FNSym," & .Font & "," _
64
        & .CharNum & ""
65
    End With
66
 End If
67
 afootnote.Delete
68
 Selection.InsertBefore ChrW(9616) _
69
 & aFootnoteRefTag
70
 Selection.Collapse (wdCollapseEnd)
71
 Selection.Paste
72
 Selection.InsertAfter ChrW(9612)
73
 Next i
74
'
75
'
76
'  3. Remove all the tab characters by replacing them with the space character.
77
'     This is because FineReader sometimes adds multiple tabs when you only need
78
'     one. Easier to spot too when you're assigning styles.
79
'
80
    Selection.Find.ClearFormatting
81
    Selection.Find.Replacement.ClearFormatting
82
    With Selection.Find
83
        .Text = "^t"
84
        .Replacement.Text = " "
85
        .Forward = True
86
        .Wrap = wdFindContinue
87
        .Format = False
88
        .MatchCase = False
89
        .MatchWholeWord = False
90
        .MatchWildcards = False
91
        .MatchSoundsLike = False
92
        .MatchAllWordForms = False
93
    End With
94
    Selection.Find.Execute Replace:=wdReplaceAll
95
'
96
'
97
'  4. Paragraph-level formatting: Delete the ruler tabs.
98
'
99
    Selection.WholeStory
100
    With Selection.ParagraphFormat
101
        .SpaceBeforeAuto = False
102
        .SpaceAfterAuto = False
103
    End With
104
    Selection.ParagraphFormat.TabStops.ClearAll
105
    ActiveDocument.DefaultTabStop = InchesToPoints(0.5)
106
'
107
'
108
'  5. Paragraph-level formatting: Align everything to the left.
109
'
110
    With Selection.ParagraphFormat
111
        .LeftIndent = InchesToPoints(0)
112
        .RightIndent = InchesToPoints(0)
113
        .SpaceBefore = 0
114
        .SpaceBeforeAuto = False
115
        .SpaceAfter = 0
116
        .SpaceAfterAuto = False
117
        .LineSpacingRule = wdLineSpaceSingle
118
        .Alignment = wdAlignParagraphLeft
119
        .FirstLineIndent = InchesToPoints(0)
120
        .OutlineLevel = wdOutlineLevelBodyText
121
        .CharacterUnitLeftIndent = 0
122
        .CharacterUnitRightIndent = 0
123
        .CharacterUnitFirstLineIndent = 0
124
        .LineUnitBefore = 0
125
        .LineUnitAfter = 0
126
    End With
127
'
128
'
129
'  6. Replace the line break character (but keep the paragraph ending marks or
130
'     else the process would get too slow), then replace the Page Break and
131
'     Section Break.
132
'
133
With ActiveDocument.Content.Find
134
    .ClearFormatting
135
    .Replacement.ClearFormatting
136
    .Forward = True
137
    .Wrap = wdFindContinue
138
    .Format = False
139
    .MatchCase = False
140
    .MatchWholeWord = False
141
    .MatchAllWordForms = False
142
    .MatchSoundsLike = False
143
    .MatchWildcards = False
144
    .Text = "^l"
145
    .Replacement.Text = ChrW(9668)
146
    .Execute Replace:=wdReplaceAll
147
    .Text = "^m"
148
    .Replacement.Text = ChrW(9618)
149
    .Execute Replace:=wdReplaceAll
150
    .Text = "^b"
151
    .Replacement.Text = ChrW(9618)
152
    .Execute Replace:=wdReplaceAll
153
End With
154
'
155
'
156
'  7. Italic characters.
157
'
158
    Selection.Find.ClearFormatting
159
    Selection.Find.Font.Italic = True
160
    Selection.Find.Replacement.ClearFormatting
161
    With Selection.Find
162
        .Text = "(?)"
163
        .Replacement.Text = ChrW(9500) & "\1" & ChrW(9508)
164
        .Forward = True
165
        .Wrap = wdFindContinue
166
        .Format = True
167
        .MatchCase = False
168
        .MatchWholeWord = False
169
        .MatchAllWordForms = False
170
        .MatchSoundsLike = False
171
        .MatchWildcards = True
172
    End With
173
    Selection.Find.Execute Replace:=wdReplaceAll
174
'
175
'
176
'  8. Bold characters.
177
'
178
    Selection.Find.ClearFormatting
179
    Selection.Find.Font.Bold = True
180
    Selection.Find.Replacement.ClearFormatting
181
    With Selection.Find
182
        .Text = "(?)"
183
        .Replacement.Text = ChrW(9568) & "\1" & ChrW(9571)
184
        .Forward = True
185
        .Wrap = wdFindContinue
186
        .Format = True
187
        .MatchCase = False
188
        .MatchWholeWord = False
189
        .MatchAllWordForms = False
190
        .MatchSoundsLike = False
191
        .MatchWildcards = True
192
    End With
193
    Selection.Find.Execute Replace:=wdReplaceAll
194
'
195
'
196
'  9.  Underlined characters.
197
'
198
    Selection.Find.ClearFormatting
199
    Selection.Find.Font.Underline = wdUnderlineSingle
200
    Selection.Find.Replacement.ClearFormatting
201
    With Selection.Find
202
        .Text = "(?)"
203
        .Replacement.Text = ChrW(9556) & "\1" & ChrW(9559)
204
        .Forward = True
205
        .Wrap = wdFindContinue
206
        .Format = True
207
        .MatchCase = False
208
        .MatchWholeWord = False
209
        .MatchAllWordForms = False
210
        .MatchSoundsLike = False
211
        .MatchWildcards = True
212
    End With
213
    Selection.Find.Execute Replace:=wdReplaceAll
214
'
215
'
216
'  10. Superscripts.
217
'
218
    Selection.Find.ClearFormatting
219
    With Selection.Find.Font
220
        .Superscript = True
221
        .Subscript = False
222
    End With
223
    Selection.Find.Replacement.ClearFormatting
224
    With Selection.Find
225
        .Text = "(?)"
226
        .Replacement.Text = ChrW(9560) & "\1" & ChrW(9563)
227
        .Forward = True
228
        .Wrap = wdFindContinue
229
        .Format = True
230
        .MatchCase = False
231
        .MatchWholeWord = False
232
        .MatchAllWordForms = False
233
        .MatchSoundsLike = False
234
        .MatchWildcards = True
235
    End With
236
    Selection.Find.Execute Replace:=wdReplaceAll
237
'
238
'
239
'  11. Subscripts.
240
'
241
    Selection.Find.ClearFormatting
242
    With Selection.Find.Font
243
        .Superscript = False
244
        .Subscript = True
245
    End With
246
    Selection.Find.Replacement.ClearFormatting
247
    With Selection.Find
248
        .Text = "(?)"
249
        .Replacement.Text = ChrW(9554) & "\1" & ChrW(9557)
250
        .Forward = True
251
        .Wrap = wdFindContinue
252
        .Format = True
253
        .MatchCase = False
254
        .MatchWholeWord = False
255
        .MatchAllWordForms = False
256
        .MatchSoundsLike = False
257
        .MatchWildcards = True
258
    End With
259
    Selection.Find.Execute Replace:=wdReplaceAll
260
'
261
'
262
'  12. Save to the desktop as 'Plain Text.txt' and close the file.
263
'
264
    ActiveDocument.SaveAs2 FileName:="Plain Text.txt", FileFormat:= _
265
        wdFormatText, LockComments:=False, Password:="", AddToRecentFiles:=True, _
266
        WritePassword:="", ReadOnlyRecommended:=False, EmbedTrueTypeFonts:=False, _
267
         SaveNativePictureFormat:=False, SaveFormsData:=False, SaveAsAOCELetter:= _
268
        False, Encoding:=1200, InsertLineBreaks:=False, AllowSubstitutions:=False _
269
        , LineEnding:=wdCRLF, CompatibilityMode:=0
270
    ActiveDocument.Close
271
'
272
'
273
' --------------------------
274
'  PART II
275
' --------------------------
276
'
277
' Open the broken down text, to be able to restore the formatting.
278
'
279
    Documents.Open FileName:="Plain Text.txt", ConfirmConversions:=False, _
280
        ReadOnly:=False, AddToRecentFiles:=False, PasswordDocument:="", _
281
        PasswordTemplate:="", Revert:=False, WritePasswordDocument:="", _
282
        WritePasswordTemplate:="", Format:=wdOpenFormatAuto, XMLTransform:="", _
283
        Encoding:=1200
284
'
285
'
286
'  1. Delete the useless "Plain Text" style. There's no such thing as plain text
287
'     in Word. Everything has some kind of formatting applied to it.
288
'
289
    ActiveDocument.Styles("Plain Text").Delete
290
'
291
'
292-
'  2. Remove all quick styles for the document (default Word templates are intact).
292+
'  2. Remove all quick styles from the document; default Word templates are
293
'     unaffected.
294
'
295
    Dim s As Style
296
    For Each s In ActiveDocument.Styles
297
      If s.Type = wdStyleTypeCharacter Or _
298
         s.Type = wdStyleTypeParagraph Or _
299
         s.Type = wdStyleTypeLinked Then
300
         s.QuickStyle = False
301
      End If
302
    Next s
303
'
304
'
305
'  3.  Create a "Text" style and apply it to the document because "Normal" is too
306
'      mainstream. For various adjustments you can add it to the Quick Styles menu.
307
'
308
    ActiveDocument.Styles.Add Name:="Text", Type:=wdStyleTypeParagraph
309
    ActiveDocument.Content.Style = ActiveDocument.Styles("Text")
310
        With ActiveDocument.Styles("Text").Font
311
        .Name = "Times New Roman"
312
        .Size = 10
313
        .Bold = False
314
        .Italic = False
315
        .Underline = wdUnderlineNone
316
        .UnderlineColor = wdColorAutomatic
317
        .StrikeThrough = False
318
        .DoubleStrikeThrough = False
319
        .Outline = False
320
        .Emboss = False
321
        .Shadow = False
322
        .Hidden = False
323
        .SmallCaps = False
324
        .AllCaps = False
325
        .Color = wdColorAutomatic
326
        .Engrave = False
327
        .Superscript = False
328
        .Subscript = False
329
        .Scaling = 100
330
        .Kerning = 0
331
        .Animation = wdAnimationNone
332
        .Ligatures = wdLigaturesNone
333
        .NumberSpacing = wdNumberSpacingDefault
334
        .NumberForm = wdNumberFormDefault
335
        .StylisticSet = wdStylisticSetDefault
336
        .ContextualAlternates = 0
337
    End With
338
    With ActiveDocument.Styles("Text")
339
        .AutomaticallyUpdate = False
340
        .BaseStyle = ""
341
        .NextParagraphStyle = "Text"
342
    End With
343
'
344
'
345
'  4. Restore subscripts.
346
'
347
    Selection.Find.ClearFormatting
348
    Selection.Find.Replacement.ClearFormatting
349
    With Selection.Find.Replacement.Font
350
        .Superscript = False
351
        .Subscript = True
352
    End With
353
    With Selection.Find
354
        .Text = ChrW(9554) & "(?)" & ChrW(9557)
355
        .Replacement.Text = "\1"
356
        .Forward = True
357
        .Wrap = wdFindContinue
358
        .Format = True
359
        .MatchCase = False
360
        .MatchWholeWord = False
361
        .MatchAllWordForms = False
362
        .MatchSoundsLike = False
363
        .MatchWildcards = True
364
    End With
365
    Selection.Find.Execute Replace:=wdReplaceAll
366
'
367
'
368
'  5. Restore superscripts.
369
'
370
    Selection.Find.ClearFormatting
371
    Selection.Find.Replacement.ClearFormatting
372
    With Selection.Find.Replacement.Font
373
        .Superscript = True
374
        .Subscript = False
375
    End With
376
    With Selection.Find
377
        .Text = ChrW(9560) & "(?)" & ChrW(9563)
378
        .Replacement.Text = "\1"
379
        .Forward = True
380
        .Wrap = wdFindContinue
381
        .Format = True
382
        .MatchCase = False
383
        .MatchWholeWord = False
384
        .MatchAllWordForms = False
385
        .MatchSoundsLike = False
386
        .MatchWildcards = True
387
    End With
388
    Selection.Find.Execute Replace:=wdReplaceAll
389
'
390
'
391
'  6. Replace paragraph endings temporarily so that they too can receive bold and
392
'     italic attributes. It's a little difficult to explain but just know that it's
393
'     needed... For example if you have a block of text with italic (or bold)
394
'     attributes, the paragraph ending marks (¶) will not receive the attribute and
395
'     every line will have separate tags instead of treating it like a whole.
396
'
397
'     This is the reason the "decoding" process takes such a long time. Because you
398
'     basically search and replace throughout a SINGLE 1+ MB paragraph (depending
399
'     on the complexity of the book.
400
'
401
    With ActiveDocument.Content.Find
402
        .ClearFormatting
403
        .Replacement.ClearFormatting
404
        .Forward = True
405
        .Wrap = wdFindContinue
406
        .Format = False
407
        .MatchCase = False
408
        .MatchWholeWord = False
409
        .MatchAllWordForms = False
410
        .MatchSoundsLike = False
411
        .MatchWildcards = False
412
        .Text = "^p"
413
        .Replacement.Text = ChrW(9608)
414
        .Execute Replace:=wdReplaceAll
415
    End With
416
'
417
'
418
'  7. Restore underlined characters.
419
'
420
    Selection.Find.ClearFormatting
421
    Selection.Find.Replacement.ClearFormatting
422
    Selection.Find.Replacement.Font.Underline = wdUnderlineSingle
423
    With Selection.Find
424
        .Text = ChrW(9556) & "(?)" & ChrW(9559)
425
        .Replacement.Text = "\1"
426
        .Forward = True
427
        .Wrap = wdFindContinue
428
        .Format = True
429
        .MatchCase = False
430
        .MatchWholeWord = False
431
        .MatchAllWordForms = False
432
        .MatchSoundsLike = False
433
        .MatchWildcards = True
434
    End With
435
    Selection.Find.Execute Replace:=wdReplaceAll
436
'
437
'
438
'  8. Restore bold characters.
439
'
440
    Selection.Find.ClearFormatting
441
    Selection.Find.Replacement.ClearFormatting
442
    Selection.Find.Replacement.Font.Bold = True
443
    With Selection.Find
444
        .Text = ChrW(9568) & "(?)" & ChrW(9571)
445
        .Replacement.Text = "\1"
446
        .Forward = True
447
        .Wrap = wdFindContinue
448
        .Format = True
449
        .MatchCase = False
450
        .MatchWholeWord = False
451
        .MatchAllWordForms = False
452
        .MatchSoundsLike = False
453
        .MatchWildcards = True
454
    End With
455
    Selection.Find.Execute Replace:=wdReplaceAll
456
'
457
'
458
'  9. Restore italic characters.
459
'
460
    Selection.Find.ClearFormatting
461
    Selection.Find.Replacement.ClearFormatting
462
    Selection.Find.Replacement.Font.Italic = True
463
    With Selection.Find
464
        .Text = ChrW(9500) & "(?)" & ChrW(9508)
465
        .Replacement.Text = "\1"
466
        .Forward = True
467
        .Wrap = wdFindContinue
468
        .Format = True
469
        .MatchCase = False
470
        .MatchWholeWord = False
471
        .MatchAllWordForms = False
472
        .MatchSoundsLike = False
473
        .MatchWildcards = True
474
    End With
475
    Selection.Find.Execute Replace:=wdReplaceAll
476
'
477
'
478
'  10. Restore paragraph ending marks, line breaks and page breaks.
479
'
480
    With ActiveDocument.Content.Find
481
        .ClearFormatting
482
        .Replacement.ClearFormatting
483
        .Forward = True
484
        .Wrap = wdFindContinue
485
        .Format = False
486
        .MatchCase = False
487
        .MatchWholeWord = False
488
        .MatchAllWordForms = False
489
        .MatchSoundsLike = False
490
        .MatchWildcards = False
491
        .Text = ChrW(9608)
492
        .Replacement.Text = "^p"
493
        .Execute Replace:=wdReplaceAll
494
        .Text = ChrW(9668)
495
        .Replacement.Text = "^l"
496
        .Execute Replace:=wdReplaceAll
497
        .Text = ChrW(9618)
498
        .Replacement.Text = "^m"
499
        .Execute Replace:=wdReplaceAll
500
    End With
501
'
502
'
503
'  11. Save to the desktop as 'Formatted Text.rtf'. RTF chosen because it has the
504
'      highest compatibility with InDesign, and possibly other word processors.
505
'
506
    ActiveDocument.SaveAs2 FileName:="Formatted Text.rtf", FileFormat:= _
507
        wdFormatRTF, LockComments:=False, Password:="", AddToRecentFiles:=True, _
508
        WritePassword:="", ReadOnlyRecommended:=False, EmbedTrueTypeFonts:=False, _
509
         SaveNativePictureFormat:=False, SaveFormsData:=False, SaveAsAOCELetter:= _
510
        False, CompatibilityMode:=0
511
    ActiveDocument.Close
512
'
513
'
514
' 12. Clean-up, and a simple prompt message when it's done.
515
'
516
    Kill "Plain Text.txt"
517
    Documents.Open FileName:="Formatted Text.rtf"
518
    MsgBox ("Done!")
519
    
520
End Sub