SHOW:
|
|
- or go back to the newest paste.
1 | Sub LayoutPrep() | |
2 | ' | |
3 | ' Macro written by DSpider, around mid 2012. | |
4 | ' | |
5 | ' What this does, is it breaks down a Word document into a squeaky clean document, | |
6 | ' with its basic formatting intact, ready for re-applying styles or quick styles | |
7 | ' in Word, or in Adobe InDesign. | |
8 | ' | |
9 | ' - Bold text | |
10 | ' - Italic text | |
11 | ' - Underlined text | |
12 | ' - Subscripts | |
13 | ' - Superscripts | |
14 | ' - Paragraph ending marks | |
15 | ' - Line Breaks | |
16 | ' - Page Breaks | |
17 | ' | |
18 | ' | |
19 | ' Set the desktop variable and the working directory for everything else. | |
20 | ' | |
21 | Dim WshShell As Object | |
22 | Dim SpecialPath As String | |
23 | Set WshShell = CreateObject("WScript.Shell") | |
24 | DESKTOP = WshShell.SpecialFolders("Desktop") | |
25 | ChangeFileOpenDirectory DESKTOP | |
26 | ' | |
27 | ' 1. Convert numbered and bulleted lists to regular text. Honestly, they're more | |
28 | ' of a nuisance with scanned text and OCR. Without this they would not make it | |
29 | ' to the second part because Word treats them (along with footnotes) more like | |
30 | ' objects. | |
31 | ' | |
32 | ActiveDocument.ConvertNumbersToText | |
33 | ' | |
34 | ' | |
35 | ' 2. Convert footnotes to regular text so that they too make it intact. | |
36 | ' | |
37 | Dim afootnote As Footnote | |
38 | Dim NumberOfFootnotes As Integer | |
39 | Dim i As Integer | |
40 | Dim aFootnoteReference As String | |
41 | Dim aFootnoteRefTag As String | |
42 | ||
43 | NumberOfFootnotes = ActiveDocument.Footnotes.Count | |
44 | For i = NumberOfFootnotes To 1 Step -1 | |
45 | Set afootnote = ActiveDocument.Footnotes(i) | |
46 | afootnote.Range.Select | |
47 | Selection.MoveStartWhile Cset:=" " & Chr(9) | |
48 | Selection.Cut | |
49 | aFootnoteReference = afootnote.Reference.Text | |
50 | Select Case aFootnoteReference | |
51 | Case Chr(2) | |
52 | aFootnoteRefTag = "num" | |
53 | Case "*" | |
54 | aFootnoteRefTag = "star" | |
55 | Case Else | |
56 | aFootnoteRefTag = "symbol" _ | |
57 | & aFootnoteReference & "/FNRef" | |
58 | End Select | |
59 | afootnote.Reference.Select | |
60 | If afootnote.Reference.Text = Chr(40) Then | |
61 | With Dialogs(wdDialogInsertSymbol) | |
62 | aFootnoteRefTag = _ | |
63 | "FNSym," & .Font & "," _ | |
64 | & .CharNum & "" | |
65 | End With | |
66 | End If | |
67 | afootnote.Delete | |
68 | Selection.InsertBefore ChrW(9616) _ | |
69 | & aFootnoteRefTag | |
70 | Selection.Collapse (wdCollapseEnd) | |
71 | Selection.Paste | |
72 | Selection.InsertAfter ChrW(9612) | |
73 | Next i | |
74 | ' | |
75 | ' | |
76 | ' 3. Remove all the tab characters by replacing them with the space character. | |
77 | ' This is because FineReader sometimes adds multiple tabs when you only need | |
78 | ' one. Easier to spot too when you're assigning styles. | |
79 | ' | |
80 | Selection.Find.ClearFormatting | |
81 | Selection.Find.Replacement.ClearFormatting | |
82 | With Selection.Find | |
83 | .Text = "^t" | |
84 | .Replacement.Text = " " | |
85 | .Forward = True | |
86 | .Wrap = wdFindContinue | |
87 | .Format = False | |
88 | .MatchCase = False | |
89 | .MatchWholeWord = False | |
90 | .MatchWildcards = False | |
91 | .MatchSoundsLike = False | |
92 | .MatchAllWordForms = False | |
93 | End With | |
94 | Selection.Find.Execute Replace:=wdReplaceAll | |
95 | ' | |
96 | ' | |
97 | ' 4. Paragraph-level formatting: Delete the ruler tabs. | |
98 | ' | |
99 | Selection.WholeStory | |
100 | With Selection.ParagraphFormat | |
101 | .SpaceBeforeAuto = False | |
102 | .SpaceAfterAuto = False | |
103 | End With | |
104 | Selection.ParagraphFormat.TabStops.ClearAll | |
105 | ActiveDocument.DefaultTabStop = InchesToPoints(0.5) | |
106 | ' | |
107 | ' | |
108 | ' 5. Paragraph-level formatting: Align everything to the left. | |
109 | ' | |
110 | With Selection.ParagraphFormat | |
111 | .LeftIndent = InchesToPoints(0) | |
112 | .RightIndent = InchesToPoints(0) | |
113 | .SpaceBefore = 0 | |
114 | .SpaceBeforeAuto = False | |
115 | .SpaceAfter = 0 | |
116 | .SpaceAfterAuto = False | |
117 | .LineSpacingRule = wdLineSpaceSingle | |
118 | .Alignment = wdAlignParagraphLeft | |
119 | .FirstLineIndent = InchesToPoints(0) | |
120 | .OutlineLevel = wdOutlineLevelBodyText | |
121 | .CharacterUnitLeftIndent = 0 | |
122 | .CharacterUnitRightIndent = 0 | |
123 | .CharacterUnitFirstLineIndent = 0 | |
124 | .LineUnitBefore = 0 | |
125 | .LineUnitAfter = 0 | |
126 | End With | |
127 | ' | |
128 | ' | |
129 | ' 6. Replace the line break character (but keep the paragraph ending marks or | |
130 | ' else the process would get too slow), then replace the Page Break and | |
131 | ' Section Break. | |
132 | ' | |
133 | With ActiveDocument.Content.Find | |
134 | .ClearFormatting | |
135 | .Replacement.ClearFormatting | |
136 | .Forward = True | |
137 | .Wrap = wdFindContinue | |
138 | .Format = False | |
139 | .MatchCase = False | |
140 | .MatchWholeWord = False | |
141 | .MatchAllWordForms = False | |
142 | .MatchSoundsLike = False | |
143 | .MatchWildcards = False | |
144 | .Text = "^l" | |
145 | .Replacement.Text = ChrW(9668) | |
146 | .Execute Replace:=wdReplaceAll | |
147 | .Text = "^m" | |
148 | .Replacement.Text = ChrW(9618) | |
149 | .Execute Replace:=wdReplaceAll | |
150 | .Text = "^b" | |
151 | .Replacement.Text = ChrW(9618) | |
152 | .Execute Replace:=wdReplaceAll | |
153 | End With | |
154 | ' | |
155 | ' | |
156 | ' 7. Italic characters. | |
157 | ' | |
158 | Selection.Find.ClearFormatting | |
159 | Selection.Find.Font.Italic = True | |
160 | Selection.Find.Replacement.ClearFormatting | |
161 | With Selection.Find | |
162 | .Text = "(?)" | |
163 | .Replacement.Text = ChrW(9500) & "\1" & ChrW(9508) | |
164 | .Forward = True | |
165 | .Wrap = wdFindContinue | |
166 | .Format = True | |
167 | .MatchCase = False | |
168 | .MatchWholeWord = False | |
169 | .MatchAllWordForms = False | |
170 | .MatchSoundsLike = False | |
171 | .MatchWildcards = True | |
172 | End With | |
173 | Selection.Find.Execute Replace:=wdReplaceAll | |
174 | ' | |
175 | ' | |
176 | ' 8. Bold characters. | |
177 | ' | |
178 | Selection.Find.ClearFormatting | |
179 | Selection.Find.Font.Bold = True | |
180 | Selection.Find.Replacement.ClearFormatting | |
181 | With Selection.Find | |
182 | .Text = "(?)" | |
183 | .Replacement.Text = ChrW(9568) & "\1" & ChrW(9571) | |
184 | .Forward = True | |
185 | .Wrap = wdFindContinue | |
186 | .Format = True | |
187 | .MatchCase = False | |
188 | .MatchWholeWord = False | |
189 | .MatchAllWordForms = False | |
190 | .MatchSoundsLike = False | |
191 | .MatchWildcards = True | |
192 | End With | |
193 | Selection.Find.Execute Replace:=wdReplaceAll | |
194 | ' | |
195 | ' | |
196 | ' 9. Underlined characters. | |
197 | ' | |
198 | Selection.Find.ClearFormatting | |
199 | Selection.Find.Font.Underline = wdUnderlineSingle | |
200 | Selection.Find.Replacement.ClearFormatting | |
201 | With Selection.Find | |
202 | .Text = "(?)" | |
203 | .Replacement.Text = ChrW(9556) & "\1" & ChrW(9559) | |
204 | .Forward = True | |
205 | .Wrap = wdFindContinue | |
206 | .Format = True | |
207 | .MatchCase = False | |
208 | .MatchWholeWord = False | |
209 | .MatchAllWordForms = False | |
210 | .MatchSoundsLike = False | |
211 | .MatchWildcards = True | |
212 | End With | |
213 | Selection.Find.Execute Replace:=wdReplaceAll | |
214 | ' | |
215 | ' | |
216 | ' 10. Superscripts. | |
217 | ' | |
218 | Selection.Find.ClearFormatting | |
219 | With Selection.Find.Font | |
220 | .Superscript = True | |
221 | .Subscript = False | |
222 | End With | |
223 | Selection.Find.Replacement.ClearFormatting | |
224 | With Selection.Find | |
225 | .Text = "(?)" | |
226 | .Replacement.Text = ChrW(9560) & "\1" & ChrW(9563) | |
227 | .Forward = True | |
228 | .Wrap = wdFindContinue | |
229 | .Format = True | |
230 | .MatchCase = False | |
231 | .MatchWholeWord = False | |
232 | .MatchAllWordForms = False | |
233 | .MatchSoundsLike = False | |
234 | .MatchWildcards = True | |
235 | End With | |
236 | Selection.Find.Execute Replace:=wdReplaceAll | |
237 | ' | |
238 | ' | |
239 | ' 11. Subscripts. | |
240 | ' | |
241 | Selection.Find.ClearFormatting | |
242 | With Selection.Find.Font | |
243 | .Superscript = False | |
244 | .Subscript = True | |
245 | End With | |
246 | Selection.Find.Replacement.ClearFormatting | |
247 | With Selection.Find | |
248 | .Text = "(?)" | |
249 | .Replacement.Text = ChrW(9554) & "\1" & ChrW(9557) | |
250 | .Forward = True | |
251 | .Wrap = wdFindContinue | |
252 | .Format = True | |
253 | .MatchCase = False | |
254 | .MatchWholeWord = False | |
255 | .MatchAllWordForms = False | |
256 | .MatchSoundsLike = False | |
257 | .MatchWildcards = True | |
258 | End With | |
259 | Selection.Find.Execute Replace:=wdReplaceAll | |
260 | ' | |
261 | ' | |
262 | ' 12. Save to the desktop as 'Plain Text.txt' and close the file. | |
263 | ' | |
264 | ActiveDocument.SaveAs2 FileName:="Plain Text.txt", FileFormat:= _ | |
265 | wdFormatText, LockComments:=False, Password:="", AddToRecentFiles:=True, _ | |
266 | WritePassword:="", ReadOnlyRecommended:=False, EmbedTrueTypeFonts:=False, _ | |
267 | SaveNativePictureFormat:=False, SaveFormsData:=False, SaveAsAOCELetter:= _ | |
268 | False, Encoding:=1200, InsertLineBreaks:=False, AllowSubstitutions:=False _ | |
269 | , LineEnding:=wdCRLF, CompatibilityMode:=0 | |
270 | ActiveDocument.Close | |
271 | ' | |
272 | ' | |
273 | ' -------------------------- | |
274 | ' PART II | |
275 | ' -------------------------- | |
276 | ' | |
277 | ' Open the broken down text, to be able to restore the formatting. | |
278 | ' | |
279 | Documents.Open FileName:="Plain Text.txt", ConfirmConversions:=False, _ | |
280 | ReadOnly:=False, AddToRecentFiles:=False, PasswordDocument:="", _ | |
281 | PasswordTemplate:="", Revert:=False, WritePasswordDocument:="", _ | |
282 | WritePasswordTemplate:="", Format:=wdOpenFormatAuto, XMLTransform:="", _ | |
283 | Encoding:=1200 | |
284 | ' | |
285 | ' | |
286 | ' 1. Delete the useless "Plain Text" style. There's no such thing as plain text | |
287 | ' in Word. Everything has some kind of formatting applied to it. | |
288 | ' | |
289 | ActiveDocument.Styles("Plain Text").Delete | |
290 | ' | |
291 | ' | |
292 | - | ' 2. Remove all quick styles for the document (default Word templates are intact). |
292 | + | ' 2. Remove all quick styles from the document; default Word templates are |
293 | ' unaffected. | |
294 | ' | |
295 | Dim s As Style | |
296 | For Each s In ActiveDocument.Styles | |
297 | If s.Type = wdStyleTypeCharacter Or _ | |
298 | s.Type = wdStyleTypeParagraph Or _ | |
299 | s.Type = wdStyleTypeLinked Then | |
300 | s.QuickStyle = False | |
301 | End If | |
302 | Next s | |
303 | ' | |
304 | ' | |
305 | ' 3. Create a "Text" style and apply it to the document because "Normal" is too | |
306 | ' mainstream. For various adjustments you can add it to the Quick Styles menu. | |
307 | ' | |
308 | ActiveDocument.Styles.Add Name:="Text", Type:=wdStyleTypeParagraph | |
309 | ActiveDocument.Content.Style = ActiveDocument.Styles("Text") | |
310 | With ActiveDocument.Styles("Text").Font | |
311 | .Name = "Times New Roman" | |
312 | .Size = 10 | |
313 | .Bold = False | |
314 | .Italic = False | |
315 | .Underline = wdUnderlineNone | |
316 | .UnderlineColor = wdColorAutomatic | |
317 | .StrikeThrough = False | |
318 | .DoubleStrikeThrough = False | |
319 | .Outline = False | |
320 | .Emboss = False | |
321 | .Shadow = False | |
322 | .Hidden = False | |
323 | .SmallCaps = False | |
324 | .AllCaps = False | |
325 | .Color = wdColorAutomatic | |
326 | .Engrave = False | |
327 | .Superscript = False | |
328 | .Subscript = False | |
329 | .Scaling = 100 | |
330 | .Kerning = 0 | |
331 | .Animation = wdAnimationNone | |
332 | .Ligatures = wdLigaturesNone | |
333 | .NumberSpacing = wdNumberSpacingDefault | |
334 | .NumberForm = wdNumberFormDefault | |
335 | .StylisticSet = wdStylisticSetDefault | |
336 | .ContextualAlternates = 0 | |
337 | End With | |
338 | With ActiveDocument.Styles("Text") | |
339 | .AutomaticallyUpdate = False | |
340 | .BaseStyle = "" | |
341 | .NextParagraphStyle = "Text" | |
342 | End With | |
343 | ' | |
344 | ' | |
345 | ' 4. Restore subscripts. | |
346 | ' | |
347 | Selection.Find.ClearFormatting | |
348 | Selection.Find.Replacement.ClearFormatting | |
349 | With Selection.Find.Replacement.Font | |
350 | .Superscript = False | |
351 | .Subscript = True | |
352 | End With | |
353 | With Selection.Find | |
354 | .Text = ChrW(9554) & "(?)" & ChrW(9557) | |
355 | .Replacement.Text = "\1" | |
356 | .Forward = True | |
357 | .Wrap = wdFindContinue | |
358 | .Format = True | |
359 | .MatchCase = False | |
360 | .MatchWholeWord = False | |
361 | .MatchAllWordForms = False | |
362 | .MatchSoundsLike = False | |
363 | .MatchWildcards = True | |
364 | End With | |
365 | Selection.Find.Execute Replace:=wdReplaceAll | |
366 | ' | |
367 | ' | |
368 | ' 5. Restore superscripts. | |
369 | ' | |
370 | Selection.Find.ClearFormatting | |
371 | Selection.Find.Replacement.ClearFormatting | |
372 | With Selection.Find.Replacement.Font | |
373 | .Superscript = True | |
374 | .Subscript = False | |
375 | End With | |
376 | With Selection.Find | |
377 | .Text = ChrW(9560) & "(?)" & ChrW(9563) | |
378 | .Replacement.Text = "\1" | |
379 | .Forward = True | |
380 | .Wrap = wdFindContinue | |
381 | .Format = True | |
382 | .MatchCase = False | |
383 | .MatchWholeWord = False | |
384 | .MatchAllWordForms = False | |
385 | .MatchSoundsLike = False | |
386 | .MatchWildcards = True | |
387 | End With | |
388 | Selection.Find.Execute Replace:=wdReplaceAll | |
389 | ' | |
390 | ' | |
391 | ' 6. Replace paragraph endings temporarily so that they too can receive bold and | |
392 | ' italic attributes. It's a little difficult to explain but just know that it's | |
393 | ' needed... For example if you have a block of text with italic (or bold) | |
394 | ' attributes, the paragraph ending marks (¶) will not receive the attribute and | |
395 | ' every line will have separate tags instead of treating it like a whole. | |
396 | ' | |
397 | ' This is the reason the "decoding" process takes such a long time. Because you | |
398 | ' basically search and replace throughout a SINGLE 1+ MB paragraph (depending | |
399 | ' on the complexity of the book. | |
400 | ' | |
401 | With ActiveDocument.Content.Find | |
402 | .ClearFormatting | |
403 | .Replacement.ClearFormatting | |
404 | .Forward = True | |
405 | .Wrap = wdFindContinue | |
406 | .Format = False | |
407 | .MatchCase = False | |
408 | .MatchWholeWord = False | |
409 | .MatchAllWordForms = False | |
410 | .MatchSoundsLike = False | |
411 | .MatchWildcards = False | |
412 | .Text = "^p" | |
413 | .Replacement.Text = ChrW(9608) | |
414 | .Execute Replace:=wdReplaceAll | |
415 | End With | |
416 | ' | |
417 | ' | |
418 | ' 7. Restore underlined characters. | |
419 | ' | |
420 | Selection.Find.ClearFormatting | |
421 | Selection.Find.Replacement.ClearFormatting | |
422 | Selection.Find.Replacement.Font.Underline = wdUnderlineSingle | |
423 | With Selection.Find | |
424 | .Text = ChrW(9556) & "(?)" & ChrW(9559) | |
425 | .Replacement.Text = "\1" | |
426 | .Forward = True | |
427 | .Wrap = wdFindContinue | |
428 | .Format = True | |
429 | .MatchCase = False | |
430 | .MatchWholeWord = False | |
431 | .MatchAllWordForms = False | |
432 | .MatchSoundsLike = False | |
433 | .MatchWildcards = True | |
434 | End With | |
435 | Selection.Find.Execute Replace:=wdReplaceAll | |
436 | ' | |
437 | ' | |
438 | ' 8. Restore bold characters. | |
439 | ' | |
440 | Selection.Find.ClearFormatting | |
441 | Selection.Find.Replacement.ClearFormatting | |
442 | Selection.Find.Replacement.Font.Bold = True | |
443 | With Selection.Find | |
444 | .Text = ChrW(9568) & "(?)" & ChrW(9571) | |
445 | .Replacement.Text = "\1" | |
446 | .Forward = True | |
447 | .Wrap = wdFindContinue | |
448 | .Format = True | |
449 | .MatchCase = False | |
450 | .MatchWholeWord = False | |
451 | .MatchAllWordForms = False | |
452 | .MatchSoundsLike = False | |
453 | .MatchWildcards = True | |
454 | End With | |
455 | Selection.Find.Execute Replace:=wdReplaceAll | |
456 | ' | |
457 | ' | |
458 | ' 9. Restore italic characters. | |
459 | ' | |
460 | Selection.Find.ClearFormatting | |
461 | Selection.Find.Replacement.ClearFormatting | |
462 | Selection.Find.Replacement.Font.Italic = True | |
463 | With Selection.Find | |
464 | .Text = ChrW(9500) & "(?)" & ChrW(9508) | |
465 | .Replacement.Text = "\1" | |
466 | .Forward = True | |
467 | .Wrap = wdFindContinue | |
468 | .Format = True | |
469 | .MatchCase = False | |
470 | .MatchWholeWord = False | |
471 | .MatchAllWordForms = False | |
472 | .MatchSoundsLike = False | |
473 | .MatchWildcards = True | |
474 | End With | |
475 | Selection.Find.Execute Replace:=wdReplaceAll | |
476 | ' | |
477 | ' | |
478 | ' 10. Restore paragraph ending marks, line breaks and page breaks. | |
479 | ' | |
480 | With ActiveDocument.Content.Find | |
481 | .ClearFormatting | |
482 | .Replacement.ClearFormatting | |
483 | .Forward = True | |
484 | .Wrap = wdFindContinue | |
485 | .Format = False | |
486 | .MatchCase = False | |
487 | .MatchWholeWord = False | |
488 | .MatchAllWordForms = False | |
489 | .MatchSoundsLike = False | |
490 | .MatchWildcards = False | |
491 | .Text = ChrW(9608) | |
492 | .Replacement.Text = "^p" | |
493 | .Execute Replace:=wdReplaceAll | |
494 | .Text = ChrW(9668) | |
495 | .Replacement.Text = "^l" | |
496 | .Execute Replace:=wdReplaceAll | |
497 | .Text = ChrW(9618) | |
498 | .Replacement.Text = "^m" | |
499 | .Execute Replace:=wdReplaceAll | |
500 | End With | |
501 | ' | |
502 | ' | |
503 | ' 11. Save to the desktop as 'Formatted Text.rtf'. RTF chosen because it has the | |
504 | ' highest compatibility with InDesign, and possibly other word processors. | |
505 | ' | |
506 | ActiveDocument.SaveAs2 FileName:="Formatted Text.rtf", FileFormat:= _ | |
507 | wdFormatRTF, LockComments:=False, Password:="", AddToRecentFiles:=True, _ | |
508 | WritePassword:="", ReadOnlyRecommended:=False, EmbedTrueTypeFonts:=False, _ | |
509 | SaveNativePictureFormat:=False, SaveFormsData:=False, SaveAsAOCELetter:= _ | |
510 | False, CompatibilityMode:=0 | |
511 | ActiveDocument.Close | |
512 | ' | |
513 | ' | |
514 | ' 12. Clean-up, and a simple prompt message when it's done. | |
515 | ' | |
516 | Kill "Plain Text.txt" | |
517 | Documents.Open FileName:="Formatted Text.rtf" | |
518 | MsgBox ("Done!") | |
519 | ||
520 | End Sub |