corporal_hesitant133

Paperless-ngx → Open WebUI RAG

Sep 24th, 2025
808
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 6.73 KB | None | 0 0
  1. #!/bin/bash
  2. # Paperless-ngx → Open WebUI RAG (schonend, wenige parallele Uploads)
  3. # Unraid User Script – Version 1.0
  4.  
  5. set -euo pipefail
  6.  
  7. ########################################
  8. #            KONFIGURATION             #
  9. ########################################
  10. # Paperless-ngx
  11. export PNGX_BASE="PAPERLESS_SERVER:PORT"     # <- anpassen
  12. export PNGX_TOKEN="PAPERLESS_TOKEN"                   # <- anpassen
  13.  
  14. # Open WebUI
  15. export OWUI_BASE="OWUI_SERVER:PORT"    # <- anpassen
  16. export OWUI_TOKEN="OWUI_TOKEN"             # <- anpassen
  17. export KB_ID="KNOWLEDGE_ID"  # <- deine Knowledge-ID (grab from URL)
  18.  
  19. # Export-Ziel auf Unraid
  20. OUT_DIR="/path/to/folder"      # <- anpassen, existiert schon bei dir
  21. MD_DIR="${OUT_DIR}/md"
  22.  
  23. # Schonende Parallelität beim Upload
  24. PAR_J="${PAR_J:-4}"                            # 4 gleichzeitige Uploads (schwach → 2–4, kräftig → 8–16)
  25.  
  26. # Export-Paginierung
  27. PAGE_SIZE="${PAGE_SIZE:-100}"                  # 100 pro Seite
  28. LIMIT_DOCS="${LIMIT_DOCS:-0}"                  # 0 = kein Limit (für Tests z. B. 500)
  29.  
  30. ########################################
  31. #         HILFS-SKRIPTE SCHREIBEN      #
  32. ########################################
  33.  
  34. mkdir -p "$OUT_DIR"
  35.  
  36. # Exporter: baut ID→Name-Maps & exportiert seitenweise, schreibt nur bei Änderung
  37. cat > "${OUT_DIR}/export_batch.sh" <<'SH'
  38. #!/bin/sh
  39. set -eu
  40. apk add --no-cache curl jq >/dev/null
  41.  
  42. PAGE_SIZE="${PAGE_SIZE:-100}"
  43. LIMIT_DOCS="${LIMIT_DOCS:-0}"
  44.  
  45. mkdir -p /work/md
  46. cd /work
  47.  
  48. log(){ printf '%s\n' "$*" >&2; }
  49.  
  50. # Maps (id=name) erstellen
  51. corr_map="$(
  52.   curl -s -H "Authorization: Token $PNGX_TOKEN" \
  53.     "$PNGX_BASE/api/correspondents/?page_size=1000" \
  54.   | jq -r '.results[] | "\(.id)=\(.name)"' || true
  55. )"
  56. tag_map="$(
  57.   curl -s -H "Authorization: Token $PNGX_TOKEN" \
  58.     "$PNGX_BASE/api/tags/?page_size=2000" \
  59.   | jq -r '.results[] | "\(.id)=\(.name)"' || true
  60. )"
  61.  
  62. corr_name() { awk -F= -v id="$1" '$1==id{print $2; f=1} END{if(!f)print id}' <<EOF
  63. $corr_map
  64. EOF
  65. }
  66. tag_name()  { awk -F= -v id="$1" '$1==id{print $2; f=1} END{if(!f)print id}' <<EOF
  67. $tag_map
  68. EOF
  69. }
  70.  
  71. count=0
  72. next="$PNGX_BASE/api/documents/?page_size=$PAGE_SIZE&fields=id,title,created,modified,archive_serial_number,correspondent,tags,content"
  73.  
  74. while [ -n "$next" ] && [ "$next" != "null" ]; do
  75.   page="$(curl -s -H "Authorization: Token $PNGX_TOKEN" "$next")"
  76.   next="$(echo "$page" | jq -r '.next')"
  77.  
  78.   echo "$page" | jq -c '.results[]' | while read -r doc; do
  79.     id="$(echo "$doc" | jq -r '.id')"
  80.     title="$(echo "$doc" | jq -r '.title')"
  81.     created="$(echo "$doc" | jq -r '.created')"
  82.     asn="$(echo "$doc" | jq -r '.archive_serial_number // empty')"
  83.     corr_raw="$(echo "$doc" | jq -r '.correspondent // empty')"
  84.     tags_ids="$(echo "$doc" | jq -r '.tags[]?')"
  85.     content="$(echo "$doc" | jq -r '.content')"
  86.  
  87.     # Namen auflösen
  88.     corr=""
  89.     if [ -n "$corr_raw" ] && [ "$corr_raw" != "null" ]; then
  90.       case "$corr_raw" in
  91.         *[!0-9]*) corr="$(echo "$doc" | jq -r '.correspondent.name // empty')" ;;
  92.         *)         corr="$(corr_name "$corr_raw")" ;;
  93.       esac
  94.     fi
  95.  
  96.     tags=""
  97.     for tid in $tags_ids; do
  98.       tname="$(tag_name "$tid")"
  99.       tags="${tags:+$tags, }$tname"
  100.     done
  101.  
  102.     base="${asn:-$id}"
  103.     slug="$(printf "%s" "$title" | tr '[:upper:]' '[:lower:]' \
  104.       | sed 's/[^a-z0-9._-]/-/g; s/-\{2,\}/-/g; s/^-//; s/-$//')"
  105.     file="md/$(printf "%06d" "$id")-${base}-${slug:-document}.md"
  106.  
  107.     newhash="$(printf "%s" "$content" | sha256sum | awk '{print $1}')"
  108.     oldhash="$(grep -m1 '^content_sha256:' "$file" 2>/dev/null | awk '{print $2}')"
  109.  
  110.     if [ -n "${oldhash:-}" ] && [ "$newhash" = "$oldhash" ]; then
  111.       log "unchanged: $file"
  112.     else
  113.       tmp="$file.tmp"
  114.       {
  115.         printf -- '---\n'
  116.         printf 'source: paperless-ngx\n'
  117.         printf 'asn: "%s"\n' "$asn"
  118.         printf 'id: %s\n' "$id"
  119.         printf 'title: "%s"\n' "$title"
  120.         printf 'created: "%s"\n' "$created"
  121.         printf 'correspondent: "%s"\n' "$corr"
  122.         printf 'tags: "%s"\n' "$tags"
  123.         printf 'content_sha256: %s\n' "$newhash"
  124.         printf 'paperless_url: "%s/documents/%s/"\n' "$PNGX_BASE" "$id"
  125.         printf -- '---\n\n# %s\n\n' "$title"
  126.         printf '%s\n' "$content"
  127.       } > "$tmp"
  128.       mv -f "$tmp" "$file"
  129.       log "wrote: $file"
  130.     fi
  131.  
  132.     count=$((count+1))
  133.     if [ "$LIMIT_DOCS" -gt 0 ] && [ "$count" -ge "$LIMIT_DOCS" ]; then
  134.       log "Reached LIMIT_DOCS=$LIMIT_DOCS, stopping."
  135.       exit 0
  136.     fi
  137.   done
  138. done
  139.  
  140. log "Done. Processed $count documents."
  141. SH
  142.  
  143. # Parallel-Uploader (schonend, mit Hash-Markern gegen Doppel-Uploads)
  144. cat > "${OUT_DIR}/owui_upload_parallel.sh" <<'SH'
  145. #!/bin/sh
  146. set -eu
  147. apk add --no-cache curl jq parallel >/dev/null
  148.  
  149. DOC_DIR="${DOC_DIR:-/work/md}"
  150. MARK_DIR="/work/.uploaded"
  151. mkdir -p "$MARK_DIR"
  152.  
  153. export OWUI_BASE OWUI_TOKEN KB_ID DOC_DIR MARK_DIR
  154.  
  155. # Schonende Parallelität
  156. PAR_J="${PAR_J:-4}"
  157.  
  158. # Nur neue/geänderte Dateien (per content_sha256) hochladen + anhängen
  159. find "$DOC_DIR" -type f -name "*.md" -print0 \
  160. | parallel -0 -j "$PAR_J" --line-buffer '
  161.     f={};
  162.     h=$(grep -m1 ^content_sha256: "$f" | awk "{print \$2}");
  163.     [ -n "$h" ] || { echo "skip (no hash): $f" >&2; exit 0; }
  164.     [ -e "$MARK_DIR/$h" ] && { echo "already uploaded: $(basename "$f")"; exit 0; }
  165.  
  166.     up=$(curl -s -H "Authorization: Bearer $OWUI_TOKEN" -H "Accept: application/json" \
  167.       -F file=@"$f" "$OWUI_BASE/api/v1/files/");
  168.     id=$(echo "$up" | jq -r .id);
  169.     [ -n "$id" ] || { echo "upload failed: $f -> $up" >&2; exit 0; }
  170.  
  171.     curl -s -H "Authorization: Bearer $OWUI_TOKEN" -H "Content-Type: application/json" -H "Accept: application/json" \
  172.       -X POST "$OWUI_BASE/api/v1/knowledge/$KB_ID/file/add" \
  173.       -d "{\"file_id\":\"$id\"}" >/dev/null
  174.  
  175.     : > "$MARK_DIR/$h"
  176.     echo "attached: $(basename "$f")"
  177. '
  178. SH
  179.  
  180. # Zeilenenden & Rechte
  181. sed -i 's/\r$//' "${OUT_DIR}/export_batch.sh" "${OUT_DIR}/owui_upload_parallel.sh"
  182. chmod +x "${OUT_DIR}/export_batch.sh" "${OUT_DIR}/owui_upload_parallel.sh"
  183.  
  184. ########################################
  185. #           EXPORT AUSFÜHREN           #
  186. ########################################
  187. echo "==> Export starte (PAGE_SIZE=${PAGE_SIZE}, LIMIT_DOCS=${LIMIT_DOCS})"
  188. docker run --rm --network host \
  189.   -e PNGX_BASE -e PNGX_TOKEN \
  190.   -e PAGE_SIZE -e LIMIT_DOCS \
  191.   -v "${OUT_DIR}":/work \
  192.   alpine:3 sh /work/export_batch.sh
  193.  
  194. ########################################
  195. #       PARALLEL-UPLOAD AUSFÜHREN      #
  196. ########################################
  197. echo "==> Upload starte (parallel=${PAR_J})"
  198. docker run --rm --network host \
  199.   -e OWUI_BASE -e OWUI_TOKEN -e KB_ID \
  200.   -e DOC_DIR="/work/md" -e PAR_J="${PAR_J}" \
  201.   -v "${OUT_DIR}":/work \
  202.   alpine:3 sh /work/owui_upload_parallel.sh
  203.  
  204. echo "==> Fertig."
  205.  
Advertisement
Add Comment
Please, Sign In to add comment