# (Pastebin page chrome removed — the following is the actual script.)
#!/bin/bash
# Paperless-ngx -> Open WebUI RAG sync (gentle: only a few parallel uploads).
# Unraid user script - version 1.0
set -euo pipefail

########################################
#            CONFIGURATION             #
########################################

# Paperless-ngx connection (adjust both values).
export PNGX_BASE="PAPERLESS_SERVER:PORT"
export PNGX_TOKEN="PAPERLESS_TOKEN"

# Open WebUI connection (adjust all three values).
export OWUI_BASE="OWUI_SERVER:PORT"
export OWUI_TOKEN="OWUI_TOKEN"
export KB_ID="KNOWLEDGE_ID"            # your knowledge base ID (grab it from the URL)

# Export target on Unraid (adjust; directory is created below if missing).
OUT_DIR="/path/to/folder"
MD_DIR="${OUT_DIR}/md"

# Gentle upload parallelism: 2-4 for weak boxes, 8-16 for strong ones.
PAR_J="${PAR_J:-4}"

# Export pagination: documents per page, and an optional total cap
# (0 = no limit; e.g. 500 for test runs).
PAGE_SIZE="${PAGE_SIZE:-100}"
LIMIT_DOCS="${LIMIT_DOCS:-0}"

########################################
#        WRITE HELPER SCRIPTS          #
########################################

mkdir -p "$OUT_DIR"
# Exporter: builds id->name maps and exports documents page by page,
# rewriting a file only when its content hash changed.
cat > "${OUT_DIR}/export_batch.sh" <<'SH'
#!/bin/sh
# Export all Paperless-ngx documents as Markdown files with YAML front matter.
# Runs inside an Alpine container; expects PNGX_BASE and PNGX_TOKEN in env.
set -eu
apk add --no-cache curl jq >/dev/null

PAGE_SIZE="${PAGE_SIZE:-100}"
LIMIT_DOCS="${LIMIT_DOCS:-0}"   # 0 = no limit

mkdir -p /work/md
cd /work

log(){ printf '%s\n' "$*" >&2; }

# Build id=name lookup maps once up front (best effort: on API failure the
# maps stay empty and raw IDs are printed instead of names).
corr_map="$(
  curl -s -H "Authorization: Token $PNGX_TOKEN" \
    "$PNGX_BASE/api/correspondents/?page_size=1000" \
  | jq -r '.results[] | "\(.id)=\(.name)"' || true
)"
tag_map="$(
  curl -s -H "Authorization: Token $PNGX_TOKEN" \
    "$PNGX_BASE/api/tags/?page_size=2000" \
  | jq -r '.results[] | "\(.id)=\(.name)"' || true
)"

# Resolve a correspondent/tag ID to its name; fall back to the ID itself.
corr_name() { awk -F= -v id="$1" '$1==id{print $2; f=1} END{if(!f)print id}' <<EOF
$corr_map
EOF
}
tag_name() { awk -F= -v id="$1" '$1==id{print $2; f=1} END{if(!f)print id}' <<EOF
$tag_map
EOF
}

count=0
next="$PNGX_BASE/api/documents/?page_size=$PAGE_SIZE&fields=id,title,created,modified,archive_serial_number,correspondent,tags,content"

while [ -n "$next" ] && [ "$next" != "null" ]; do
  page="$(curl -s -H "Authorization: Token $PNGX_TOKEN" "$next")"
  next="$(echo "$page" | jq -r '.next')"

  # BUGFIX: iterate via a temp file instead of `jq | while read`. In the
  # original, the while loop ran in a pipeline subshell, so `count` was lost
  # after every page (final log always said 0) and the LIMIT_DOCS `exit 0`
  # only left the subshell instead of stopping the script.
  echo "$page" | jq -c '.results[]' > /tmp/page.jsonl
  while IFS= read -r doc; do
    id="$(echo "$doc" | jq -r '.id')"
    title="$(echo "$doc" | jq -r '.title')"
    created="$(echo "$doc" | jq -r '.created')"
    asn="$(echo "$doc" | jq -r '.archive_serial_number // empty')"
    corr_raw="$(echo "$doc" | jq -r '.correspondent // empty')"
    tags_ids="$(echo "$doc" | jq -r '.tags[]?')"
    content="$(echo "$doc" | jq -r '.content')"

    # Resolve correspondent: the API may return a numeric ID (map lookup)
    # or an embedded object (read .name directly).
    corr=""
    if [ -n "$corr_raw" ] && [ "$corr_raw" != "null" ]; then
      case "$corr_raw" in
        *[!0-9]*) corr="$(echo "$doc" | jq -r '.correspondent.name // empty')" ;;
        *) corr="$(corr_name "$corr_raw")" ;;
      esac
    fi

    # Resolve tag IDs into a comma-separated name list.
    tags=""
    for tid in $tags_ids; do
      tname="$(tag_name "$tid")"
      tags="${tags:+$tags, }$tname"
    done

    # File name: zero-padded id, ASN (or id again if none), slugified title.
    base="${asn:-$id}"
    slug="$(printf "%s" "$title" | tr '[:upper:]' '[:lower:]' \
      | sed 's/[^a-z0-9._-]/-/g; s/-\{2,\}/-/g; s/^-//; s/-$//')"
    file="md/$(printf "%06d" "$id")-${base}-${slug:-document}.md"

    # Only rewrite when the content hash differs from the stored one.
    newhash="$(printf "%s" "$content" | sha256sum | awk '{print $1}')"
    oldhash="$(grep -m1 '^content_sha256:' "$file" 2>/dev/null | awk '{print $2}')"
    if [ -n "${oldhash:-}" ] && [ "$newhash" = "$oldhash" ]; then
      log "unchanged: $file"
    else
      # Write atomically: tmp file first, then rename.
      tmp="$file.tmp"
      {
        printf -- '---\n'
        printf 'source: paperless-ngx\n'
        printf 'asn: "%s"\n' "$asn"
        printf 'id: %s\n' "$id"
        printf 'title: "%s"\n' "$title"
        printf 'created: "%s"\n' "$created"
        printf 'correspondent: "%s"\n' "$corr"
        printf 'tags: "%s"\n' "$tags"
        printf 'content_sha256: %s\n' "$newhash"
        printf 'paperless_url: "%s/documents/%s/"\n' "$PNGX_BASE" "$id"
        printf -- '---\n\n# %s\n\n' "$title"
        printf '%s\n' "$content"
      } > "$tmp"
      mv -f "$tmp" "$file"
      log "wrote: $file"
    fi

    count=$((count+1))
    if [ "$LIMIT_DOCS" -gt 0 ] && [ "$count" -ge "$LIMIT_DOCS" ]; then
      log "Reached LIMIT_DOCS=$LIMIT_DOCS, stopping."
      rm -f /tmp/page.jsonl
      exit 0
    fi
  done < /tmp/page.jsonl
done
rm -f /tmp/page.jsonl
log "Done. Processed $count documents."
SH
# Parallel uploader (gentle concurrency, hash markers against double uploads).
cat > "${OUT_DIR}/owui_upload_parallel.sh" <<'SH'
#!/bin/sh
# Upload exported Markdown files to Open WebUI and attach them to a knowledge
# base. Runs inside an Alpine container; expects OWUI_BASE, OWUI_TOKEN and
# KB_ID in env. One marker file per content hash prevents duplicate uploads.
set -eu
apk add --no-cache curl jq parallel >/dev/null

DOC_DIR="${DOC_DIR:-/work/md}"
MARK_DIR="/work/.uploaded"
mkdir -p "$MARK_DIR"
export OWUI_BASE OWUI_TOKEN KB_ID DOC_DIR MARK_DIR

# Gentle parallelism (overridable via PAR_J).
PAR_J="${PAR_J:-4}"

# Upload only new/changed files, identified by their content_sha256 header.
find "$DOC_DIR" -type f -name "*.md" -print0 \
| parallel -0 -j "$PAR_J" --line-buffer '
    f={};
    h=$(grep -m1 ^content_sha256: "$f" | awk "{print \$2}");
    [ -n "$h" ] || { echo "skip (no hash): $f" >&2; exit 0; }
    [ -e "$MARK_DIR/$h" ] && { echo "already uploaded: $(basename "$f")"; exit 0; }
    # Upload; -f makes curl fail (empty output) on HTTP error responses.
    up=$(curl -sf -H "Authorization: Bearer $OWUI_TOKEN" -H "Accept: application/json" \
      -F file=@"$f" "$OWUI_BASE/api/v1/files/") || up="";
    # BUGFIX: ".id // empty" — plain ".id" printed the literal string "null"
    # on failure, which passed the non-empty check and attached a bogus id.
    id=$(printf "%s" "$up" | jq -r ".id // empty" 2>/dev/null);
    [ -n "$id" ] || { echo "upload failed: $f -> $up" >&2; exit 0; }
    # Attach to the knowledge base. BUGFIX: write the dedupe marker only on
    # success so a failed attach is retried on the next run instead of being
    # silently skipped forever.
    if curl -sf -H "Authorization: Bearer $OWUI_TOKEN" -H "Content-Type: application/json" -H "Accept: application/json" \
        -X POST "$OWUI_BASE/api/v1/knowledge/$KB_ID/file/add" \
        -d "{\"file_id\":\"$id\"}" >/dev/null; then
      : > "$MARK_DIR/$h"
      echo "attached: $(basename "$f")"
    else
      echo "attach failed: $(basename "$f")" >&2
    fi
  '
SH
# Normalize line endings and make both helper scripts executable.
exporter="${OUT_DIR}/export_batch.sh"
uploader="${OUT_DIR}/owui_upload_parallel.sh"
sed -i 's/\r$//' "$exporter" "$uploader"
chmod +x "$exporter" "$uploader"

########################################
#            RUN EXPORT                #
########################################
echo "==> Export starte (PAGE_SIZE=${PAGE_SIZE}, LIMIT_DOCS=${LIMIT_DOCS})"
docker run --rm --network host \
  -e PNGX_BASE -e PNGX_TOKEN \
  -e PAGE_SIZE -e LIMIT_DOCS \
  -v "${OUT_DIR}":/work \
  alpine:3 sh /work/export_batch.sh

########################################
#        RUN PARALLEL UPLOAD           #
########################################
echo "==> Upload starte (parallel=${PAR_J})"
docker run --rm --network host \
  -e OWUI_BASE -e OWUI_TOKEN -e KB_ID \
  -e DOC_DIR="/work/md" -e PAR_J="${PAR_J}" \
  -v "${OUT_DIR}":/work \
  alpine:3 sh /work/owui_upload_parallel.sh

echo "==> Fertig."
# (End of script — trailing Pastebin page chrome removed.)