Files
misc-scripts/collect-repos.sh
T
2026-02-12 12:15:19 +01:00

244 lines
7.3 KiB
Bash

#!/bin/bash
# collect-repos.sh
# Run from ~/git/ - combines all files from each repo into a single .txt file
# Output goes to ~/git/collected/ folder
set -euo pipefail

# Only the first argument is inspected: --force / -f switches FORCE on.
case "${1:-}" in
  --force | -f) FORCE=true ;;
  *) FORCE=false ;;
esac

# Absolute directory holding this script; the output folder lives beside it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
OUTPUT_DIR="${SCRIPT_DIR}/collected"
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Size limit in KB above which a file is not included (large binaries/assets)
MAX_FILE_SIZE_KB=500

# Directory names and file-name globs that are left out of the collection
SKIP_DIRS=(".git" "node_modules" "__pycache__" ".venv" "venv" ".cache")
SKIP_FILES=(
  "*.png" "*.jpg" "*.jpeg" "*.gif" "*.webp" "*.ico" "*.svg"
  "*.mp4" "*.mp3"
  "*.zip" "*.tar" "*.gz" "*.bz2" "*.7z"
  "*.bin" "*.exe" "*.so" "*.o" "*.pyc"
  "*.woff" "*.woff2" "*.ttf" "*.eot"
  "*.pdf"
)

mkdir -p "$OUTPUT_DIR"
#######################################
# Print a find(1) expression fragment that matches every directory listed in
# SKIP_DIRS as well as everything below it.
# The single quotes around each pattern are literal characters of the printed
# string, so the result needs another round of shell parsing before find can
# use it. No caller appears in this file as reviewed.
# Globals:   SKIP_DIRS (read)
# Outputs:   one line on stdout, starting with a space, clauses joined by -o
#######################################
build_skip_args() {
  local skip_dir
  local clauses=""

  for skip_dir in "${SKIP_DIRS[@]}"; do
    clauses+=" -path '*/${skip_dir}' -o -path '*/${skip_dir}/*' -o"
  done

  # Drop the dangling " -o" left behind by the last iteration.
  printf '%s\n' "${clauses% -o}"
}
#######################################
# Decide whether a file is excluded by name.
# The last path component is matched against every glob in SKIP_FILES,
# ignoring case, so "LOGO.PNG" is treated like "logo.png".
# Globals:   SKIP_FILES (read)
# Arguments: $1 - path of the file to check
# Returns:   0 if the name matches a skip pattern, 1 otherwise
#######################################
should_skip_file() {
  # Parameter expansion instead of basename(1): no extra process per file.
  local name="${1##*/}"

  # The subshell confines nocasematch and the loop variable to this check.
  (
    shopt -s nocasematch
    for pattern in "${SKIP_FILES[@]}"; do
      # shellcheck disable=SC2254  # pattern must stay unquoted to act as a glob
      case "$name" in
        $pattern) exit 0 ;;
      esac
    done
    exit 1
  )
}
# Known text file extensions - always include these
TEXT_EXTENSIONS=("sh" "bash" "zsh" "yaml" "yml" "json" "toml" "ini" "cfg" "conf" "txt" "md" "py" "js" "ts" "html" "css" "xml" "env" "service" "timer" "sql" "lua" "rb" "go" "rs" "java" "c" "h" "cpp" "hpp" "Makefile" "Dockerfile" "csv" "log" "properties" "rules")
is_known_text() {
local file="$1"
local basename=$(basename "$file")
local ext="${basename##*.}"
# Files without extension but known names
case "$basename" in
Makefile|Dockerfile|Vagrantfile|Gemfile|Rakefile|.gitignore|.env*) return 0 ;;
esac
for text_ext in "${TEXT_EXTENSIONS[@]}"; do
[ "$ext" = "$text_ext" ] && return 0
done
return 1
}
#######################################
# Decide whether a file should be treated as binary.
# Files accepted by is_known_text are never binary. Otherwise the character
# encoding reported by file(1) is consulted, followed by a check for NUL
# bytes in the first 512 bytes.
# Arguments: $1 - path of the file to check
# Returns:   0 if the file looks binary, 1 otherwise
#######################################
is_binary() {
  local file="$1"
  local encoding
  local sample_bytes
  local text_bytes

  # Known text extensions always pass
  is_known_text "$file" && return 1

  # Nothing to classify in an empty (or missing) file; treat it as text
  # rather than asking file(1) about zero bytes of content.
  [ -s "$file" ] || return 1

  # --brief keeps the path out of the output, so a directory or file name
  # containing words such as "image" or "archive" cannot cause a match.
  # --mime-encoding yields "binary" only for non-text content; the free-form
  # description would also say "executable" for plain text scripts.
  encoding=$(file --brief --mime-encoding -- "$file" 2>/dev/null) || encoding=""
  [ "$encoding" = "binary" ] && return 0

  # Fallback that does not depend on file(1) or grep -P: if deleting NUL
  # bytes shortens the first 512 bytes, the file contains NULs.
  sample_bytes=$(head -c 512 -- "$file" | wc -c)
  text_bytes=$(head -c 512 -- "$file" | tr -d '\000' | wc -c)
  [ "$text_bytes" -lt "$sample_bytes" ]
}
# Start-up banner: run settings, plus the mode line when FORCE is on.
printf '%s\n' "=== Repo Collector ==="
printf 'Timestamp: %s\n' "$TIMESTAMP"
printf 'Working dir: %s\n' "$SCRIPT_DIR"
printf 'Output dir: %s\n' "$OUTPUT_DIR"
if [[ "$FORCE" == true ]]; then
  printf '%s\n' "Mode: FORCE (rebuilding all)"
fi
printf '\n'
# --- Pull all repos first ---
# Every sub-directory of the current directory that contains a .git directory
# is pulled. A failed pull is reported and does not stop the run.
printf '%s\n' "--- Pulling all repos ---"
for repo_dir in */; do
  repo_dir="${repo_dir%/}"
  if [[ "$repo_dir" == "collected" || ! -d "$repo_dir/.git" ]]; then
    continue
  fi
  printf ' git pull %s... ' "$repo_dir"
  # stderr is merged into stdout so git's messages land on the progress line
  if git -C "$repo_dir" pull --quiet 2>&1; then
    printf '%s\n' "ok"
  else
    printf '%s\n' "FAILED (continuing anyway)"
  fi
done
printf '\n'
# --- Collect each repo into one text file ---
# Counters and the list of rebuilt repos are read by the final report.
repo_count=0
total_files=0
updated_repos=()

# find(1) exclusions shared by BOTH passes below. They are generated from
# SKIP_DIRS so the README pass ignores vendored trees (node_modules, venv,
# ...) exactly like the main pass. '*/.git' additionally drops a regular
# file that is itself named ".git".
find_excludes=(! -path '*/.git')
for skip_dir in "${SKIP_DIRS[@]}"; do
  find_excludes+=(! -path "*/${skip_dir}/*")
done

for repo_dir in */; do
  repo_dir="${repo_dir%/}"

  # Skip the output directory itself
  [ "$repo_dir" = "collected" ] && continue
  # An unmatched "*/" glob stays literal; ignore anything that is no directory
  [ ! -d "$repo_dir" ] && continue

  repo_count=$((repo_count + 1))
  file_count=0
  skipped_count=0
  output_file="${OUTPUT_DIR}/repository-${repo_dir}.txt"

  # Change detection: rebuild only when the newest commit is younger than the
  # existing output file.
  # NOTE(review): this compares the commit timestamp of HEAD with the mtime
  # of the output file. Commits created before the last collection but pulled
  # afterwards are therefore not detected, and a directory for which
  # "git log" fails yields 0 and is only collected once unless --force is
  # given.
  if [ "$FORCE" = false ] && [ -f "$output_file" ]; then
    output_mtime=$(stat -c %Y "$output_file" 2>/dev/null || echo 0)
    repo_last_commit=$(git -C "$repo_dir" log -1 --format=%ct 2>/dev/null || echo 0)
    # 2>/dev/null hides the "integer expression expected" message of [ when
    # either value is not numeric; the repo is then rebuilt.
    if [ "$repo_last_commit" -le "$output_mtime" ] 2>/dev/null; then
      echo " Skipping: ${repo_dir}/ (no changes)"
      continue
    fi
  fi

  updated_repos+=("$repo_dir")
  echo " Updating: ${repo_dir}/"

  # Write header (truncates any previous output for this repo)
  {
    echo "================================================================================"
    echo "Repository: ${repo_dir}"
    echo "Collected: ${TIMESTAMP}"
    echo "================================================================================"
    echo ""
  } > "$output_file"

  # First pass: READMEs, so they appear at the top of the output
  while IFS= read -r -d '' file; do
    # Quote the inner expansion so repo_dir is stripped literally, not as a glob
    rel_path="${file#"${repo_dir}"/}"

    # Same rule as the main pass: nothing with "secret" in its path
    if grep -qi 'secret' <<<"$rel_path"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    {
      echo "--- FILE: ${rel_path} ---"
      cat "$file"
      echo ""
      echo ""
    } >> "$output_file"
    file_count=$((file_count + 1))
  done < <(find "$repo_dir" -type f -iname 'readme.md' \
    "${find_excludes[@]}" \
    -print0 | sort -z)

  # Second pass: everything else. READMEs were handled above and are
  # filtered out by find itself.
  while IFS= read -r -d '' file; do
    rel_path="${file#"${repo_dir}"/}"

    # Skip files with "secret" in the name
    if grep -qi 'secret' <<<"$rel_path"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip by pattern
    if should_skip_file "$file"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Determine the size. A failing du (file vanished, unreadable) must not
    # abort the whole run under "set -e" + pipefail, so it is handled here.
    if ! file_size_kb=$(du -k "$file" 2>/dev/null | cut -f1) ||
      [[ ! "$file_size_kb" =~ ^[0-9]+$ ]]; then
      {
        echo "--- FILE: ${rel_path} ---"
        echo "[SKIPPED: could not determine file size]"
        echo ""
      } >> "$output_file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip large files
    if [ "$file_size_kb" -gt "$MAX_FILE_SIZE_KB" ]; then
      {
        echo "--- FILE: ${rel_path} ---"
        echo "[SKIPPED: file too large (${file_size_kb}KB > ${MAX_FILE_SIZE_KB}KB limit)]"
        echo ""
      } >> "$output_file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip binary files
    if is_binary "$file"; then
      {
        echo "--- FILE: ${rel_path} ---"
        echo "[SKIPPED: binary file]"
        echo ""
      } >> "$output_file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Include the file
    {
      echo "--- FILE: ${rel_path} ---"
      cat "$file"
      echo ""
      echo ""
    } >> "$output_file"
    file_count=$((file_count + 1))
  done < <(find "$repo_dir" -type f ! -iname 'readme.md' \
    "${find_excludes[@]}" \
    -print0 | sort -z)

  # Append summary at the end
  {
    echo "================================================================================"
    echo "Summary: ${file_count} files included, ${skipped_count} skipped"
    echo "================================================================================"
  } >> "$output_file"

  total_files=$((total_files + file_count))
  echo " -> ${output_file} (${file_count} files, ${skipped_count} skipped)"
done
# Final report: overall totals, then the output files that were rebuilt.
printf '\n'
printf 'Done! %s repos found, %s updated, %s total files collected.\n' \
  "$repo_count" "${#updated_repos[@]}" "$total_files"
if ((${#updated_repos[@]} == 0)); then
  printf '%s\n' "All repos up to date, nothing to upload."
else
  printf '\n'
  printf '%s\n' "Updated repos (upload these to project):"
  for repo in "${updated_repos[@]}"; do
    printf ' -> %s/repository-%s.txt\n' "$OUTPUT_DIR" "$repo"
  done
fi