misc-scripts/collect-repos.sh

#!/bin/bash
# collect-repos.sh
# Run from ~/git/ - combines all files from each repo into a single .txt file
# Output goes to the collected/ folder next to this script (~/git/collected/ when the script lives in ~/git/)
# Output files are UTF-8 with BOM for maximum compatibility

# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail

# Only the first argument is inspected; --force / -f rebuilds every output file.
case "${1:-}" in
    --force|-f) FORCE=true ;;
    *)          FORCE=false ;;
esac

# Max file size to include (skip large binaries/assets)
MAX_FILE_SIZE_KB=500

# Directory names whose contents are never collected
SKIP_DIRS=(
    ".git" "node_modules" "__pycache__"
    ".venv" "venv" ".cache"
)

# Basename globs that are skipped without leaving a note in the output
SKIP_FILES=(
    "*.png" "*.jpg" "*.jpeg" "*.gif" "*.webp" "*.ico" "*.svg"
    "*.mp4" "*.mp3"
    "*.zip" "*.tar" "*.gz" "*.bz2" "*.7z"
    "*.bin" "*.exe" "*.so" "*.o" "*.pyc"
    "*.woff" "*.woff2" "*.ttf" "*.eot"
    "*.pdf"
)

# Output lives next to the script itself.
# NOTE(review): the collection loops further down enumerate */ relative to the
# current working directory, not SCRIPT_DIR - confirm the script is always
# started from the directory it lives in.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
OUTPUT_DIR="${SCRIPT_DIR}/collected"
TIMESTAMP="$(date '+%Y-%m-%d %H:%M:%S')"

mkdir -p "$OUTPUT_DIR"

# Build find exclusion args for directories
# Globals:  SKIP_DIRS (read)
# Outputs:  one line on stdout of the form
#             -path '*/D' -o -path '*/D/*' -o -path '*/E' ...
#           (leading space, no trailing -o). The single quotes are literal
#           characters in the output, so the string has to be re-parsed by a
#           shell before it can be handed to find as separate arguments.
build_skip_args() {
    # Both variables are local so a direct (non-subshell) call cannot clobber
    # a caller's own "dir" variable.
    local args="" dir
    for dir in "${SKIP_DIRS[@]}"; do
        args+=" -path '*/${dir}' -o -path '*/${dir}/*' -o"
    done
    # Remove trailing -o
    printf '%s\n' "${args% -o}"
}

# Check if file matches skip patterns
# Arguments: $1 - path of the file to test
# Globals:   SKIP_FILES (read) - glob patterns matched against the basename
# Returns:   0 when the basename matches any pattern (skip it), 1 otherwise.
#            Matching is case-sensitive.
should_skip_file() {
    local file="$1"
    local name pattern
    # Declared separately so a basename failure is not masked by `local`;
    # `--` keeps names that start with a dash from being parsed as options.
    name=$(basename -- "$file") || return 1
    for pattern in "${SKIP_FILES[@]}"; do
        # The pattern is deliberately unquoted so it acts as a glob.
        # shellcheck disable=SC2254
        case "$name" in
            $pattern) return 0 ;;
        esac
    done
    return 1
}

# Known text file extensions - always include these
TEXT_EXTENSIONS=(
    "sh" "bash" "zsh"
    "yaml" "yml" "json" "toml" "ini" "cfg" "conf"
    "txt" "md"
    "py" "js" "ts" "html" "css" "xml"
    "env" "service" "timer" "sql" "lua" "rb"
    "go" "mod" "sum" "rs" "java"
    "c" "h" "cpp" "hpp"
    "Makefile" "Dockerfile"
    "csv" "log" "properties" "rules"
)

# Decide from the file NAME alone whether a file is treated as text.
# Arguments: $1 - path of the file to test
# Globals:   TEXT_EXTENSIONS (read)
# Returns:   0 when the basename is one of the well-known names below or its
#            extension is listed in TEXT_EXTENSIONS, 1 otherwise.
is_known_text() {
    local file="$1"
    local name ext text_ext
    # Declared separately so a basename failure is not masked by `local`;
    # `--` keeps names that start with a dash from being parsed as options.
    name=$(basename -- "$file") || return 1
    # Everything after the last dot. A name without any dot is kept whole,
    # which is how the bare "Makefile"/"Dockerfile" list entries can match.
    ext="${name##*.}"
    # Files without extension but known names
    case "$name" in
        Makefile|Dockerfile|Vagrantfile|Gemfile|Rakefile|.gitignore|.env*) return 0 ;;
    esac
    for text_ext in "${TEXT_EXTENSIONS[@]}"; do
        [ "$ext" = "$text_ext" ] && return 0
    done
    return 1
}

# Check if file is binary
# Arguments: $1 - path of the file to test
# Returns:   0 when the file is considered binary, 1 when it is considered text.
is_binary() {
    local file="$1"
    local encoding sample_bytes text_bytes

    # Known text extensions always pass
    is_known_text "$file" && return 1

    # Nothing to classify in an empty (or missing) file; file(1) would report
    # an empty file's encoding as "binary".
    [ -s "$file" ] || return 1

    # Ask file(1) for the character encoding only. -b keeps the path out of the
    # output (a directory called "archive" or "images" must not influence the
    # verdict), and the encoding - unlike the free-form description - does not
    # label shell/Python scripts as "... text executable".
    encoding=$(file -b --mime-encoding -- "$file" 2>/dev/null) || encoding=""
    [ "$encoding" = "binary" ] && return 0

    # Also check the first 512 bytes - if they contain null bytes, it's binary.
    # Counting bytes before/after deleting NULs avoids depending on grep -P.
    sample_bytes=$(head -c 512 -- "$file" 2>/dev/null | wc -c) || sample_bytes=0
    text_bytes=$(head -c 512 -- "$file" 2>/dev/null | tr -d '\000' | wc -c) || text_bytes=0
    if [ "${text_bytes:-0}" -lt "${sample_bytes:-0}" ]; then
        return 0
    fi
    return 1
}

# Print a file to stdout, dropping a leading UTF-8 BOM when there is one, so
# that no BOM ends up in the middle of the combined output.
# Arguments: $1 - path of the file to print
cat_strip_bom() {
    local file="$1"
    local first_three
    # Hex dump of the first three bytes, e.g. "efbbbf" for a UTF-8 BOM
    first_three=$(head -c 3 "$file" | od -A n -t x1 | tr -d ' \n')
    case "$first_three" in
        efbbbf)
            # Start output at byte 4, i.e. right after the 3-byte BOM
            tail -c +4 "$file"
            ;;
        *)
            cat "$file"
            ;;
    esac
}

# Startup banner: what is about to run, and where.
printf '%s\n' \
    "=== Repo Collector ===" \
    "Timestamp: $TIMESTAMP" \
    "Working dir: $SCRIPT_DIR" \
    "Output dir: $OUTPUT_DIR"
if [ "$FORCE" = true ]; then
    printf '%s\n' "Mode: FORCE (rebuilding all)"
fi
printf '\n'

# --- Pull all repos first ---
# Every immediate subdirectory of the current directory that contains a .git
# directory gets a quiet `git pull`. A failed pull is reported but does not
# stop the run.
printf '%s\n' "--- Pulling all repos ---"
for repo_dir in */; do
    repo_dir="${repo_dir%/}"
    # Ignore the output folder and anything that is not a git checkout
    if [ "$repo_dir" = "collected" ] || [ ! -d "$repo_dir/.git" ]; then
        continue
    fi
    printf '%s' "  git pull ${repo_dir}... "
    if ! git -C "$repo_dir" pull --quiet 2>&1; then
        printf '%s\n' "FAILED (continuing anyway)"
    else
        printf '%s\n' "ok"
    fi
done
printf '\n'

repo_count=0
total_files=0
updated_repos=()

# find(1) exclusions shared by BOTH passes below. They are derived from
# SKIP_DIRS so the README pass no longer picks up README.md files that live
# under node_modules/, venv/ and friends. The bare '*/.git' test additionally
# drops a plain (non-directory) entry named .git.
find_excludes=(! -path '*/.git')
for skip_dir in "${SKIP_DIRS[@]}"; do
    find_excludes+=(! -path "*/${skip_dir}/*")
done

# Main loop: one combined text file per immediate subdirectory of the current
# directory. Layout of each output file:
#   BOM + header, README.md files, every other eligible file, summary footer.
for repo_dir in */; do
    repo_dir="${repo_dir%/}"

    # Skip the output directory itself
    [ "$repo_dir" = "collected" ] && continue

    # Only directories are processed (this also discards the literal "*" that
    # is left over when the glob matches nothing)
    [ ! -d "$repo_dir" ] && continue

    repo_count=$((repo_count + 1))
    file_count=0
    skipped_count=0
    output_file="${OUTPUT_DIR}/repository-${repo_dir}.txt"
    # The output is assembled here and renamed over output_file only once it
    # is complete. If the script aborts half-way (set -e), the previous output
    # and its mtime stay untouched, so the next run does not mistake a
    # truncated file for an up-to-date one.
    partial_file="${output_file}.partial"

    # Check if repo changed since last collection: the committer timestamp of
    # the newest commit is compared with the mtime of the existing output.
    # When `git log` fails the timestamp falls back to 0, so such a directory
    # is only rebuilt with --force.
    if [ "$FORCE" = false ] && [ -f "$output_file" ]; then
        output_mtime=$(stat -c %Y "$output_file" 2>/dev/null || echo 0)
        # Get latest commit timestamp in the repo
        repo_last_commit=$(git -C "$repo_dir" log -1 --format=%ct 2>/dev/null || echo 0)
        if [ "$repo_last_commit" -le "$output_mtime" ] 2>/dev/null; then
            echo "  Skipping: ${repo_dir}/ (no changes)"
            continue
        fi
    fi

    updated_repos+=("$repo_dir")
    echo "  Updating: ${repo_dir}/"

    # Write UTF-8 BOM + header
    # printf writes raw bytes; echo writes the text header after it
    printf '\xEF\xBB\xBF' > "$partial_file"
    {
        echo "================================================================================"
        echo "Repository: ${repo_dir}"
        echo "Collected:  ${TIMESTAMP}"
        echo "================================================================================"
        echo ""
    } >> "$partial_file"

    # First pass: READMEs go to the top of the output
    while IFS= read -r -d '' file; do
        # Inner quotes make the repo name a literal prefix, not a glob pattern
        rel_path="${file#"${repo_dir}"/}"
        {
            echo "--- FILE: ${rel_path} ---"
            cat_strip_bom "$file"
            echo ""
            echo ""
        } >> "$partial_file"
        file_count=$((file_count + 1))
    done < <(find "$repo_dir" -type f -iname 'readme.md' \
        "${find_excludes[@]}" \
        -print0 | sort -z)

    # Second pass: everything else. READMEs were handled above and are
    # filtered out by find itself.
    while IFS= read -r -d '' file; do
        rel_path="${file#"${repo_dir}"/}"

        # Never feed files from the output directory back into an output file
        # (possible when the script, and therefore OUTPUT_DIR, lives inside
        # one of the scanned directories)
        if [ "${file%/*}" -ef "$OUTPUT_DIR" ]; then
            continue
        fi

        # Skip anything with "secret" anywhere in its relative path
        if grep -qi 'secret' <<<"$rel_path"; then
            skipped_count=$((skipped_count + 1))
            continue
        fi

        # Skip by pattern
        if should_skip_file "$file"; then
            skipped_count=$((skipped_count + 1))
            continue
        fi

        # Skip large files (size as reported by du, in KB), leaving a note
        file_size_kb=$(du -k "$file" 2>/dev/null | cut -f1)
        if [ "$file_size_kb" -gt "$MAX_FILE_SIZE_KB" ]; then
            {
                echo "--- FILE: ${rel_path} ---"
                echo "[SKIPPED: file too large (${file_size_kb}KB > ${MAX_FILE_SIZE_KB}KB limit)]"
                echo ""
            } >> "$partial_file"
            skipped_count=$((skipped_count + 1))
            continue
        fi

        # Skip binary files, leaving a note
        if is_binary "$file"; then
            {
                echo "--- FILE: ${rel_path} ---"
                echo "[SKIPPED: binary file]"
                echo ""
            } >> "$partial_file"
            skipped_count=$((skipped_count + 1))
            continue
        fi

        # Include the file (strip BOM from source to avoid inline BOMs)
        {
            echo "--- FILE: ${rel_path} ---"
            cat_strip_bom "$file"
            echo ""
            echo ""
        } >> "$partial_file"

        file_count=$((file_count + 1))

    done < <(find "$repo_dir" -type f ! -iname 'readme.md' \
        "${find_excludes[@]}" \
        -print0 | sort -z)

    # Append summary at the end
    {
        echo "================================================================================"
        echo "Summary: ${file_count} files included, ${skipped_count} skipped"
        echo "================================================================================"
    } >> "$partial_file"

    # Publish the finished file in one step
    mv -f "$partial_file" "$output_file"

    total_files=$((total_files + file_count))
    echo "  -> ${output_file} (${file_count} files, ${skipped_count} skipped)"

done

# Closing report: totals first, then either the list of rebuilt output files
# or a note that nothing changed.
printf '\n'
printf '%s\n' "Done! ${repo_count} repos found, ${#updated_repos[@]} updated, ${total_files} total files collected."
if (( ${#updated_repos[@]} == 0 )); then
    printf '%s\n' "All repos up to date, nothing to upload."
else
    printf '\n'
    printf '%s\n' "Updated repos (upload these to project):"
    for repo in "${updated_repos[@]}"; do
        printf '%s\n' "  -> ${OUTPUT_DIR}/repository-${repo}.txt"
    done
fi