misc-scripts created

2026-02-12 11:57:07 +01:00
commit fdfa87a54b
1 changed files with 218 additions and 0 deletions
@@ -0,0 +1,218 @@
 #!/bin/bash
 # collect-repos.sh
 #
 # Run from ~/git/. The files of each repo are combined into a single .txt
 # file per repo. Output goes to the "collected" folder next to this script
 # (~/git/collected/ when the script itself lives in ~/git/).
 #
 # Strict mode: stop on a failing command, on an unset variable, and on a
 # failure in any stage of a pipeline.
 set -o errexit -o nounset -o pipefail
 # Directory holding this script, resolved to an absolute path.
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 OUTPUT_DIR="${SCRIPT_DIR}/collected"
 # Captured once so every output file of a run carries the same timestamp.
 TIMESTAMP="$(date +'%Y-%m-%d %H:%M:%S')"
 # Size limit in KB; anything larger is left out (large binaries/assets).
 MAX_FILE_SIZE_KB=500
 # Directory names that are excluded from collection.
 SKIP_DIRS=(
    '.git'
    'node_modules'
    '__pycache__'
    '.venv'
    'venv'
    '.cache'
 )
 # Glob patterns, matched against a file's basename, that are excluded.
 SKIP_FILES=(
    '*.png' '*.jpg' '*.jpeg' '*.gif' '*.webp' '*.ico' '*.svg'
    '*.mp4' '*.mp3'
    '*.zip' '*.tar' '*.gz' '*.bz2' '*.7z'
    '*.bin' '*.exe' '*.so' '*.o' '*.pyc'
    '*.woff' '*.woff2' '*.ttf' '*.eot'
    '*.pdf'
 )
 # Make sure the output directory exists before anything is written to it.
 mkdir -p "${OUTPUT_DIR}"
 # Build find exclusion args for directories.
 #
 # Globals:   SKIP_DIRS (read)
 # Arguments: none
 # Outputs:   one line on stdout of the form
 #             -path '*/A' -o -path '*/A/*' -o -path '*/B' -o -path '*/B/*'
 #            (with a leading space). The single quotes are literal
 #            characters in the string, so it is not directly usable as
 #            find arguments via plain word splitting.
 # No call to this function is visible in this file.
 build_skip_args() {
    local args=""
    # Keep the loop variable local so a caller's own "dir" is not overwritten.
    local dir
    for dir in "${SKIP_DIRS[@]}"; do
        args+=" -path '*/${dir}' -o -path '*/${dir}/*' -o"
    done
    # Drop the dangling " -o" left behind by the last iteration.
    printf '%s\n' "${args% -o}"
 }
 # Check if a file's basename matches one of the skip patterns.
 #
 # Globals:   SKIP_FILES (read) - glob patterns such as "*.png"
 # Arguments: $1 - path of the file (only the part after the last "/" is
 #                 matched)
 # Returns:   0 if the file should be skipped, 1 otherwise
 should_skip_file() {
    local file="$1"
    # Parameter expansion instead of $(basename ...): no fork per file and no
    # exit status hidden behind "local".
    local name="${file##*/}"
    # Local so a caller's own "pattern" variable is not overwritten.
    local pattern
    for pattern in "${SKIP_FILES[@]}"; do
        # $pattern is left unquoted on purpose so it acts as a glob.
        # shellcheck disable=SC2254
        case "$name" in
            $pattern) return 0 ;;
        esac
    done
    return 1
 }
 # Extensions that mark a file as text; such files are always included.
 TEXT_EXTENSIONS=(
    'sh' 'bash' 'zsh'
    'yaml' 'yml' 'json' 'toml' 'ini' 'cfg' 'conf'
    'txt' 'md'
    'py' 'js' 'ts' 'html' 'css' 'xml'
    'env' 'service' 'timer'
    'sql' 'lua' 'rb' 'go' 'rs' 'java'
    'c' 'h' 'cpp' 'hpp'
    'Makefile' 'Dockerfile'
    'csv' 'log' 'properties' 'rules'
 )
 # Decide from the file name alone whether a file is known to be text.
 #
 # Globals:   TEXT_EXTENSIONS (read)
 # Arguments: $1 - path of the file (only the basename is inspected)
 # Returns:   0 if the name is a known text name or has a known text
 #            extension, 1 otherwise
 is_known_text() {
    local file="$1"
    # Parameter expansion instead of $(basename ...): no fork per file and no
    # exit status hidden behind "local".
    local name="${file##*/}"
    # Text after the last "."; for a name without any "." this is the whole
    # name, so such a name is compared in full against the extension list.
    local ext="${name##*.}"
    # Local so a caller's own "text_ext" variable is not overwritten.
    local text_ext
    # Files without extension but known names
    case "$name" in
        Makefile|Dockerfile|Vagrantfile|Gemfile|Rakefile|.gitignore|.env*) return 0 ;;
    esac
    for text_ext in "${TEXT_EXTENSIONS[@]}"; do
        if [[ "$ext" == "$text_ext" ]]; then
            return 0
        fi
    done
    return 1
 }
 # Check if a file is binary.
 #
 # Arguments: $1 - path of the file
 # Returns:   0 if the file is considered binary, 1 if it is considered text
 #            (a file that cannot be read is reported as text, i.e. 1)
 # Calls:     is_known_text, file(1), head, tr, wc
 is_binary() {
    local file="$1"
    local description=""
    local sample_bytes=0
    local non_nul_bytes=0
    # Known text extensions always pass
    if is_known_text "$file"; then
        return 1
    fi
    # -b prints the description only. Without it file(1) prefixes the path,
    # so a path containing e.g. "image" or "archive" would match the keyword
    # test below no matter what the file holds. A missing or failing file(1)
    # is tolerated on purpose: the NUL-byte test below still runs.
    description="$(file -b -- "$file" 2>/dev/null)" || description=""
    case "$description" in
        # Scripts with a shebang are described as "... text executable";
        # they are readable text, so the "executable" keyword must not count.
        *"text executable"*) ;;
        *)
            if grep -qE 'binary|executable|image|archive|compressed' <<<"$description"; then
                return 0
            fi
            ;;
    esac
    # NUL-byte test on the first 512 bytes: if deleting NUL bytes shortens
    # the sample, the file is binary. tr is used instead of "grep -P" so the
    # test does not depend on PCRE support being available.
    sample_bytes=$(head -c 512 -- "$file" 2>/dev/null | wc -c) || return 1
    non_nul_bytes=$(head -c 512 -- "$file" 2>/dev/null | LC_ALL=C tr -d '\000' | wc -c) || return 1
    if (( non_nul_bytes < sample_bytes )); then
        return 0
    fi
    return 1
 }
 # ---------------------------------------------------------------------------
 # Main flow
 #   1. git pull in every sub-directory of the current directory that has .git
 #   2. for every sub-directory (except "collected"): skip it when nothing
 #      changed since its output file was written, otherwise rebuild
 #      ${OUTPUT_DIR}/<dir>.txt from its files
 #   3. print a summary of what was updated
 # Reads:  TIMESTAMP, SCRIPT_DIR, OUTPUT_DIR, MAX_FILE_SIZE_KB, SKIP_DIRS
 # Calls:  should_skip_file, is_binary
 # ---------------------------------------------------------------------------
 echo "=== Repo Collector ==="
 echo "Timestamp: $TIMESTAMP"
 # NOTE(review): this prints the script's own directory, while the loops below
 # glob the current directory; the two agree only when the script is started
 # from the directory it lives in.
 echo "Working dir: $SCRIPT_DIR"
 echo "Output dir: $OUTPUT_DIR"
 echo ""
 # --- Pull all repos first ---
 echo "--- Pulling all repos ---"
 for repo_dir in */; do
    repo_dir="${repo_dir%/}"
    [ "$repo_dir" = "collected" ] && continue
    [ ! -d "$repo_dir/.git" ] && continue
    echo -n "  git pull ${repo_dir}... "
    # A failed pull is reported but never aborts the run.
    if git -C "$repo_dir" pull --quiet 2>&1; then
        echo "ok"
    else
        echo "FAILED (continuing anyway)"
    fi
 done
 echo ""
 # find(1) clause naming the directories that are never descended into.
 # It is derived from SKIP_DIRS so the configured list and the scan cannot
 # drift apart; '.git' is seeded so the clause is never empty.
 find_prune=(-path '*/.git')
 for skip_dir in "${SKIP_DIRS[@]}"; do
    find_prune+=(-o -path "*/${skip_dir}")
 done
 # Each repo is written to a scratch file that is renamed into place only once
 # it is complete. An aborted run must not leave a truncated output with a
 # fresh mtime behind, because the change check below would then skip the repo.
 tmp_output=""
 trap 'if [ -n "$tmp_output" ]; then rm -f -- "$tmp_output"; fi' EXIT
 repo_count=0
 total_files=0
 updated_repos=()
 for repo_dir in */; do
    repo_dir="${repo_dir%/}"
    # Skip the output directory itself
    [ "$repo_dir" = "collected" ] && continue
    # Guards against the literal "*" left over when the glob matches nothing
    [ ! -d "$repo_dir" ] && continue
    repo_count=$((repo_count + 1))
    file_count=0
    skipped_count=0
    output_file="${OUTPUT_DIR}/${repo_dir}.txt"
    # Check if repo changed since last collection. It counts as unchanged only
    # when BOTH hold:
    #   - the latest commit is not newer than the output file, and
    #   - no file or directory outside the skipped directories has a newer
    #     mtime than the output file.
    # The commit time alone is not enough: a pull can bring in commits that
    # were created before the last collection, and uncommitted edits or
    # directories without git history have no commit time to compare at all.
    if [ -f "$output_file" ]; then
        output_mtime=$(stat -c %Y "$output_file" 2>/dev/null || echo 0)
        # Get latest commit timestamp in the repo
        repo_last_commit=$(git -C "$repo_dir" log -1 --format=%ct 2>/dev/null || echo 0)
        # First entry newer than the output file, or empty when there is none.
        newer_entry=$(find "$repo_dir" -type d \( "${find_prune[@]}" \) -prune \
            -o -newer "$output_file" -print -quit 2>/dev/null || true)
        # A non-numeric timestamp makes the comparison fail, which is treated
        # as "changed"; the error text itself is not interesting.
        if [ -z "$newer_entry" ] \
            && [ "${repo_last_commit:-0}" -le "${output_mtime:-0}" ] 2>/dev/null; then
            echo "  Skipping: ${repo_dir}/ (no changes)"
            continue
        fi
    fi
    updated_repos+=("$repo_dir")
    echo "  Updating: ${repo_dir}/"
    tmp_output="${output_file}.partial"
    # Write header
    {
        echo "================================================================================"
        echo "Repository: ${repo_dir}"
        echo "Collected:  ${TIMESTAMP}"
        echo "================================================================================"
        echo ""
    } > "$tmp_output"
    # Find all files, excluding skip dirs
    while IFS= read -r -d '' file; do
        # The inner quotes make the repo name a literal prefix, not a glob.
        rel_path="${file#"${repo_dir}"/}"
        # Skip files with "secret" in the name
        if grep -qi 'secret' <<<"$rel_path"; then
            skipped_count=$((skipped_count + 1))
            continue
        fi
        # Skip by pattern
        if should_skip_file "$file"; then
            skipped_count=$((skipped_count + 1))
            continue
        fi
        # Size in KB as reported by du. Left empty when the file cannot be
        # read or du fails, so one bad file does not abort the whole run.
        file_size_kb=""
        if [ -r "$file" ]; then
            file_size_kb=$(du -k -- "$file" 2>/dev/null | cut -f1) || file_size_kb=""
        fi
        case "$file_size_kb" in
            '' | *[!0-9]*)
                {
                    echo "--- FILE: ${rel_path} ---"
                    echo "[SKIPPED: unreadable file]"
                    echo ""
                } >> "$tmp_output"
                skipped_count=$((skipped_count + 1))
                continue
                ;;
        esac
        # Skip large files
        if [ "$file_size_kb" -gt "$MAX_FILE_SIZE_KB" ]; then
            {
                echo "--- FILE: ${rel_path} ---"
                echo "[SKIPPED: file too large (${file_size_kb}KB > ${MAX_FILE_SIZE_KB}KB limit)]"
                echo ""
            } >> "$tmp_output"
            skipped_count=$((skipped_count + 1))
            continue
        fi
        # Skip binary files
        if is_binary "$file"; then
            {
                echo "--- FILE: ${rel_path} ---"
                echo "[SKIPPED: binary file]"
                echo ""
            } >> "$tmp_output"
            skipped_count=$((skipped_count + 1))
            continue
        fi
        # Include the file
        {
            echo "--- FILE: ${rel_path} ---"
            cat -- "$file"
            echo ""
            echo ""
        } >> "$tmp_output"
        file_count=$((file_count + 1))
    done < <(find "$repo_dir" -type d \( "${find_prune[@]}" \) -prune \
        -o -type f ! -path '*/.git' -print0 | sort -z)
    # Append summary at the end
    {
        echo "================================================================================"
        echo "Summary: ${file_count} files included, ${skipped_count} skipped"
        echo "================================================================================"
    } >> "$tmp_output"
    # Publish the finished file in one step.
    mv -f -- "$tmp_output" "$output_file"
    tmp_output=""
    total_files=$((total_files + file_count))
    echo "  -> ${output_file} (${file_count} files, ${skipped_count} skipped)"
 done
 echo ""
 echo "Done! ${repo_count} repos found, ${#updated_repos[@]} updated, ${total_files} total files collected."
 if [ ${#updated_repos[@]} -gt 0 ]; then
    echo ""
    echo "Updated repos (upload these to project):"
    for repo in "${updated_repos[@]}"; do
        echo "  -> ${OUTPUT_DIR}/${repo}.txt"
    done
 else
    echo "All repos up to date, nothing to upload."
 fi