#!/bin/bash
# collect-repos.sh
#
# Combines the text files of every repository directory that sits next to this
# script (intended location: ~/git/) into a single .txt file per repository.
#
# Usage: collect-repos.sh [--force|-f]
#   --force, -f   rebuild every output file, even when the repository has no
#                 commit newer than its existing output file
#
# Output goes to <script dir>/collected/repository-<name>.txt
# Output files are UTF-8 with BOM for maximum compatibility.
#
# Relies on: git, find -print0, sort -z, stat -c, od. file(1) is used when
# available; without it binary detection falls back to a NUL-byte check.

set -euo pipefail

FORCE=false
if [[ "${1:-}" == "--force" || "${1:-}" == "-f" ]]; then
  FORCE=true
fi

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
OUTPUT_DIR="${SCRIPT_DIR}/collected"
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Max file size to include (skip large binaries/assets)
MAX_FILE_SIZE_KB=500

# Directory names whose contents are never collected
SKIP_DIRS=(".git" "node_modules" "__pycache__" ".venv" "venv" ".cache")

# File name patterns (shell globs, matched against the base name) to skip
SKIP_FILES=("*.png" "*.jpg" "*.jpeg" "*.gif" "*.webp" "*.ico" "*.svg"
  "*.mp4" "*.mp3" "*.zip" "*.tar" "*.gz" "*.bz2" "*.7z" "*.bin" "*.exe"
  "*.so" "*.o" "*.pyc" "*.woff" "*.woff2" "*.ttf" "*.eot" "*.pdf")

# Known text file extensions - always include these
TEXT_EXTENSIONS=("sh" "bash" "zsh" "yaml" "yml" "json" "toml" "ini" "cfg"
  "conf" "txt" "md" "py" "js" "ts" "html" "css" "xml" "env" "service"
  "timer" "sql" "lua" "rb" "go" "mod" "sum" "rs" "java" "c" "h" "cpp" "hpp"
  "Makefile" "Dockerfile" "csv" "log" "properties" "rules")

# find(1) arguments derived from SKIP_DIRS; filled in by build_skip_args
FIND_SKIP_ARGS=()

# The loops below use relative paths, and the "collected" directory that is
# skipped lives in SCRIPT_DIR, so make SCRIPT_DIR the real working directory.
cd "$SCRIPT_DIR"
mkdir -p "$OUTPUT_DIR"

#######################################
# Build the find(1) exclusion arguments for the skipped directories.
# An array is used (not a string) so the patterns reach find as separate,
# correctly quoted words.
# Globals:   SKIP_DIRS (read), FIND_SKIP_ARGS (written)
# Arguments: none
#######################################
build_skip_args() {
  local dir
  FIND_SKIP_ARGS=()
  for dir in "${SKIP_DIRS[@]}"; do
    FIND_SKIP_ARGS+=('!' -path "*/${dir}/*")
  done
  # Also exclude a plain file that is itself named .git
  FIND_SKIP_ARGS+=('!' -path '*/.git')
}

#######################################
# Check whether a file's base name matches one of the SKIP_FILES globs.
# Globals:   SKIP_FILES (read)
# Arguments: $1 - path of the file
# Returns:   0 if the file should be skipped, 1 otherwise
#######################################
should_skip_file() {
  local name="${1##*/}"
  local pattern
  for pattern in "${SKIP_FILES[@]}"; do
    # The pattern is intentionally unquoted so it is matched as a glob.
    # shellcheck disable=SC2254
    case "$name" in
      $pattern) return 0 ;;
    esac
  done
  return 1
}

#######################################
# Check whether a file is known to be text, by name or by extension.
# Globals:   TEXT_EXTENSIONS (read)
# Arguments: $1 - path of the file
# Returns:   0 if the file is known text, 1 otherwise
#######################################
is_known_text() {
  local name="${1##*/}"
  local ext text_ext

  # Files without extension but known names
  case "$name" in
    Makefile | Dockerfile | Vagrantfile | Gemfile | Rakefile | .gitignore | .env*)
      return 0
      ;;
  esac

  # Only a name that contains a dot has an extension. Without this guard the
  # expansion below would yield the whole name, and an extensionless file
  # called e.g. "sum" or "go" would be mistaken for a text file.
  [[ "$name" == *.* ]] || return 1
  ext="${name##*.}"

  for text_ext in "${TEXT_EXTENSIONS[@]}"; do
    [[ "$ext" == "$text_ext" ]] && return 0
  done
  return 1
}

#######################################
# Check whether a file is binary.
# Arguments: $1 - path of the file
# Returns:   0 if the file is considered binary, 1 otherwise
#######################################
is_binary() {
  local file="$1"
  local encoding total non_nul

  # Known text extensions always pass
  if is_known_text "$file"; then
    return 1
  fi

  # An empty file has nothing binary in it (file(1) would report "binary")
  [[ -s "$file" ]] || return 1

  # -b drops the file name and --mime-encoding prints only the encoding, so
  # neither the path nor a description such as "ASCII text executable" can
  # produce a false positive. If file(1) is missing, fall through.
  if encoding=$(file -b --mime-encoding -- "$file" 2>/dev/null) \
    && [[ "$encoding" == "binary" ]]; then
    return 0
  fi

  # Also check the first 512 bytes: if deleting NUL bytes changes the byte
  # count, the file contains NULs and is binary
  total=$(head -c 512 -- "$file" | wc -c)
  non_nul=$(head -c 512 -- "$file" | tr -d '\0' | wc -c)
  if ((total != non_nul)); then
    return 0
  fi
  return 1
}

#######################################
# Cat a file, stripping a leading UTF-8 BOM if present (avoids inline BOMs
# in the combined output).
# Arguments: $1 - path of the file
# Outputs:   file content on stdout
#######################################
cat_strip_bom() {
  local file="$1"
  local header
  # First three bytes as lowercase hex; a UTF-8 BOM is EF BB BF
  header=$(head -c 3 -- "$file" | od -A n -t x1 | tr -d ' \n')
  if [[ "$header" == "efbbbf" ]]; then
    # Skip first 3 bytes (the BOM)
    tail -c +4 -- "$file"
  else
    cat -- "$file"
  fi
}

# Return 0 if the base name of $1 is "readme.md" in any letter case.
is_readme() {
  case "${1##*/}" in
    [Rr][Ee][Aa][Dd][Mm][Ee].[Mm][Dd]) return 0 ;;
  esac
  return 1
}

# Return 0 if the path $1 contains "secret" in any letter case.
is_secret_path() {
  grep -qi -- 'secret' <<<"$1"
}

# Print one included file: header line, BOM-stripped content, two blank lines.
# $1 - path shown in the header, $2 - path to read.
emit_file_section() {
  printf '%s\n' "--- FILE: $1 ---"
  cat_strip_bom "$2"
  printf '\n\n'
}

# Print a placeholder for a file that was left out.
# $1 - path shown in the header, $2 - reason.
emit_skip_note() {
  printf '%s\n' "--- FILE: $1 ---" "[SKIPPED: $2]" ""
}

build_skip_args

echo "=== Repo Collector ==="
echo "Timestamp: $TIMESTAMP"
echo "Working dir: $SCRIPT_DIR"
echo "Output dir: $OUTPUT_DIR"
if [[ "$FORCE" == true ]]; then
  echo "Mode: FORCE (rebuilding all)"
fi
echo ""

# --- Pull all repos first ---
echo "--- Pulling all repos ---"
for repo_dir in */; do
  repo_dir="${repo_dir%/}"
  [[ "$repo_dir" == "collected" ]] && continue
  [[ -d "$repo_dir/.git" ]] || continue
  printf '%s' " git pull ${repo_dir}... "
  # A failed pull is reported but does not stop the collection
  if git -C "$repo_dir" pull --quiet 2>&1; then
    echo "ok"
  else
    echo "FAILED (continuing anyway)"
  fi
done
echo ""

repo_count=0
total_files=0
updated_repos=()

for repo_dir in */; do
  repo_dir="${repo_dir%/}"

  # Skip the output directory itself
  [[ "$repo_dir" == "collected" ]] && continue
  # Every remaining directory is processed (also guards the unmatched glob)
  [[ -d "$repo_dir" ]] || continue

  repo_count=$((repo_count + 1))
  file_count=0
  skipped_count=0
  output_file="${OUTPUT_DIR}/repository-${repo_dir}.txt"

  # Check if repo changed since last collection.
  # NOTE: for a directory without git history the commit time is 0, so once
  # its output file exists it is only rebuilt with --force.
  if [[ "$FORCE" == false && -f "$output_file" ]]; then
    output_mtime=$(stat -c %Y "$output_file" 2>/dev/null || echo 0)
    # Get latest commit timestamp in the repo
    repo_last_commit=$(git -C "$repo_dir" log -1 --format=%ct 2>/dev/null || echo 0)
    if [ "$repo_last_commit" -le "$output_mtime" ] 2>/dev/null; then
      echo " Skipping: ${repo_dir}/ (no changes)"
      continue
    fi
  fi

  updated_repos+=("$repo_dir")
  echo " Updating: ${repo_dir}/"

  # Write UTF-8 BOM (raw bytes) first, then the text header after it
  printf '\xEF\xBB\xBF' >"$output_file"
  {
    echo "================================================================================"
    echo "Repository: ${repo_dir}"
    echo "Collected: ${TIMESTAMP}"
    echo "================================================================================"
    echo ""
  } >>"$output_file"

  # First pass: READMEs go to the top of the output. The same directory
  # exclusions and "secret" filter as the second pass apply, so dependency
  # READMEs (node_modules, venv, ...) are not collected.
  while IFS= read -r -d '' file; do
    rel_path="${file#"${repo_dir}/"}"

    if is_secret_path "$rel_path" || [[ ! -r "$file" ]]; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    emit_file_section "$rel_path" "$file" >>"$output_file"
    file_count=$((file_count + 1))
  done < <(find "$repo_dir" -type f -iname 'readme.md' \
    "${FIND_SKIP_ARGS[@]}" -print0 | sort -z)

  # Second pass: everything else (excluding READMEs)
  while IFS= read -r -d '' file; do
    rel_path="${file#"${repo_dir}/"}"

    # Skip READMEs (already handled above)
    if is_readme "$file"; then
      continue
    fi

    # Skip files with "secret" in the name
    if is_secret_path "$rel_path"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip by pattern
    if should_skip_file "$file"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip files that cannot be read instead of aborting the whole run
    if [[ ! -r "$file" ]]; then
      emit_skip_note "$rel_path" "unreadable file" >>"$output_file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip large files (byte size rounded up to whole KB)
    file_size_bytes=$(wc -c <"$file")
    file_size_kb=$(((file_size_bytes + 1023) / 1024))
    if ((file_size_kb > MAX_FILE_SIZE_KB)); then
      emit_skip_note "$rel_path" \
        "file too large (${file_size_kb}KB > ${MAX_FILE_SIZE_KB}KB limit)" \
        >>"$output_file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip binary files
    if is_binary "$file"; then
      emit_skip_note "$rel_path" "binary file" >>"$output_file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Include the file (strip BOM from source to avoid inline BOMs)
    emit_file_section "$rel_path" "$file" >>"$output_file"
    file_count=$((file_count + 1))
  done < <(find "$repo_dir" -type f "${FIND_SKIP_ARGS[@]}" -print0 | sort -z)

  # Append summary at the end
  {
    echo "================================================================================"
    echo "Summary: ${file_count} files included, ${skipped_count} skipped"
    echo "================================================================================"
  } >>"$output_file"

  total_files=$((total_files + file_count))
  echo " -> ${output_file} (${file_count} files, ${skipped_count} skipped)"
done

echo ""
echo "Done! ${repo_count} repos found, ${#updated_repos[@]} updated, ${total_files} total files collected."

if [ ${#updated_repos[@]} -gt 0 ]; then
  echo ""
  echo "Updated repos (upload these to project):"
  for repo in "${updated_repos[@]}"; do
    echo " -> ${OUTPUT_DIR}/repository-${repo}.txt"
  done
else
  echo "All repos up to date, nothing to upload."
fi