#!/bin/bash
# collect-repos.sh
# Run from ~/git/ - combines all files from each repo into a single .txt file
# Output goes to ~/git/collected/ folder
# Output files are UTF-8 with BOM for maximum compatibility
#
# Usage: collect-repos.sh [--force|-f]
#   --force, -f   rebuild every output file even if it looks up to date

set -euo pipefail

# Only the first argument is inspected. Anything other than the force flag is
# reported on stderr (and otherwise ignored) so a typo such as "--froce" does
# not silently run in incremental mode.
FORCE=false
case "${1:-}" in
  --force | -f) FORCE=true ;;
  "") ;;
  *) printf 'Warning: ignoring unknown argument: %s\n' "$1" >&2 ;;
esac

# Directory containing this script; the output folder lives next to it.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
OUTPUT_DIR="${SCRIPT_DIR}/collected"
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Max file size to include (skip large binaries/assets)
readonly MAX_FILE_SIZE_KB=500

# Patterns to skip
SKIP_DIRS=(".git" "node_modules" "__pycache__" ".venv" "venv" ".cache")
SKIP_FILES=("*.png" "*.jpg" "*.jpeg" "*.gif" "*.webp" "*.ico" "*.svg" "*.mp4" "*.mp3" "*.zip" "*.tar" "*.gz" "*.bz2" "*.7z" "*.bin" "*.exe" "*.so" "*.o" "*.pyc" "*.woff" "*.woff2" "*.ttf" "*.eot" "*.pdf")
readonly SKIP_DIRS SKIP_FILES

mkdir -p -- "$OUTPUT_DIR"

#######################################
# Build find exclusion args for directories.
# Globals:   SKIP_DIRS (read)
# Arguments: none
# Outputs:   one line on stdout of the form
#              -path '*/DIR' -o -path '*/DIR/*' -o ...
#            (leading space, no trailing -o). The single quotes are literal
#            characters in the string, so it is only usable as find arguments
#            after being re-parsed by the shell.
# NOTE(review): no caller of this function is visible in this script.
#######################################
build_skip_args() {
  local args=""
  local dir # keep the loop variable out of the global scope
  for dir in "${SKIP_DIRS[@]}"; do
    args="$args -path '*/${dir}' -o -path '*/${dir}/*' -o"
  done
  # Remove trailing -o
  printf '%s\n' "${args% -o}"
}

#######################################
# Check if file matches skip patterns.
# Globals:   SKIP_FILES (read) - glob patterns matched against the file name
# Arguments: $1 - path of the file (only the part after the last "/" is used)
# Returns:   0 if the name matches any pattern, 1 otherwise
#######################################
should_skip_file() {
  local file="$1"
  # Parameter expansion instead of basename(1): no fork, and names that start
  # with "-" are not mistaken for options.
  local name="${file##*/}"
  local pattern
  for pattern in "${SKIP_FILES[@]}"; do
    # shellcheck disable=SC2254 # pattern must stay unquoted so it acts as a glob
    case "$name" in
      $pattern) return 0 ;;
    esac
  done
  return 1
}

# Known text file extensions - always include these
TEXT_EXTENSIONS=("sh" "bash" "zsh" "yaml" "yml" "json" "toml" "ini" "cfg" "conf" "txt" "md" "py" "js" "ts" "html" "css" "xml" "env" "service" "timer" "sql" "lua" "rb" "go" "mod" "sum" "rs" "java" "c" "h" "cpp" "hpp" "Makefile" "Dockerfile" "csv" "log" "properties" "rules")

#######################################
# Decide from the file name alone whether a file is known to be text.
# Globals:   TEXT_EXTENSIONS (read)
# Arguments: $1 - path of the file (only the part after the last "/" is used)
# Returns:   0 if the name is one of the well-known extensionless names or
#            its extension (text after the last ".") is in TEXT_EXTENSIONS,
#            1 otherwise
#######################################
is_known_text() {
  local file="$1"
  local name="${file##*/}"
  local ext text_ext

  # Files without extension but known names
  case "$name" in
    Makefile | Dockerfile | Vagrantfile | Gemfile | Rakefile | .gitignore | .env*) return 0 ;;
  esac

  # A name without any "." has no extension. Without this guard "${name##*.}"
  # would yield the whole name, so a file called "go" or "c" would be treated
  # as if it had that extension.
  case "$name" in
    *.*) ext="${name##*.}" ;;
    *) return 1 ;;
  esac

  for text_ext in "${TEXT_EXTENSIONS[@]}"; do
    if [ "$ext" = "$text_ext" ]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Check if file is binary.
# Arguments: $1 - path of the file
# Returns:   0 if the file should be treated as binary, 1 if it is text
# Notes:     relies on is_known_text (defined elsewhere in this script) and on
#            the file(1), head, tr and wc utilities.
#######################################
is_binary() {
  local file="$1"
  local encoding sample_len stripped_len

  # Known text extensions always pass
  if is_known_text "$file"; then
    return 1
  fi

  # Empty (or missing) files contain nothing binary; handled up front because
  # file(1) reports the encoding of an empty file as "binary".
  if [ ! -s "$file" ]; then
    return 1
  fi

  # Ask file(1) for the character encoding only. Matching words in its
  # human-readable description misfires: shebang scripts are described as
  # "... text executable", and the description line also contains the file
  # name itself.
  encoding=$(file -b --mime-encoding -- "$file" 2>/dev/null) || encoding=""
  if [ "$encoding" = "binary" ]; then
    return 0
  fi

  # Also check the first 512 bytes for NUL bytes: if deleting NULs makes the
  # sample shorter, it contained at least one. (Portable replacement for
  # grep -P '\x00'.)
  sample_len=$(head -c 512 -- "$file" | wc -c) || return 1
  stripped_len=$(head -c 512 -- "$file" | tr -d '\000' | wc -c) || return 1
  if [ "$stripped_len" -lt "$sample_len" ]; then
    return 0
  fi

  return 1
}

#######################################
# Cat a file, stripping UTF-8 BOM if present (avoids inline BOMs in combined
# output).
# Arguments: $1 - path of the file
# Outputs:   the file content on stdout, minus a leading EF BB BF if present
#######################################
cat_strip_bom() {
  local file="$1"
  local header

  # Hex dump of the first three bytes, e.g. "efbbbf" for a UTF-8 BOM.
  # "--" keeps a path that starts with "-" from being parsed as options.
  header=$(head -c 3 -- "$file" | od -A n -t x1 | tr -d ' \n')

  if [ "$header" = "efbbbf" ]; then
    # Skip first 3 bytes (the BOM): tail -c +4 starts output at byte 4
    tail -c +4 -- "$file"
  else
    cat -- "$file"
  fi
}

# The loops below glob "*/" relative to the current directory, while the
# banner reports SCRIPT_DIR as the working directory and the output folder is
# derived from SCRIPT_DIR. Change into it so all three agree even when the
# script is started from somewhere else.
cd -- "$SCRIPT_DIR" || {
  printf 'Error: cannot change to %s\n' "$SCRIPT_DIR" >&2
  exit 1
}

echo "=== Repo Collector ==="
echo "Timestamp: $TIMESTAMP"
echo "Working dir: $SCRIPT_DIR"
echo "Output dir: $OUTPUT_DIR"
if [ "$FORCE" = true ]; then
  echo "Mode: FORCE (rebuilding all)"
fi
echo ""

# --- Pull all repos first ---
echo "--- Pulling all repos ---"
for repo_dir in */; do
  repo_dir="${repo_dir%/}"

  # Skip the output folder and anything that has no .git directory.
  if [ "$repo_dir" = "collected" ] || [ ! -d "$repo_dir/.git" ]; then
    continue
  fi

  printf ' git pull %s... ' "$repo_dir"
  # A failed pull is reported but does not stop the run; git's own error
  # output is folded into stdout so it appears next to the repo name.
  if git -C "$repo_dir" pull --quiet 2>&1; then
    echo "ok"
  else
    echo "FAILED (continuing anyway)"
  fi
done
echo ""

repo_count=0
total_files=0
updated_repos=()

# Per-repository counters; write_repo_dump resets them for every repository.
file_count=0
skipped_count=0

# find(1) filters shared by both collection passes, derived from SKIP_DIRS so
# the README pass and the main pass exclude exactly the same directories.
# '*/.git' additionally drops a plain file named .git.
find_excludes=(! -path '*/.git')
for skip_dir in "${SKIP_DIRS[@]}"; do
  find_excludes+=(! -path "*/${skip_dir}/*")
done

#######################################
# Decide whether an existing output file is still current for a repository.
# Arguments: $1 - repository directory, $2 - output file
# Returns:   0 if the output exists, the latest commit is not newer than it,
#            and nothing outside .git in the working tree is newer than it;
#            1 otherwise (the repository must be collected again)
#######################################
repo_is_current() {
  local repo_dir="$1" output_file="$2"
  local output_mtime repo_last_commit newer_entry

  [ -f "$output_file" ] || return 1

  output_mtime=$(stat -c %Y "$output_file" 2>/dev/null || echo 0)
  # Get latest commit timestamp in the repo (0 if this is not a git repo)
  repo_last_commit=$(git -C "$repo_dir" log -1 --format=%ct 2>/dev/null || echo 0)
  # A failed numeric comparison counts as "not current".
  if ! [ "$repo_last_commit" -le "$output_mtime" ] 2>/dev/null; then
    return 1
  fi

  # The commit time alone is not enough: a pull can bring in commits that were
  # authored before the last collection, and uncommitted edits or non-git
  # directories have no commit time at all. Any file or directory modified
  # after the output was written also makes it stale. .git is pruned because
  # its contents change on every fetch.
  newer_entry=$(find "$repo_dir" -name .git -prune -o \
    -newer "$output_file" -print -quit 2>/dev/null) || newer_entry=""
  [ -z "$newer_entry" ]
}

# Write one file section: header line, BOM-stripped content, two blank lines.
# Arguments: $1 - path shown in the header, $2 - file to read
emit_file_section() {
  local rel_path="$1" file="$2"
  echo "--- FILE: ${rel_path} ---"
  cat_strip_bom "$file"
  echo ""
  echo ""
}

# Write a placeholder section for a file whose content is left out.
# Arguments: $1 - path shown in the header, $2 - reason text
emit_skip_notice() {
  local rel_path="$1" reason="$2"
  echo "--- FILE: ${rel_path} ---"
  echo "[SKIPPED: ${reason}]"
  echo ""
}

#######################################
# Write the complete combined dump for one repository to stdout.
# Globals:   file_count, skipped_count (reset, then written)
#            find_excludes, MAX_FILE_SIZE_KB, TIMESTAMP (read)
# Arguments: $1 - repository directory
# Outputs:   UTF-8 BOM, header, README sections, all other file sections,
#            summary footer
#######################################
write_repo_dump() {
  local repo_dir="$1"
  local file rel_path size_bytes size_kb
  local -r max_bytes=$((MAX_FILE_SIZE_KB * 1024))

  file_count=0
  skipped_count=0

  # Write UTF-8 BOM + header
  # printf writes raw bytes; echo writes the text header after it
  printf '\xEF\xBB\xBF'
  echo "================================================================================"
  echo "Repository: ${repo_dir}"
  echo "Collected: ${TIMESTAMP}"
  echo "================================================================================"
  echo ""

  # First pass: collect READMEs
  while IFS= read -r -d '' file; do
    # Inner quotes make the repo name a literal prefix rather than a glob.
    rel_path="${file#"${repo_dir}"/}"

    # Same rule as for every other file: nothing with "secret" in its path.
    if grep -qi 'secret' <<<"$rel_path"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    emit_file_section "$rel_path" "$file"
    file_count=$((file_count + 1))
  done < <(find "$repo_dir" -type f -iname 'readme.md' \
    "${find_excludes[@]}" -print0 | sort -z)

  # Second pass: everything else (excluding READMEs)
  while IFS= read -r -d '' file; do
    rel_path="${file#"${repo_dir}"/}"

    # Skip READMEs (already handled above)
    case "${file##*/}" in
      [Rr][Ee][Aa][Dd][Mm][Ee].[Mm][Dd]) continue ;;
    esac

    # Skip files with "secret" in the name, and files matching skip patterns
    if grep -qi 'secret' <<<"$rel_path" || should_skip_file "$file"; then
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip large files. wc -c gives the real byte length; du reports
    # allocated disk blocks, which differs for tiny, sparse or compressed
    # files.
    size_bytes=$(wc -c < "$file") || size_bytes=0
    if [ "$size_bytes" -gt "$max_bytes" ]; then
      size_kb=$(((size_bytes + 1023) / 1024))
      emit_skip_notice "$rel_path" \
        "file too large (${size_kb}KB > ${MAX_FILE_SIZE_KB}KB limit)"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Skip binary files
    if is_binary "$file"; then
      emit_skip_notice "$rel_path" "binary file"
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Include the file (strip BOM from source to avoid inline BOMs)
    emit_file_section "$rel_path" "$file"
    file_count=$((file_count + 1))
  done < <(find "$repo_dir" -type f "${find_excludes[@]}" -print0 | sort -z)

  # Append summary at the end
  echo "================================================================================"
  echo "Summary: ${file_count} files included, ${skipped_count} skipped"
  echo "================================================================================"
}

for repo_dir in */; do
  repo_dir="${repo_dir%/}"

  # Skip the output directory itself, and the literal "*" left behind when
  # the glob matches nothing.
  if [ "$repo_dir" = "collected" ] || [ ! -d "$repo_dir" ]; then
    continue
  fi

  repo_count=$((repo_count + 1))
  output_file="${OUTPUT_DIR}/repository-${repo_dir}.txt"

  # Check if repo changed since last collection
  if [ "$FORCE" = false ] && repo_is_current "$repo_dir" "$output_file"; then
    echo " Skipping: ${repo_dir}/ (no changes)"
    continue
  fi

  updated_repos+=("$repo_dir")
  echo " Updating: ${repo_dir}/"

  # Build the dump in a temporary file and rename it when complete, so an
  # aborted run cannot leave a truncated output that looks up to date.
  tmp_file="${output_file}.tmp"
  write_repo_dump "$repo_dir" > "$tmp_file"
  mv -f -- "$tmp_file" "$output_file"

  total_files=$((total_files + file_count))
  echo " -> ${output_file} (${file_count} files, ${skipped_count} skipped)"
done

#######################################
# Print the end-of-run summary.
# Globals:   repo_count, total_files, updated_repos, OUTPUT_DIR (read)
# Outputs:   a blank line, the totals line, then either the list of output
#            files that were rebuilt or a note that nothing changed
#######################################
print_summary() {
  local repo

  echo ""
  echo "Done! ${repo_count} repos found, ${#updated_repos[@]} updated, ${total_files} total files collected."

  if [ "${#updated_repos[@]}" -gt 0 ]; then
    echo ""
    echo "Updated repos (upload these to project):"
    for repo in "${updated_repos[@]}"; do
      echo " -> ${OUTPUT_DIR}/repository-${repo}.txt"
    done
  else
    echo "All repos up to date, nothing to upload."
  fi
}

print_summary