Hi all,
ich wollte nur mal kurz ein script sharen.
urlgrep:
#!/usr/bin/env bash
#
# urlgrep - print the URLs referenced by one or more web pages.
#
# Usage: urlgrep [-c CMD] [-r] [-t] [-regex RE] [-iregex RE] URL...
#
#   -c CMD      command used to fetch the page source; it is split on
#               whitespace and called with the URL appended as last argument
#   -r          "recursive": also dig out URLs embedded inside other URLs
#               (e.g. in query parameters); only absolute URLs are kept
#   -t          scan the whole document text for URL patterns, not only the
#               values of href=/src= attributes
#   -regex RE   keep only URLs matching the extended regex RE
#   -iregex RE  same as -regex, but case-insensitive
#
# Example: urlgrep http://www.heise.de
#
# Note: `set -e` is deliberately not used; grep exiting with status 1 on
# "no match" is a normal situation throughout this script.

htmlcommand=''
recursive=''
textscan=''
regex=''
iregex=''
url_list=()

while (( $# > 0 )); do
  case "$1" in
    -c | -regex | -iregex)
      if (( $# < 2 )); then
        printf 'urlgrep: option %s requires an argument\n' "$1" >&2
        exit 2
      fi
      case "$1" in
        -c) htmlcommand=$2 ;;
        -regex) regex=$2 ;;
        -iregex) iregex=$2 ;;
      esac
      shift
      ;;
    -r) recursive="true" ;;
    -t) textscan="true" ;;
    # Everything else is a URL; URLs are processed in the order given.
    *) url_list+=("$1") ;;
  esac
  shift
done

# One "URL character": anything except space, <, >, double quote, backslash
# and single quote (the \' inside double quotes puts both characters in the set).
word="[^ <>\"\']"

if [[ -z "$htmlcommand" ]]; then
  # default html get command
  htmlcommand="links -source"
  # and some alternatives
  # htmlcommand="curl -o -"
  # htmlcommand="w3c -n"
  # htmlcommand="w3m -dump_source"
fi

#######################################
# Turn a link into an absolute one and collapse its ".." path segments.
# Links starting with ftp://, http://, https:// or "/" are taken as they
# are; anything else is resolved against "${root}${path}/".
# ".." never climbs above the top of the path, and a query string or
# fragment ("?..." / "#...") is left untouched.
# Globals:   root (read) - scheme and host of the page, e.g. http://host
#            path (read) - directory of the page, e.g. /a/b
# Arguments: $1 - link to resolve
# Outputs:   the resolved link on stdout
#######################################
make_absolute() {
  local url=$1
  local prefix='' lead='' suffix='' out=''
  local head rest authority seg more i n=0
  local -a stack=()

  case "$url" in
    ftp://* | http://* | https://* | /*) ;;
    # ${path%/} tolerates a path given with a trailing slash.
    *) url="${root}${path%/}/${url}" ;;
  esac

  # Fast path: nothing to collapse.
  if [[ "$url" != *..* ]]; then
    printf '%s\n' "$url"
    return 0
  fi

  # Split off query string / fragment so that ".." inside them is not touched.
  head=${url%%[?#]*}
  suffix=${url:${#head}}

  # Split off scheme://authority so that it can never be popped by "..".
  case "$head" in
    ftp://* | http://* | https://*)
      rest=${head#*://}
      authority=${rest%%/*}
      prefix="${head%%://*}://${authority}"
      head=${rest:${#authority}}
      ;;
  esac

  if [[ "$head" == /* ]]; then
    lead=/
    head=${head#/}
  fi

  # Walk the path segment by segment, keeping the surviving ones in stack[0..n-1].
  while :; do
    if [[ "$head" == */* ]]; then
      seg=${head%%/*}
      head=${head#*/}
      more=1
    else
      seg=$head
      more=0
    fi
    if [[ "$seg" == .. ]]; then
      if (( n > 0 )); then
        n=$(( n - 1 ))
      fi
      # A trailing ".." names a directory: keep the trailing slash.
      if (( ! more )); then
        stack[n]=''
        n=$(( n + 1 ))
      fi
    else
      stack[n]=$seg
      n=$(( n + 1 ))
    fi
    (( more )) || break
  done

  for (( i = 0; i < n; i++ )); do
    if (( i > 0 )); then
      out+=/
    fi
    out+=${stack[i]}
  done

  printf '%s%s%s%s\n' "$prefix" "$lead" "$out" "$suffix"
}

#######################################
# Drop duplicate lines, keeping the first occurrence and the input order.
# Inputs:    lines on stdin
# Outputs:   unique lines on stdout
#######################################
_settify() {
  awk '!seen[$0]++'
}

#######################################
# Fetch one page and print every URL found in it, one per line
# (duplicates are not removed here).
# Globals:   htmlcommand, word, textscan, recursive, regex, iregex (read)
# Arguments: $1 - URL of the page
# Outputs:   URLs on stdout, diagnostics on stderr
# Returns:   1 if the fetch command fails, 0 otherwise
#######################################
_main() {
  local URL=$1
  local root path tail html urls line
  local url_re="(ftp|https?)://(${word}+(:${word}+)?@)?(${word}+\.)+${word}+(/${word}*)*"
  local -a fetch_cmd=()

  # root = scheme://host (or just the host for scheme-less URLs).
  case "$URL" in
    ftp://* | http://* | https://*)
      tail=${URL#*://}
      root="${URL%%://*}://${tail%%/*}"
      ;;
    *) root=${URL%%/*} ;;
  esac

  # path = directory part of the URL path, without trailing slash.
  tail=${URL#"$root"}
  tail=${tail%%[?#]*}
  if [[ "$tail" == */* ]]; then
    path=${tail%/*}
  else
    path=''
  fi

  # Split the fetch command into words without glob expansion.
  read -r -a fetch_cmd <<<"$htmlcommand"
  if ! html=$("${fetch_cmd[@]}" "$URL"); then
    printf 'urlgrep: cannot fetch %s\n' "$URL" >&2
    return 1
  fi

  # Values of href=/src= attributes, quoted or not.
  urls=$(
    printf '%s\n' "$html" \
      | grep -o -E "(href|src)=[\"\']?${word}+[\"\']?" \
      | sed -E "s/^(href|src)=[\"\']?(${word}+)[\"\']?/\2/" \
      | grep -v "^mailto:"
  )

  if [[ -n "$textscan" ]]; then
    urls+=$'\n'$(
      printf '%s\n' "$html" \
        | grep -o -E "$url_re" \
        | grep -v "^mailto:"
    )
  fi

  # Make every link absolute; host-relative links ("/x") get the root prepended.
  urls=$(
    while read -r line; do
      # Blank lines would otherwise resolve to the bare base URL.
      [[ -n "$line" ]] || continue
      if [[ "$line" == /* ]]; then
        line="${root}${line}"
      fi
      make_absolute "$line"
    done <<<"$urls"
  )

  if [[ -n "$recursive" ]]; then
    # Cut every URL at "?" and "&" and keep the absolute URLs among the pieces.
    urls=$(printf '%s\n' "$urls" | grep -o "[^?&]*" | grep -o -E "$url_re")
  fi

  if [[ -n "$regex" ]]; then
    urls=$(printf '%s\n' "$urls" | grep -E -e "$regex")
  fi

  if [[ -n "$iregex" ]]; then
    urls=$(printf '%s\n' "$urls" | grep -i -E -e "$iregex")
  fi

  # Strip one trailing slash and drop empty lines.
  printf '%s\n' "$urls" | sed -e 's;/$;;' -e '/^$/d'
}

rc=0
for url in "${url_list[@]}"; do
  _main "$url" | _settify
  # The pipeline status is _settify's; look at _main's own status instead.
  (( PIPESTATUS[0] == 0 )) || rc=1
done

if (( rc != 0 )); then
  exit "$rc"
fi
Usage: urlgrep http://www.heise.de
Näher gehe ich nicht darauf ein, da der Name selbsterklärend ist. Have Fun.
Gruss,
lfhelper.
EDIT: Update1.
EDIT: Update2.
EDIT: Update3.
EDIT: Update4.
EDIT: Update5.
EDIT: Update6.
EDIT: Update7.
EDIT: Update8.
EDIT: Update9.
Lesezeichen