Push old code

This commit is contained in:
Mirror 2021-02-20 20:40:56 +03:00
parent 80240387fa
commit 7fc20571ba
8 changed files with 662 additions and 2 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
pixiv-scrapper
.directory

View File

@ -1,2 +0,0 @@
# pixiv-scrapper

159
main.go Normal file
View File

@ -0,0 +1,159 @@
package main
import (
"flag"
"fmt"
"log"
"./pixiv"
)
// cmdlineT collects every option that flags() registers on the command line.
type cmdlineT struct {
	// Fetch modes; main runs each one whose switch is set, for userID.
	followings, favs, illusts bool
	userID                    string

	// Request configuration.
	setCookies, setProxy, userAgent string

	// Output configuration.
	workDirectory, logErrorsFile string
	printID, printURL            bool

	test    bool
	threads int
}
// Package-level state shared by main and the fetch* helpers.
var (
// cmdline holds the parsed command-line options; flags() fills it in.
cmdline cmdlineT
// cookies is not referenced anywhere in the visible part of this file.
cookies string
// pix is the pixiv client; main assigns it before any fetch* helper runs.
pix pixiv.Pixiv
)
// main parses the command line and runs the requested fetch modes.
//
// The work lives in run so that its deferred pix.Close() executes before
// log.Fatal terminates the process; log.Fatal exits without running deferred
// calls of the function it is invoked from.
func main() {
	flags()
	if err := run(); err != nil {
		log.Fatal(err.Error())
	}
}

// run builds the pixiv client from the parsed options, applies the optional
// user agent, proxy and work directory, then executes the selected modes in
// the order illusts, favs, followings. It stops at the first error.
func run() error {
	pix = pixiv.New(cmdline.setCookies, cmdline.logErrorsFile, cmdline.threads)
	defer pix.Close()

	// The -user-agent flag used to be parsed and then ignored.
	if len(cmdline.userAgent) > 0 {
		pix.Ua = cmdline.userAgent
	}
	if len(cmdline.setProxy) > 0 {
		if err := pix.SetProxy(cmdline.setProxy); err != nil {
			return err
		}
	}
	if len(cmdline.workDirectory) > 0 {
		pix.WorkDirectory = cmdline.workDirectory
	}

	if cmdline.illusts {
		if err := fetchIllusts(cmdline.userID); err != nil {
			return err
		}
	}
	if cmdline.favs {
		if err := fetchBookmarks(cmdline.userID); err != nil {
			return err
		}
	}
	if cmdline.followings {
		if err := fetchFollows(cmdline.userID); err != nil {
			return err
		}
	}
	return nil
}
// fetchFollows prints one line per account followed by userID, in the form
// "<userID> followed_user defered <userName>".
func fetchFollows(userID string) (err error) {
	followed, err := pix.GetFollows(userID)
	if err != nil {
		return err
	}
	for idx := range followed {
		account := &followed[idx]
		fmt.Printf("%s %s %s %s\n", account.UserID, "followed_user", "defered", account.UserName)
	}
	return nil
}
// fetchBookmarks processes every bookmark of userID according to the
// command-line mode:
//
//   - with -print-id it prints "<id> illust defered <title> <alt>" and does
//     nothing else for that work;
//   - with -print-url it loads the full illust and prints the original URL of
//     each page;
//   - otherwise it queues the work for download exactly once.
//
// It returns the first error from listing the bookmarks or from loading a
// work's details.
func fetchBookmarks(userID string) (err error) {
	illusts, err := pix.GetBookmarks(userID)
	if err != nil {
		return err
	}
	for _, illust := range illusts {
		if cmdline.printID {
			fmt.Printf("%s %s %s %s %s\n", illust.ID, "illust", "defered", illust.Title, illust.Alt)
			continue
		}
		if !illust.Complited() {
			if err := pix.ComplateIllust(&illust); err != nil {
				return err
			}
		}
		if cmdline.printURL {
			for _, page := range illust.Pages {
				fmt.Println(page.URLs.Original)
			}
			continue
		}
		// Queue only the current work. The previous code re-sent the whole
		// bookmark list on every iteration, downloading each work N times.
		pix.DownloadChannel <- illust
	}
	return nil
}
// fetchIllusts lists the works of userID and, depending on the command line,
// prints their IDs (-print-id), prints the original URL of every page
// (-print-url), or queues each work for download. -print-id takes precedence
// over -print-url.
func fetchIllusts(userID string) (err error) {
	works, err := pix.GetUserIllustsID(userID)
	if err != nil {
		return err
	}
	switch {
	case cmdline.printID:
		for idx := range works {
			fmt.Println(works[idx].ID)
		}
	case cmdline.printURL:
		for idx := range works {
			full, err := pix.GetIllust(works[idx].ID)
			if err != nil {
				return err
			}
			for _, pg := range full.Pages {
				fmt.Println(pg.URLs.Original)
			}
		}
	default:
		for _, work := range works {
			pix.DownloadChannel <- work
		}
	}
	return nil
}
// flags registers every command-line option on the default flag set, binds
// each one to its field of cmdline, and parses os.Args.
func flags() {
	type boolFlag struct {
		dst         *bool
		name, usage string
	}
	type stringFlag struct {
		dst               *string
		name, def, usage string
	}
	opts := &cmdline

	// Every boolean switch defaults to false.
	for _, f := range []boolFlag{
		{&opts.illusts, "illusts", "download illusts"},
		{&opts.followings, "followings", "fetch user followings"},
		{&opts.favs, "favs", "fetch user favorites"},
		{&opts.printID, "print-id", "only print IDs, not download"},
		{&opts.printURL, "print-url", "only print URLs, not download"},
		{&opts.test, "test", "test"},
	} {
		flag.BoolVar(f.dst, f.name, false, f.usage)
	}

	for _, f := range []stringFlag{
		{&opts.userID, "user-id", "", "set userID to extract followings, favs or illustrations from"},
		{&opts.setCookies, "set-cookies", "", "set cookies for request. PHPSESSID is REQUIRED if you want to see R-18 works. SHOULD be something like PHPSESSID=12345678_MEPujYEEwOfXKYdB9TXeMIxaq7pQPYXA"},
		{&opts.setProxy, "set-proxy", "", "set proxy for request"},
		{&opts.workDirectory, "workdir", "", "set directory to save images in"},
		{&opts.userAgent, "user-agent", "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0", "set user agent"},
		{&opts.logErrorsFile, "log-errors", "", "file to strore failed items"},
	} {
		flag.StringVar(f.dst, f.name, f.def, f.usage)
	}

	flag.IntVar(&opts.threads, "threads", 1, "threads number")
	flag.Parse()
}

70
pixiv/ajax.go Normal file
View File

@ -0,0 +1,70 @@
package pixiv
import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"log"
"net/http"
)
// ajaxResp is the JSON envelope that ajaxRequest decodes every response into.
type ajaxResp struct {
// Error is checked by ajaxRequest; when true the request is reported as failed.
Error bool `json:"error"`
// Message becomes the text of the returned error when Error is true.
Message string `json:"message"`
// Body is kept undecoded and handed back to the caller of ajaxRequest.
Body json.RawMessage `json:"body"`
}
// ajaxRequest sends a GET request for addr with the session cookie, the
// configured user agent and a pixiv Referer header, and returns the "body"
// member of the JSON envelope.
//
// The request is attempted up to p.RetryCount times (at least once). An
// attempt fails on a transport error, a status other than 200 OK, or an
// error while reading the body. If every attempt fails, the last failure is
// returned; a bad status used to be returned as a nil error with a nil body.
// An envelope with "error": true yields an error carrying its message.
func (p *Pixiv) ajaxRequest(addr string) (r json.RawMessage, err error) {
	req, err := http.NewRequest("GET", addr, nil)
	if err != nil {
		return nil, fmt.Errorf("NewRequest failed (%s): %w", addr, err)
	}
	req.AddCookie(&p.phpsessid)
	req.Header.Set("User-Agent", p.Ua)
	req.Header.Set("Referer", "https://www.pixiv.net")

	// A non-positive RetryCount would otherwise skip the loop entirely.
	attempts := p.RetryCount
	if attempts < 1 {
		attempts = 1
	}

	var respData []byte
	for i := 1; i <= attempts; i++ {
		respData, err = p.ajaxAttempt(req)
		if err == nil {
			break
		}
		log.Printf("GET %s failed: %s", addr, err.Error())
		if i < attempts {
			log.Printf("Retry %d of %d...", i, attempts-1)
		}
	}
	if err != nil {
		return nil, fmt.Errorf("GET %s: giving up after %d attempt(s): %w", addr, attempts, err)
	}

	var ajax ajaxResp
	if err = json.Unmarshal(respData, &ajax); err != nil {
		return nil, fmt.Errorf("decoding response of %s: %w", addr, err)
	}
	if ajax.Error {
		return nil, errors.New(ajax.Message)
	}
	return ajax.Body, nil
}

// ajaxAttempt performs req once and returns the whole response body. The body
// is closed before returning, so retries do not accumulate open responses.
func (p *Pixiv) ajaxAttempt(req *http.Request) ([]byte, error) {
	resp, err := p.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("bad status code: %d", resp.StatusCode)
	}
	return ioutil.ReadAll(resp.Body)
}

122
pixiv/downloader.go Normal file
View File

@ -0,0 +1,122 @@
package pixiv
import (
"fmt"
"io"
"log"
"net/http"
"os"
"strings"
)
// downloadIllust saves every page of i as
// "<WorkDirectory>/<userID>_<userAccount>/<illustID>_p<page>.<ext>".
//
// If i has not been completed yet, its details are loaded first. The function
// stops at the first page that fails and returns that error.
func (p *Pixiv) downloadIllust(i Illust) (err error) {
	if !i.Complited() {
		if err = p.ComplateIllust(&i); err != nil {
			return err
		}
	}
	directory := fmt.Sprintf("%s/%s_%s/", p.WorkDirectory, i.UserID, i.UserAccount)
	for pageNumber, page := range i.Pages {
		filename := fmt.Sprintf("%s_p%d", i.ID, pageNumber)
		if ext := urlExtension(page.URLs.Original); ext != "" {
			filename += "." + ext
		}
		if err = p.downloadPage(page.URLs.Original, directory, filename); err != nil {
			return err
		}
	}
	return nil
}

// downloadPage creates directory if needed and downloads addr into
// directory+filename. The file is closed before returning, so a multi-page
// work never holds more than one file open, and a failed close is reported.
// A file that could not be written completely is removed.
func (p *Pixiv) downloadPage(addr, directory, filename string) (err error) {
	if err = os.MkdirAll(directory, 0755); err != nil {
		return err
	}
	path := directory + filename
	outfile, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := outfile.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			// Best effort: the download error is what the caller needs to see.
			_ = os.Remove(path)
		}
	}()
	return p.downloadTo(addr, outfile)
}

// urlExtension returns the extension, without the dot, of the last path
// segment of rawURL, ignoring any query string or fragment. It returns "" when
// that segment has no extension.
func urlExtension(rawURL string) string {
	if cut := strings.IndexAny(rawURL, "?#"); cut >= 0 {
		rawURL = rawURL[:cut]
	}
	name := rawURL[strings.LastIndex(rawURL, "/")+1:]
	dot := strings.LastIndex(name, ".")
	if dot < 0 || dot == len(name)-1 {
		return ""
	}
	return name[dot+1:]
}
// downloadTo fetches addr and writes the response body into file.
//
// The request carries the session cookie, the configured user agent and a
// pixiv Referer header. It is attempted up to p.RetryCount times (at least
// once); before each write the file is emptied and rewound, so a partially
// written earlier attempt leaves no residue. If no attempt succeeds, the last
// failure is returned; a bad status on the final attempt used to be reported
// as success.
func (p *Pixiv) downloadTo(addr string, file *os.File) (err error) {
	req, err := http.NewRequest("GET", addr, nil)
	if err != nil {
		return fmt.Errorf("NewRequest failed (%s): %w", addr, err)
	}
	req.AddCookie(&p.phpsessid)
	req.Header.Set("User-Agent", p.Ua)
	req.Header.Set("Referer", "https://www.pixiv.net")

	// A non-positive RetryCount would otherwise download nothing and return nil.
	attempts := p.RetryCount
	if attempts < 1 {
		attempts = 1
	}
	for i := 1; i <= attempts; i++ {
		if err = p.copyResponseTo(req, file); err == nil {
			return nil
		}
		log.Printf("GET %s failed: %s", addr, err.Error())
		if i < attempts {
			log.Printf("Retry %d of %d...", i, attempts-1)
		}
	}
	return fmt.Errorf("GET %s: giving up after %d attempt(s): %w", addr, attempts, err)
}

// copyResponseTo performs req once and streams a 200 OK body into file from
// offset zero. The response body is closed before returning.
func (p *Pixiv) copyResponseTo(req *http.Request, file *os.File) error {
	resp, err := p.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("bad status code: %d", resp.StatusCode)
	}
	if err = file.Truncate(0); err != nil {
		return err
	}
	// Truncate does not move the file offset; without the seek a retry would
	// continue writing where the failed attempt stopped.
	if _, err = file.Seek(0, io.SeekStart); err != nil {
		return err
	}
	_, err = io.Copy(file, resp.Body)
	return err
}
// downloadWorker drains p.DownloadChannel until it is closed. For every work
// it receives it reports "<id> illust downloaded" or "<id> illust failed" on
// p.logChannel.
func (p *Pixiv) downloadWorker() {
	for job := range p.DownloadChannel {
		outcome := "downloaded"
		if err := p.downloadIllust(job); err != nil {
			outcome = "failed"
		}
		p.logChannel <- fmt.Sprintf("%s %s %s\n", job.ID, "illust", outcome)
	}
}

71
pixiv/new.go Normal file
View File

@ -0,0 +1,71 @@
package pixiv
import (
"fmt"
"log"
"net/http"
"os"
"h12.io/socks"
)
// Pixiv is a client for the pixiv web AJAX endpoints together with a pool of
// download workers. Create it with New and release it with Close.
type Pixiv struct {
	phpsessid http.Cookie
	// Ua is sent as the User-Agent header of every request.
	Ua     string
	client *http.Client
	// RetryCount is the number of attempts made for each HTTP request.
	RetryCount int
	// ItemsPerRequest is the page size used by the paginated listings.
	ItemsPerRequest int
	// WorkDirectory is the root directory downloads are written below.
	WorkDirectory string
	logChannel    chan string
	// DownloadChannel feeds the download workers started by New.
	DownloadChannel chan Illust

	// workers is the number of download workers started by New; each sends
	// once on workerDone when it exits. loggerDone is closed when the logger
	// goroutine has finished. Close uses all three to shut down in order.
	workers    int
	workerDone chan struct{}
	loggerDone chan struct{}
}

// New returns a client that authenticates with the given PHPSESSID value and
// starts its background goroutines: one logger and `threads` download workers
// (at least one, so that sends on DownloadChannel are always received).
//
// cookies may be the bare session value or the "PHPSESSID=<value>" form.
// Download results are appended to logFilePath, which is created if missing;
// with an empty logFilePath they go to standard output. Failure to open the
// log file terminates the process via log.Fatal.
//
// NOTE(review): the client is returned by value while the goroutines started
// here keep using New's own variable. Fields changed on the returned value
// afterwards (for example WorkDirectory or Ua) are therefore not seen by the
// download workers; the HTTP client is shared through a pointer.
func New(cookies string, logFilePath string, threads int) (p Pixiv) {
	// Accept "PHPSESSID=<value>" as well; it would otherwise end up inside
	// the cookie value.
	const cookiePrefix = "PHPSESSID="
	if len(cookies) >= len(cookiePrefix) && cookies[:len(cookiePrefix)] == cookiePrefix {
		cookies = cookies[len(cookiePrefix):]
	}
	p.phpsessid = http.Cookie{Name: "PHPSESSID", Value: cookies}
	p.Ua = "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0"
	p.client = &http.Client{}
	p.RetryCount = 5
	p.ItemsPerRequest = 100
	p.WorkDirectory = fmt.Sprintf("%s/Pictures/pixiv", os.Getenv("HOME"))

	logfile := os.Stdout
	if len(logFilePath) > 0 {
		// O_APPEND alone opens the file read-only and never creates it, and
		// the mode has to be octal.
		f, err := os.OpenFile(logFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0664)
		if err != nil {
			log.Fatal(err.Error())
		}
		logfile = f
	}

	if threads < 1 {
		threads = 1
	}
	// Set every field before starting goroutines, so the value copied out by
	// the return statement is complete.
	p.workers = threads
	p.logChannel = make(chan string)
	p.loggerDone = make(chan struct{})
	p.DownloadChannel = make(chan Illust)
	p.workerDone = make(chan struct{}, threads)

	go p.logger(logfile)
	for i := 0; i < threads; i++ {
		go func() {
			p.downloadWorker()
			p.workerDone <- struct{}{}
		}()
	}
	return
}

// Close stops accepting downloads, waits for the workers to finish the works
// already queued, then shuts the logger down and waits for it to flush.
//
// Closing the log channel first, as before, could make a worker that was
// still running send on a closed channel. Close must be called at most once,
// and only on a client obtained from New.
func (p *Pixiv) Close() {
	close(p.DownloadChannel)
	for i := 0; i < p.workers; i++ {
		<-p.workerDone
	}
	close(p.logChannel)
	<-p.loggerDone
}

// logger writes every entry received on p.logChannel to logfile until the
// channel is closed, then closes logfile (unless it is os.Stdout) and signals
// completion through p.loggerDone.
func (p *Pixiv) logger(logfile *os.File) {
	defer close(p.loggerDone)
	for entry := range p.logChannel {
		if _, err := logfile.WriteString(entry); err != nil {
			log.Printf("writing log entry: %s", err.Error())
		}
	}
	if logfile == os.Stdout {
		return
	}
	log.Println("Closing log file")
	if err := logfile.Close(); err != nil {
		log.Printf("closing log file: %s", err.Error())
	}
}
// SetProxy routes every subsequent request of the client through the SOCKS
// proxy described by proxy. The returned error is always nil.
func (p *Pixiv) SetProxy(proxy string) (err error) {
	p.client.Transport = &http.Transport{Dial: socks.Dial(proxy)}
	return nil
}

155
pixiv/pixiv.go Normal file
View File

@ -0,0 +1,155 @@
package pixiv
import (
"encoding/json"
"fmt"
"log"
)
// GetBookmarks lists the visible ("rest=show") bookmarks of userID.
//
// The listing is fetched p.ItemsPerRequest items at a time until the number
// of collected works reaches the total reported by the server. The returned
// values carry only their ID and are not completed. On any request or
// decoding error it returns nil and the error.
func (p *Pixiv) GetBookmarks(userID string) (illusts []Illust, err error) {
	type ajaxBody struct {
		Works []struct {
			ID string `json:"id"`
		} `json:"works"`
		Total int `json:"total"`
	}
	offset := 0
	for {
		req := fmt.Sprintf("https://www.pixiv.net/ajax/user/%s/illusts/bookmarks?tag=&offset=%d&limit=%d&rest=show&lang=en", userID, offset, p.ItemsPerRequest)
		rawAjaxBody, err := p.ajaxRequest(req)
		if err != nil {
			return nil, err
		}
		var body ajaxBody
		if err = json.Unmarshal(rawAjaxBody, &body); err != nil {
			return nil, err
		}
		for _, work := range body.Works {
			illusts = append(illusts, Illust{ID: work.ID, complited: false})
		}
		if len(illusts) >= body.Total {
			// Report the real count; the offset variable lags one page behind.
			log.Printf("Fetched all %d bookmarks of user %s", len(illusts), userID)
			return illusts, nil
		}
		if len(body.Works) == 0 {
			// The server has nothing more to give although the total was not
			// reached; asking again with the same offset would loop forever.
			log.Printf("Fetched %d/%d bookmarks of user %s, no more items returned", len(illusts), body.Total, userID)
			return illusts, nil
		}
		offset = len(illusts)
		log.Printf("Fetched %d/%d bookmarks", offset, body.Total)
	}
}
// GetFollows lists the accounts that userID follows publicly ("rest=show").
//
// The listing is fetched p.ItemsPerRequest items at a time until the number
// of collected users reaches the total reported by the server. On any request
// or decoding error it returns nil and the error.
func (p *Pixiv) GetFollows(userID string) (users []User, err error) {
	type ajaxBody struct {
		Users []User `json:"users"`
		Total int    `json:"total"`
	}
	offset := 0
	for {
		req := fmt.Sprintf("https://www.pixiv.net/ajax/user/%s/following?offset=%d&limit=%d&rest=show&tag=&lang=en", userID, offset, p.ItemsPerRequest)
		rawAjaxBody, err := p.ajaxRequest(req)
		if err != nil {
			return nil, err
		}
		var body ajaxBody
		if err = json.Unmarshal(rawAjaxBody, &body); err != nil {
			return nil, err
		}
		users = append(users, body.Users...)
		if len(users) >= body.Total {
			// Report the real count; the offset variable lags one page behind.
			log.Printf("Fetched all %d follows of user %s", len(users), userID)
			return users, nil
		}
		if len(body.Users) == 0 {
			// The server has nothing more to give although the total was not
			// reached; asking again with the same offset would loop forever.
			log.Printf("Fetched %d/%d follows of user %s, no more items returned", len(users), body.Total, userID)
			return users, nil
		}
		offset = len(users)
		log.Printf("Fetched %d/%d follows", offset, body.Total)
	}
}
// GetIllust loads the details of the work illustID, including its pages.
//
// For a work with more than one page the page list comes from a second
// request; otherwise a single page is built from the work's own URLs, width
// and height. On every error path the zero Illust is returned together with
// the error, so callers never see a half-filled value.
func (p *Pixiv) GetIllust(illustID string) (r Illust, err error) {
	req := fmt.Sprintf("https://www.pixiv.net/ajax/illust/%s?lang=en", illustID)
	apiResp, err := p.ajaxRequest(req)
	if err != nil {
		return Illust{}, err
	}
	if err = json.Unmarshal(apiResp, &r); err != nil {
		return Illust{}, fmt.Errorf("decoding illust %s: %w", illustID, err)
	}

	if r.PageCount <= 1 {
		var page IllustPage
		page.URLs = r.URLs
		page.Width = r.Width
		page.Height = r.Height
		r.Pages = append(r.Pages, page)
		return r, nil
	}

	req = fmt.Sprintf("https://www.pixiv.net/ajax/illust/%s/pages?lang=en", illustID)
	apiResp, err = p.ajaxRequest(req)
	if err != nil {
		return Illust{}, err
	}
	if err = json.Unmarshal(apiResp, &r.Pages); err != nil {
		return Illust{}, fmt.Errorf("decoding pages of illust %s: %w", illustID, err)
	}
	return r, nil
}
// GetUserIllustsID returns one uncompleted Illust, carrying only its ID, for
// every key of the "illusts" object in the profile of userID. The keys come
// from a map, so the order of the result is unspecified.
func (p *Pixiv) GetUserIllustsID(userID string) (illusts []Illust, err error) {
	var profile struct {
		Illusts map[string]interface{} `json:"illusts"`
	}
	endpoint := fmt.Sprintf("https://www.pixiv.net/ajax/user/%s/profile/all?lang=en", userID)
	raw, err := p.ajaxRequest(endpoint)
	if err != nil {
		return nil, err
	}
	if err = json.Unmarshal(raw, &profile); err != nil {
		return nil, err
	}
	for id := range profile.Illusts {
		illusts = append(illusts, Illust{ID: id})
	}
	return illusts, nil
}
// ComplateIllust replaces *illust with the full details loaded for
// illust.ID and marks it as completed.
//
// On error *illust is left untouched; previously it was overwritten, which
// discarded the ID the caller needs to report or retry the work.
func (p *Pixiv) ComplateIllust(illust *Illust) (err error) {
	full, err := p.GetIllust(illust.ID)
	if err != nil {
		return err
	}
	full.complited = true
	*illust = full
	return nil
}
// ComplateIllustMultiple completes every element of *illusts in place, in
// order, by loading its details from its ID.
//
// It stops at the first failure and returns that error. The failing element
// and all later ones are left untouched; earlier elements stay completed.
// A nil illusts is treated as an empty list.
func (p *Pixiv) ComplateIllustMultiple(illusts *[]Illust) (err error) {
	if illusts == nil {
		return nil
	}
	for i := range *illusts {
		full, err := p.GetIllust((*illusts)[i].ID)
		if err != nil {
			return err
		}
		full.complited = true
		(*illusts)[i] = full
	}
	return nil
}
// Complited reports whether the unexported complited flag of i is set.
// ComplateIllust and ComplateIllustMultiple set that flag after loading the
// full details of a work.
func (i *Illust) Complited() bool {
return i.complited
}

83
pixiv/types.go Normal file
View File

@ -0,0 +1,83 @@
package pixiv
// Illust describes one work. GetIllust fills the tagged fields by decoding
// the JSON body of the illust request; GetBookmarks and GetUserIllustsID
// create values that carry only ID.
type Illust struct {
// complited is set by ComplateIllust and ComplateIllustMultiple once the
// details have been loaded; read it through Complited.
complited bool
ID string `json:"id"`
IlustID string `json:"illustId"`
Title string `json:"title"`
IllustTitle string `json:"illustTitle"`
IllustComment string `json:"illustComment"`
IllustType int `json:"illustType"`
XRestrict int `json:"xRestrict"`
Sl int `json:"sl"`
URL string `json:"url"`
// Pages carries no JSON tag. GetIllust decodes the separate pages request
// into it when PageCount is greater than one, and otherwise appends a single
// page built from URLs, Width and Height.
Pages []IllustPage
// URLs has the same shape as IllustPage.URLs.
URLs struct {
Mini string `json:"mini"`
Thumb string `json:"thumb"`
Small string `json:"small"`
Regular string `json:"regular"`
Original string `json:"original"`
} `json:"urls"`
Description string `json:"description"`
Tags struct {
AuthorID string `json:"authorId"`
IsLocked bool `json:"isLocked"`
Tags []IllustTag `json:"tags"`
Writable bool `json:"writable"`
} `json:"tags,omitempty"`
// UserID and UserAccount form the name of the download directory.
UserID string `json:"userId"`
UserName string `json:"userName"`
UserAccount string `json:"userAccount"`
Width int `json:"width"`
Height int `json:"height"`
// PageCount decides how GetIllust fills Pages.
PageCount int `json:"pageCount"`
IsBookmarkable bool `json:"isBookmarkable"`
Alt string `json:"alt"`
IsAdContainer bool `json:"isAdContainer"`
TitleCaptionTranslation struct {
WorkTitle string `json:"workTitle"`
WorkCaption string `json:"workCaption"`
} `json:"titleCaptionTranslation"`
CreateDate string `json:"createDate"`
UpdateDate string `json:"updateDate"`
IsUnlisted bool `json:"isUnlisted"`
ProfileImageURL string `json:"profileImageUrl"`
}
// IllustTag is one element of Illust.Tags.Tags. It carries the tag text in up
// to three forms: Tag, Romaji and Translation.En.
type IllustTag struct {
Tag string `json:"tag"`
Locked bool `json:"locked"`
Deletable bool `json:"deletable"`
UserID string `json:"userId"`
UserName string `json:"userName"`
Romaji string `json:"romaji"`
Translation struct {
En string `json:"en"`
} `json:"translation"`
}
// IllustPage is one page of a work: the URLs of its renditions and its
// dimensions. URLs.Original is the address that gets printed or downloaded.
type IllustPage struct {
URLs struct {
Mini string `json:"mini"`
Thumb string `json:"thumb"`
Small string `json:"small"`
Regular string `json:"regular"`
Original string `json:"original"`
} `json:"urls"`
Width int `json:"width"`
Height int `json:"height"`
}
// User is one entry of the "users" list that GetFollows decodes.
type User struct {
UserID string `json:"userId"`
UserName string `json:"userName"`
ProfileImageURL string `json:"profileImageUrl"`
Following bool `json:"following"`
Followed bool `json:"followed"`
// illusts is unexported, is never filled by JSON decoding, and is not
// referenced in the visible code.
illusts []Illust
}