From 7fc20571bac054b614532fc26527cd0a1df4170b Mon Sep 17 00:00:00 2001 From: Mirror Date: Sat, 20 Feb 2021 20:40:56 +0300 Subject: [PATCH] Push old code --- .gitignore | 2 + README.md | 2 - main.go | 159 ++++++++++++++++++++++++++++++++++++++++++++ pixiv/ajax.go | 70 +++++++++++++++++++ pixiv/downloader.go | 122 +++++++++++++++++++++++++++++++++ pixiv/new.go | 71 ++++++++++++++++++++ pixiv/pixiv.go | 155 ++++++++++++++++++++++++++++++++++++++++++ pixiv/types.go | 83 +++++++++++++++++++++++ 8 files changed, 662 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 main.go create mode 100644 pixiv/ajax.go create mode 100644 pixiv/downloader.go create mode 100644 pixiv/new.go create mode 100644 pixiv/pixiv.go create mode 100644 pixiv/types.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..577fe77 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +pixiv-scrapper +.directory \ No newline at end of file diff --git a/README.md b/README.md index 7704741..e69de29 100644 --- a/README.md +++ b/README.md @@ -1,2 +0,0 @@ -# pixiv-scrapper - diff --git a/main.go b/main.go new file mode 100644 index 0000000..a1247b5 --- /dev/null +++ b/main.go @@ -0,0 +1,159 @@ +package main + +import ( + "flag" + "fmt" + "log" + + "./pixiv" +) + +type cmdlineT struct { + followings bool + favs bool + illusts bool + userID string + setCookies string + setProxy string + workDirectory string + userAgent string + printID bool + printURL bool + test bool + logErrorsFile string + threads int +} + +var ( + cmdline cmdlineT + cookies string + pix pixiv.Pixiv +) + +func main() { + flags() + pix = pixiv.New(cmdline.setCookies, cmdline.logErrorsFile, cmdline.threads) + defer pix.Close() + if len(cmdline.setProxy) > 0 { + pix.SetProxy(cmdline.setProxy) + } + if len(cmdline.workDirectory) > 0 { + pix.WorkDirectory = cmdline.workDirectory + } + + if cmdline.illusts { + err := fetchIllusts(cmdline.userID) + if err != nil { + log.Fatal(err.Error()) + } + } + + if 
cmdline.favs { + err := fetchBookmarks(cmdline.userID) + if err != nil { + log.Fatal(err.Error()) + } + } + + if cmdline.followings { + err := fetchFollows(cmdline.userID) + if err != nil { + log.Fatal(err.Error()) + } + } + +} + +func fetchFollows(userID string) (err error) { + users, err := pix.GetFollows(userID) + if err != nil { + return + } + for _, user := range users { + fmt.Printf("%s %s %s %s\n", user.UserID, "followed_user", "defered", user.UserName) + } + return +} + +func fetchBookmarks(userID string) (err error) { + illusts, err := pix.GetBookmarks(userID) + if err != nil { + return + } + for _, illust := range illusts { + if cmdline.printID { + fmt.Printf("%s %s %s %s %s\n", illust.ID, "illust", "defered", illust.Title, illust.Alt) + continue + } + if !illust.Complited() { + err := pix.ComplateIllust(&illust) + if err != nil { + return err + } + } + if cmdline.printURL { + for _, page := range illust.Pages { + fmt.Println(page.URLs.Original) + } + continue + } + if !(cmdline.printURL && cmdline.printID) { + if !(cmdline.printURL && cmdline.printID) { + for _, illust := range illusts { + pix.DownloadChannel <- illust + } + } + } + + } + return +} + +func fetchIllusts(userID string) (err error) { + illusts, err := pix.GetUserIllustsID(userID) + if err != nil { + return + } + if cmdline.printID { + for _, illust := range illusts { + fmt.Println(illust.ID) + } + return + } + if cmdline.printURL { + for _, illust := range illusts { + id := illust.ID + illust, err := pix.GetIllust(id) + if err != nil { + return err + } + for _, page := range illust.Pages { + fmt.Println(page.URLs.Original) + } + } + return + } + if !(cmdline.printURL && cmdline.printID) { + for _, illust := range illusts { + pix.DownloadChannel <- illust + } + } + return +} + +func flags() { + flag.BoolVar(&cmdline.illusts, "illusts", false, "download illusts") + flag.BoolVar(&cmdline.followings, "followings", false, "fetch user followings") + flag.BoolVar(&cmdline.favs, "favs", false, 
"fetch user favorites") + flag.StringVar(&cmdline.userID, "user-id", "", "set userID to extract followings, favs or illustrations from") + flag.StringVar(&cmdline.setCookies, "set-cookies", "", "set cookies for request. PHPSESSID is REQUIRED if you want to see R-18 works. SHOULD be something like PHPSESSID=12345678_MEPujYEEwOfXKYdB9TXeMIxaq7pQPYXA") + flag.StringVar(&cmdline.setProxy, "set-proxy", "", "set proxy for request") + flag.StringVar(&cmdline.workDirectory, "workdir", "", "set directory to save images in") + flag.StringVar(&cmdline.userAgent, "user-agent", "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0", "set user agent") + flag.BoolVar(&cmdline.printID, "print-id", false, "only print IDs, not download") + flag.BoolVar(&cmdline.printURL, "print-url", false, "only print URLs, not download") + flag.BoolVar(&cmdline.test, "test", false, "test") + flag.StringVar(&cmdline.logErrorsFile, "log-errors", "", "file to strore failed items") + flag.IntVar(&cmdline.threads, "threads", 1, "threads number") + flag.Parse() +} diff --git a/pixiv/ajax.go b/pixiv/ajax.go new file mode 100644 index 0000000..0b8f027 --- /dev/null +++ b/pixiv/ajax.go @@ -0,0 +1,70 @@ +package pixiv + +import ( + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "log" + "net/http" +) + +type ajaxResp struct { + Error bool `json:"error"` + Message string `json:"message"` + Body json.RawMessage `json:"body"` +} + +func (p *Pixiv) ajaxRequest(addr string) (r json.RawMessage, err error) { + req, err := http.NewRequest("GET", addr, nil) + if err != nil { + err = fmt.Errorf("NewRequest failed (%s)", addr) + return + } + req.AddCookie(&p.phpsessid) + req.Header.Set("User-Agent", p.Ua) + req.Header.Set("Referer", "https://www.pixiv.net") + var resp *http.Response + for i := 0; i < p.RetryCount; i++ { + resp, err = p.client.Do(req) + if err != nil { + log.Printf("%s", err.Error()) + log.Printf("Retry %d of %d...", i, p.RetryCount) + continue + } + defer resp.Body.Close() + if 
resp.StatusCode != http.StatusOK { + log.Printf("GET %s failed: bad status code: %d", addr, resp.StatusCode) + log.Printf("Retry %d of %d...", i, p.RetryCount) + continue + } + break + } + + if err != nil { + log.Printf("%s", err.Error()) + return + } + if resp.StatusCode != http.StatusOK { + log.Printf("GET %s failed: bad status code: %d", addr, resp.StatusCode) + return + } + respData, err := ioutil.ReadAll(resp.Body) + if err != nil { + log.Println("ioutil.ReadAll(resp.Body) failed") + return + } + var ajax ajaxResp + err = json.Unmarshal(respData, &ajax) + if err != nil { + log.Println("json.Unmarshal(respData, &ajax) failed") + return + } + if ajax.Error { + err = errors.New(ajax.Message) + log.Println("ajax error") + return + } + r = ajax.Body + return +} diff --git a/pixiv/downloader.go b/pixiv/downloader.go new file mode 100644 index 0000000..ca12644 --- /dev/null +++ b/pixiv/downloader.go @@ -0,0 +1,122 @@ +package pixiv + +import ( + "fmt" + "io" + "log" + "net/http" + "os" + "strings" +) + +//DownloadIllust . 
+func (p *Pixiv) downloadIllust(i Illust) (err error) { + if !i.Complited() { + err = p.ComplateIllust(&i) + if err != nil { + return + } + } + getExtension := func(fullpath string) (ext string) { + parts := strings.Split(fullpath, ".") + ext = parts[len(parts)-1] + return + } + type artworkFile struct { + directory string + filename string + url string + } + type artwork struct { + tags []string + files []artworkFile + } + var art artwork + for _, tag := range i.Tags.Tags { + if len(tag.Translation.En) > 0 { + art.tags = append(art.tags, tag.Translation.En) + } else if len(tag.Romaji) > 0 { + art.tags = append(art.tags, tag.Romaji) + } else { + art.tags = append(art.tags, tag.Tag) + } + } + + for pageNumber, page := range i.Pages { + directory := fmt.Sprintf("%s/%s_%s/", p.WorkDirectory, i.UserID, i.UserAccount) + filename := fmt.Sprintf("%s_p%d.%s", i.ID, pageNumber, getExtension(page.URLs.Original)) + art.files = append(art.files, + artworkFile{directory: directory, + filename: filename, + url: page.URLs.Original}) + } + + for _, file := range art.files { + err := os.MkdirAll(file.directory, 0755) + if err != nil { + return err + } + outfile, err := os.Create(file.directory + file.filename) + if err != nil { + return err + } + defer outfile.Close() + //log.Printf("Downloading %s to %s", file.url, outfile.Name()) + err = p.downloadTo(file.url, outfile) + if err != nil { + return err + } + + } + return +} + +func (p *Pixiv) downloadTo(addr string, file *os.File) (err error) { + req, err := http.NewRequest("GET", addr, nil) + if err != nil { + err = fmt.Errorf("NewRequest failed (%s)", addr) + return + } + req.AddCookie(&p.phpsessid) + req.Header.Set("User-Agent", p.Ua) + req.Header.Set("Referer", "https://www.pixiv.net") + var resp *http.Response + for i := 0; i < p.RetryCount; i++ { + resp, err = p.client.Do(req) + if err != nil { + log.Printf("%s", err.Error()) + log.Printf("Retry %d of %d...", i, p.RetryCount) + continue + } + defer resp.Body.Close() + if 
resp.StatusCode != http.StatusOK { + log.Printf("GET %s failed: bad status code: %d", addr, resp.StatusCode) + log.Printf("Retry %d of %d...", i, p.RetryCount) + continue + } + err = file.Truncate(0) + if err != nil { + return + } + _, err = io.Copy(file, resp.Body) + if err != nil { + log.Printf("%s", err.Error()) + log.Printf("Retry %d of %d...", i, p.RetryCount) + continue + } + break + } + return +} + +func (p *Pixiv) downloadWorker() { + for illust := range p.DownloadChannel { + err := p.downloadIllust(illust) + if err != nil { + p.logChannel <- fmt.Sprintf("%s %s %s\n", illust.ID, "illust", "failed") + continue + } + p.logChannel <- fmt.Sprintf("%s %s %s\n", illust.ID, "illust", "downloaded") + } + return +} diff --git a/pixiv/new.go b/pixiv/new.go new file mode 100644 index 0000000..0968ea8 --- /dev/null +++ b/pixiv/new.go @@ -0,0 +1,71 @@ +package pixiv + +import ( + "fmt" + "log" + "net/http" + "os" + + "h12.io/socks" +) + +//Pixiv is API +type Pixiv struct { + phpsessid http.Cookie + Ua string + client *http.Client + RetryCount int + ItemsPerRequest int + WorkDirectory string + logChannel chan string + DownloadChannel chan Illust +} + +//New returns object with methods to access API functions +func New(cookies string, logFilePath string, threads int) (p Pixiv) { + p.phpsessid = http.Cookie{Name: "PHPSESSID", Value: cookies} + p.Ua = "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0" + p.client = &http.Client{} + p.RetryCount = 5 + p.ItemsPerRequest = 100 + p.WorkDirectory = fmt.Sprintf("%s/Pictures/pixiv", os.Getenv("HOME")) + if len(logFilePath) > 0 { + logfile, err := os.OpenFile(logFilePath, os.O_APPEND, 664) + if err != nil { + log.Fatal(err.Error()) + } + p.logChannel = make(chan string) + go p.logger(logfile) + } else { + p.logChannel = make(chan string) + go p.logger(os.Stdout) + } + p.DownloadChannel = make(chan Illust) + for i := 0; i < threads; i++ { + go p.downloadWorker() + } + + return +} + +//Close . 
+func (p *Pixiv) Close() { + close(p.logChannel) + close(p.DownloadChannel) +} + +func (p *Pixiv) logger(logfile *os.File) { + for entry := range p.logChannel { + logfile.WriteString(entry) + } + log.Println("Closing log file") + logfile.Close() +} + +//SetProxy sets SOCKS proxy for all requests +func (p *Pixiv) SetProxy(proxy string) (err error) { + dialSocksProxy := socks.Dial(proxy) + tr := &http.Transport{Dial: dialSocksProxy} + p.client.Transport = tr + return +} diff --git a/pixiv/pixiv.go b/pixiv/pixiv.go new file mode 100644 index 0000000..ecbbfff --- /dev/null +++ b/pixiv/pixiv.go @@ -0,0 +1,155 @@ +package pixiv + +import ( + "encoding/json" + "fmt" + "log" +) + +//GetBookmarks . +func (p *Pixiv) GetBookmarks(userID string) (illusts []Illust, err error) { + type ajaxBody struct { + Works []struct { + ID string `json:"id"` + } `json:"works"` + Total int `json:"total"` + } + offset := 0 + for { + req := fmt.Sprintf("https://www.pixiv.net/ajax/user/%s/illusts/bookmarks?tag=&offset=%d&limit=%d&rest=show&lang=en", userID, offset, p.ItemsPerRequest) + rawAjaxBody, err := p.ajaxRequest(req) + if err != nil { + return nil, err + } + var body ajaxBody + err = json.Unmarshal(rawAjaxBody, &body) + if err != nil { + return nil, err + } + for _, illust := range body.Works { + illusts = append(illusts, Illust{ID: illust.ID, complited: false}) + } + if len(illusts) < body.Total { + offset = len(illusts) + log.Printf("Fetched %d/%d bookmarks", offset, body.Total) + continue + } else { + log.Printf("Fetched all %d bookmarks of user %s", offset, userID) + break + } + } + return +} + +//GetFollows . 
+func (p *Pixiv) GetFollows(userID string) (users []User, err error) { + type ajaxBody struct { + Users []User `json:"users"` + Total int `json:"total"` + } + offset := 0 + for { + req := fmt.Sprintf("https://www.pixiv.net/ajax/user/%s/following?offset=%d&limit=%d&rest=show&tag=&lang=en", userID, offset, p.ItemsPerRequest) + rawAjaxBody, err := p.ajaxRequest(req) + if err != nil { + return nil, err + } + var body ajaxBody + err = json.Unmarshal(rawAjaxBody, &body) + if err != nil { + return nil, err + } + users = append(users, body.Users[:]...) + if len(users) < body.Total { + offset = len(users) + log.Printf("Fetched %d/%d follows", offset, body.Total) + continue + } else { + log.Printf("Fetched all %d follows of user %s", offset, userID) + break + } + } + return +} + +//GetIllust . +func (p *Pixiv) GetIllust(illustID string) (r Illust, err error) { + req := fmt.Sprintf("https://www.pixiv.net/ajax/illust/%s?lang=en", illustID) + apiResp, err := p.ajaxRequest(req) + if err != nil { + return + } + err = json.Unmarshal(apiResp, &r) + if err != nil { + log.Printf("Unmarshaling of api response returned an error: %s", err.Error()) + return + } + if r.PageCount > 1 { + req := fmt.Sprintf("https://www.pixiv.net/ajax/illust/%s/pages?lang=en", illustID) + apiResp, err := p.ajaxRequest(req) + if err != nil { + return Illust{}, err + } + err = json.Unmarshal(apiResp, &r.Pages) + if err != nil { + log.Printf("Unmarshaling of api response returned an error: %s", err.Error()) + return Illust{}, err + } + } else { + var page IllustPage + page.URLs = r.URLs + page.Width = r.Width + page.Height = r.Height + r.Pages = append(r.Pages, page) + } + return +} + +//GetUserIllustsID . 
+func (p *Pixiv) GetUserIllustsID(userID string) (illusts []Illust, err error) { + type ajaxBody struct { + Illusts map[string]interface{} `json:"illusts"` + } + var body ajaxBody + req := fmt.Sprintf("https://www.pixiv.net/ajax/user/%s/profile/all?lang=en", userID) + rawAjaxBody, err := p.ajaxRequest(req) + if err != nil { + return + } + err = json.Unmarshal(rawAjaxBody, &body) + if err != nil { + return + } + + for ID := range body.Illusts { + illusts = append(illusts, Illust{ID: ID, complited: false}) + } + return +} + +//ComplateIllust . +func (p *Pixiv) ComplateIllust(illust *Illust) (err error) { + *illust, err = p.GetIllust(illust.ID) + if err != nil { + return + } + illust.complited = true + return +} + +//ComplateIllustMultiple . +func (p *Pixiv) ComplateIllustMultiple(illusts *[]Illust) (err error) { + for i, illust := range *illusts { + (*illusts)[i], err = p.GetIllust(illust.ID) + if err != nil { + return err + } + (*illusts)[i].complited = true + } + return +} + +//Complited . +func (i *Illust) Complited() bool { + return i.complited +} diff --git a/pixiv/types.go b/pixiv/types.go new file mode 100644 index 0000000..5599c00 --- /dev/null +++ b/pixiv/types.go @@ -0,0 +1,83 @@ +package pixiv + +//Illust . 
// IllustURLs is the set of image URLs the API reports for a work or for one
// of its pages, keyed by rendition size. Illust and IllustPage share this
// type so that one can be assigned to the other.
type IllustURLs struct {
	Mini     string `json:"mini"`
	Thumb    string `json:"thumb"`
	Small    string `json:"small"`
	Regular  string `json:"regular"`
	Original string `json:"original"`
}

// Illust is an illustration as decoded from the AJAX API. A value that was
// produced by a listing call carries only the ID; the remaining fields are
// filled once the illustration has been completed.
type Illust struct {
	// complited is set by the package once the full metadata was fetched.
	complited bool

	ID            string `json:"id"`
	IlustID       string `json:"illustId"`
	Title         string `json:"title"`
	IllustTitle   string `json:"illustTitle"`
	IllustComment string `json:"illustComment"`
	IllustType    int    `json:"illustType"`
	XRestrict     int    `json:"xRestrict"`
	Sl            int    `json:"sl"`
	URL           string `json:"url"`

	// Pages holds one entry per image of the work.
	Pages []IllustPage
	// URLs are the renditions of the work itself (its first image).
	URLs IllustURLs `json:"urls"`

	Description string `json:"description"`
	Tags        struct {
		AuthorID string      `json:"authorId"`
		IsLocked bool        `json:"isLocked"`
		Tags     []IllustTag `json:"tags"`
		Writable bool        `json:"writable"`
	} `json:"tags,omitempty"`

	UserID      string `json:"userId"`
	UserName    string `json:"userName"`
	UserAccount string `json:"userAccount"`

	Width          int    `json:"width"`
	Height         int    `json:"height"`
	PageCount      int    `json:"pageCount"`
	IsBookmarkable bool   `json:"isBookmarkable"`
	Alt            string `json:"alt"`
	IsAdContainer  bool   `json:"isAdContainer"`

	TitleCaptionTranslation struct {
		WorkTitle   string `json:"workTitle"`
		WorkCaption string `json:"workCaption"`
	} `json:"titleCaptionTranslation"`

	CreateDate      string `json:"createDate"`
	UpdateDate      string `json:"updateDate"`
	IsUnlisted      bool   `json:"isUnlisted"`
	ProfileImageURL string `json:"profileImageUrl"`
}

// IllustTag is a single tag of an illustration together with its romanised
// form and English translation, either of which may be empty.
type IllustTag struct {
	Tag         string `json:"tag"`
	Locked      bool   `json:"locked"`
	Deletable   bool   `json:"deletable"`
	UserID      string `json:"userId"`
	UserName    string `json:"userName"`
	Romaji      string `json:"romaji"`
	Translation struct {
		En string `json:"en"`
	} `json:"translation"`
}

// IllustPage is one image of an illustration: its URLs and pixel dimensions.
type IllustPage struct {
	URLs   IllustURLs `json:"urls"`
	Width  int        `json:"width"`
	Height int        `json:"height"`
}

// User is an account as returned by the following-list endpoint.
type User struct {
	UserID          string `json:"userId"`
	UserName        string `json:"userName"`
	ProfileImageURL string `json:"profileImageUrl"`
	Following       bool   `json:"following"`
	Followed        bool   `json:"followed"`
	illusts         []Illust
}