Twitter/X blocks regular scrapers and doesn't provide og:image meta tags. This uses the cdn.syndication.twimg.com API to fetch tweet data including photos, video posters, and card thumbnails. Note: The API may return 404 for protected/deleted tweets or accounts with restricted access - in those cases we fall back gracefully.
410 lines
10 KiB
Go
410 lines
10 KiB
Go
package services
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime"
|
|
"net/http"
|
|
"net/url"
|
|
"path"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/disintegration/imaging"
|
|
"golang.org/x/net/html"
|
|
|
|
"git.soup.land/soup/lookbook/internal/data/image"
|
|
"git.soup.land/soup/lookbook/internal/data/item"
|
|
)
|
|
|
|
// tweetURLPattern matches tweet permalinks on twitter.com or x.com (http or
// https, optional "www." prefix). Capture group 1 is the path segment before
// "/status/" and capture group 2 is the numeric status ID.
var tweetURLPattern = regexp.MustCompile(`^https?://(?:www\.)?(?:twitter\.com|x\.com)/([^/]+)/status/(\d+)`)
|
|
|
|
// thumbWidth is the width, in pixels, that thumbnails are resized to.
const thumbWidth = 480
|
|
|
|
func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) {
|
|
meta, err := FetchMetadata(ctx, sourceURL)
|
|
if err != nil {
|
|
return item.Row{}, err
|
|
}
|
|
|
|
row, err := item.QCreate(ctx, db, sourceURL, meta.Title, meta.Description, meta.SiteName)
|
|
if err != nil {
|
|
return item.Row{}, err
|
|
}
|
|
|
|
if err := storeImages(ctx, db, row.ID, meta); err != nil {
|
|
return row, err
|
|
}
|
|
|
|
return row, nil
|
|
}
|
|
|
|
func RefreshItemFromURL(ctx context.Context, db *sql.DB, row item.Row) error {
|
|
meta, err := FetchMetadata(ctx, row.SourceURL)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := item.QUpdateMeta(ctx, db, row.ID, meta.Title, meta.Description, meta.SiteName); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := image.QDeleteByItem(ctx, db, row.ID); err != nil {
|
|
return err
|
|
}
|
|
|
|
return storeImages(ctx, db, row.ID, meta)
|
|
}
|
|
|
|
// Metadata is the display information gathered for a source URL.
type Metadata struct {
	// Title is the headline: an og:/twitter: title, the document <title>,
	// the tweet text, or the file name for direct image URLs.
	Title string
	// Description is a short summary, or the author line for tweets.
	Description string
	// SiteName names the site or provider the URL belongs to.
	SiteName string
	// ImageURL points at the preview image; empty when none was found.
	ImageURL string
}
|
|
|
|
func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) {
|
|
// Check if this is a Twitter/X URL and use syndication API
|
|
if meta, ok := fetchTwitterMetadata(ctx, sourceURL); ok {
|
|
return meta, nil
|
|
}
|
|
|
|
resp, err := fetchURL(ctx, sourceURL)
|
|
if err != nil {
|
|
return Metadata{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20))
|
|
if err != nil {
|
|
return Metadata{}, err
|
|
}
|
|
|
|
meta := Metadata{}
|
|
contentType := resp.Header.Get("Content-Type")
|
|
meta.ImageURL = extractImageURL(resp.Request.URL, contentType)
|
|
|
|
if strings.HasPrefix(strings.ToLower(contentType), "image/") {
|
|
if meta.Title == "" {
|
|
meta.Title = path.Base(resp.Request.URL.Path)
|
|
}
|
|
if meta.SiteName == "" {
|
|
meta.SiteName = resp.Request.URL.Hostname()
|
|
}
|
|
return meta, nil
|
|
}
|
|
|
|
doc, err := html.Parse(bytes.NewReader(body))
|
|
if err != nil {
|
|
return meta, nil
|
|
}
|
|
|
|
extractMeta(doc, &meta)
|
|
|
|
if meta.Title == "" {
|
|
meta.Title = titleFromDoc(doc)
|
|
}
|
|
|
|
if meta.ImageURL == "" {
|
|
if oembed, err := fetchOEmbed(ctx, sourceURL); err == nil {
|
|
if meta.Title == "" {
|
|
meta.Title = oembed.Title
|
|
}
|
|
if meta.Description == "" {
|
|
meta.Description = oembed.Description
|
|
}
|
|
if meta.ImageURL == "" {
|
|
meta.ImageURL = oembed.ThumbnailURL
|
|
}
|
|
if meta.SiteName == "" {
|
|
meta.SiteName = oembed.ProviderName
|
|
}
|
|
}
|
|
}
|
|
|
|
return meta, nil
|
|
}
|
|
|
|
// oEmbedResponse holds the fields read from an oEmbed JSON answer; every
// other key in the payload is ignored.
type oEmbedResponse struct {
	// Title is decoded from "title".
	Title string `json:"title"`
	// Description is decoded from "description".
	Description string `json:"description"`
	// ThumbnailURL is decoded from "thumbnail_url".
	ThumbnailURL string `json:"thumbnail_url"`
	// ProviderName is decoded from "provider_name".
	ProviderName string `json:"provider_name"`
}
|
|
|
|
// Twitter/X syndication API response structures. Only the fields this package
// reads are declared; all other keys in the payload are ignored on decode.
type (
	// twitterSyndicationResponse is the top-level tweet object.
	twitterSyndicationResponse struct {
		Text   string              `json:"text"`
		User   twitterUser         `json:"user"`
		Photos []twitterPhoto      `json:"photos"`
		Video  *twitterVideo       `json:"video"`
		Card   *twitterCard        `json:"card"`
		Media  []twitterMediaEntry `json:"mediaDetails"`
	}

	// twitterUser identifies the tweet's author.
	twitterUser struct {
		Name       string `json:"name"`
		ScreenName string `json:"screen_name"`
	}

	// twitterPhoto is one entry of the "photos" array.
	twitterPhoto struct {
		URL string `json:"url"`
	}

	// twitterVideo carries the poster image of an attached video.
	twitterVideo struct {
		Poster string `json:"poster"`
	}

	// twitterCard carries the thumbnail of an attached card.
	twitterCard struct {
		ThumbnailImageOriginal string `json:"thumbnail_image_original"`
	}

	// twitterMediaEntry is one entry of the "mediaDetails" array.
	twitterMediaEntry struct {
		MediaURLHTTPS string `json:"media_url_https"`
		Type          string `json:"type"`
	}
)
|
|
|
|
// fetchTwitterMetadata attempts to fetch metadata from Twitter's syndication API
|
|
// Returns the metadata and true if successful, or empty metadata and false if not a Twitter URL or fetch failed
|
|
func fetchTwitterMetadata(ctx context.Context, sourceURL string) (Metadata, bool) {
|
|
matches := tweetURLPattern.FindStringSubmatch(sourceURL)
|
|
if matches == nil {
|
|
return Metadata{}, false
|
|
}
|
|
|
|
username := matches[1]
|
|
tweetID := matches[2]
|
|
|
|
syndicationURL := fmt.Sprintf("https://cdn.syndication.twimg.com/tweet-result?id=%s&token=0", tweetID)
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, syndicationURL, nil)
|
|
if err != nil {
|
|
return Metadata{}, false
|
|
}
|
|
|
|
// The syndication API requires specific headers
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
req.Header.Set("Referer", "https://platform.twitter.com/")
|
|
|
|
client := &http.Client{Timeout: 12 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return Metadata{}, false
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
return Metadata{}, false
|
|
}
|
|
|
|
// Check content type - if it's HTML, it's an error page
|
|
contentType := resp.Header.Get("Content-Type")
|
|
if !strings.Contains(contentType, "application/json") {
|
|
return Metadata{}, false
|
|
}
|
|
|
|
var tweet twitterSyndicationResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&tweet); err != nil {
|
|
return Metadata{}, false
|
|
}
|
|
|
|
meta := Metadata{
|
|
Title: truncateText(tweet.Text, 200),
|
|
Description: fmt.Sprintf("@%s", username),
|
|
SiteName: "X",
|
|
}
|
|
|
|
// Try to get image URL from various sources
|
|
// Priority: photos > video poster > card thumbnail > mediaDetails
|
|
if len(tweet.Photos) > 0 {
|
|
meta.ImageURL = tweet.Photos[0].URL
|
|
} else if tweet.Video != nil && tweet.Video.Poster != "" {
|
|
meta.ImageURL = tweet.Video.Poster
|
|
} else if tweet.Card != nil && tweet.Card.ThumbnailImageOriginal != "" {
|
|
meta.ImageURL = tweet.Card.ThumbnailImageOriginal
|
|
} else if len(tweet.Media) > 0 {
|
|
meta.ImageURL = tweet.Media[0].MediaURLHTTPS
|
|
}
|
|
|
|
// If we got user info, use it for a better description
|
|
if tweet.User.Name != "" {
|
|
meta.Description = fmt.Sprintf("%s (@%s)", tweet.User.Name, tweet.User.ScreenName)
|
|
}
|
|
|
|
return meta, true
|
|
}
|
|
|
|
// truncateText returns s unchanged when it is at most maxLen bytes long.
// Otherwise it keeps at most maxLen bytes of s, never cutting through a
// multi-byte UTF-8 character, and appends "…".
//
// When the kept prefix contains a space beyond its midpoint (maxLen/2), the
// cut is moved back to that space so the text ends on a whole word. A negative
// maxLen is treated as 0.
func truncateText(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset at which each character
	// starts, so the last offset <= maxLen is a safe place to slice. Slicing
	// at maxLen directly could split a character and emit invalid UTF-8.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	truncated := s[:cut]

	// Try to break at a word boundary
	if lastSpace := strings.LastIndex(truncated, " "); lastSpace > maxLen/2 {
		truncated = truncated[:lastSpace]
	}
	return truncated + "…"
}
|
|
|
|
func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) {
|
|
oembedURL := fmt.Sprintf("https://noembed.com/embed?url=%s", url.QueryEscape(sourceURL))
|
|
resp, err := fetchURL(ctx, oembedURL)
|
|
if err != nil {
|
|
return oEmbedResponse{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
var payload oEmbedResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
|
return oEmbedResponse{}, err
|
|
}
|
|
return payload, nil
|
|
}
|
|
|
|
// fetchURL performs a GET on rawURL with the "lookbook/1.0" User-Agent and a
// 12 second client timeout.
//
// On success the caller owns the response and must close its body. A status
// outside the range 200-399 is turned into an error; in that case the body
// has already been closed and the returned response is nil.
func fetchURL(ctx context.Context, rawURL string) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "lookbook/1.0")

	resp, err := (&http.Client{Timeout: 12 * time.Second}).Do(req)
	if err != nil {
		return nil, err
	}

	if acceptable := resp.StatusCode >= 200 && resp.StatusCode < 400; !acceptable {
		resp.Body.Close()
		return nil, fmt.Errorf("fetch %s: status %d", rawURL, resp.StatusCode)
	}
	return resp, nil
}
|
|
|
|
func extractMeta(n *html.Node, meta *Metadata) {
|
|
if n.Type == html.ElementNode && n.Data == "meta" {
|
|
var property, content, name string
|
|
for _, attr := range n.Attr {
|
|
switch strings.ToLower(attr.Key) {
|
|
case "property":
|
|
property = strings.ToLower(attr.Val)
|
|
case "content":
|
|
content = strings.TrimSpace(attr.Val)
|
|
case "name":
|
|
name = strings.ToLower(attr.Val)
|
|
}
|
|
}
|
|
if content != "" {
|
|
switch property {
|
|
case "og:title", "twitter:title":
|
|
if meta.Title == "" {
|
|
meta.Title = content
|
|
}
|
|
case "og:description", "twitter:description":
|
|
if meta.Description == "" {
|
|
meta.Description = content
|
|
}
|
|
case "og:site_name":
|
|
if meta.SiteName == "" {
|
|
meta.SiteName = content
|
|
}
|
|
case "og:image", "twitter:image":
|
|
if meta.ImageURL == "" {
|
|
meta.ImageURL = content
|
|
}
|
|
}
|
|
if meta.Description == "" && name == "description" {
|
|
meta.Description = content
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
extractMeta(c, meta)
|
|
}
|
|
}
|
|
|
|
func titleFromDoc(n *html.Node) string {
|
|
if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
|
|
return strings.TrimSpace(n.FirstChild.Data)
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
if title := titleFromDoc(c); title != "" {
|
|
return title
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// extractImageURL returns baseURL as a string when contentType starts with
// "image/" (compared case-insensitively), and "" for any other content type.
func extractImageURL(baseURL *url.URL, contentType string) string {
	lowered := strings.ToLower(contentType)
	if !strings.HasPrefix(lowered, "image/") {
		return ""
	}
	return baseURL.String()
}
|
|
|
|
func storeImages(ctx context.Context, db *sql.DB, itemID int64, meta Metadata) error {
|
|
if meta.ImageURL == "" {
|
|
return nil
|
|
}
|
|
|
|
resp, err := fetchURL(ctx, meta.ImageURL)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
payload, err := io.ReadAll(io.LimitReader(resp.Body, 16<<20))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
if contentType == "" {
|
|
contentType = mime.TypeByExtension(strings.ToLower(path.Ext(resp.Request.URL.Path)))
|
|
}
|
|
|
|
width, height, thumbBytes, thumbHeight, err := createThumb(payload)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
_, err = image.QCreate(ctx, db, itemID, meta.ImageURL, contentType, payload, width, height, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if thumbBytes != nil {
|
|
_, err = image.QCreate(ctx, db, itemID, meta.ImageURL, thumbContentType(contentType), thumbBytes, thumbWidth, thumbHeight, true)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func createThumb(payload []byte) (int, int, []byte, int, error) {
|
|
img, err := imaging.Decode(bytes.NewReader(payload))
|
|
if err != nil {
|
|
return 0, 0, nil, 0, nil
|
|
}
|
|
bounds := img.Bounds()
|
|
width := bounds.Dx()
|
|
height := bounds.Dy()
|
|
|
|
if width <= thumbWidth {
|
|
return width, height, payload, height, nil
|
|
}
|
|
|
|
thumb := imaging.Resize(img, thumbWidth, 0, imaging.Lanczos)
|
|
buf := new(bytes.Buffer)
|
|
if err := imaging.Encode(buf, thumb, imaging.JPEG); err != nil {
|
|
return width, height, nil, 0, err
|
|
}
|
|
return width, height, buf.Bytes(), thumb.Bounds().Dy(), nil
|
|
}
|
|
|
|
// thumbContentType reports the MIME type recorded for thumbnails. The source
// image's content type is accepted but ignored: the result is always
// "image/jpeg", matching the JPEG encoding createThumb uses when it resizes.
func thumbContentType(_ string) string {
	const jpegMIME = "image/jpeg"
	return jpegMIME
}
|