tag analyse v1

This commit is contained in:
Flo 2024-08-10 17:07:40 +00:00
parent 9e3e2ce15d
commit 7c62023e6f
8 changed files with 182 additions and 7 deletions

View File

@ -21,7 +21,8 @@
"teewurst\/pipeline": "^3.0", "teewurst\/pipeline": "^3.0",
"guzzlehttp\/guzzle": "^7.8", "guzzlehttp\/guzzle": "^7.8",
"micilini\/video-stream": "^1.0", "micilini\/video-stream": "^1.0",
"nesbot\/carbon": "^3.0" "nesbot\/carbon": "^3.0",
"ext-iconv": "*"
}, },
"autoload": { "autoload": {
"psr-4": { "psr-4": {

View File

@ -21,7 +21,8 @@
"teewurst/pipeline": "^3.0", "teewurst/pipeline": "^3.0",
"guzzlehttp/guzzle": "^7.8", "guzzlehttp/guzzle": "^7.8",
"micilini/video-stream": "^1.0", "micilini/video-stream": "^1.0",
"nesbot/carbon": "^3.0" "nesbot/carbon": "^3.0",
"ext-iconv": "*"
}, },
"autoload": { "autoload": {
"psr-4": { "psr-4": {

View File

@ -1,5 +1,6 @@
<?php <?php
use MyTube\API\Console\Command\AnalyzeTagsCommand;
use MyTube\API\Console\Command\AnalyzeVideoTitlesCommand; use MyTube\API\Console\Command\AnalyzeVideoTitlesCommand;
use MyTube\API\Console\Command\InitializeDataCommand; use MyTube\API\Console\Command\InitializeDataCommand;
use MyTube\API\Console\Command\RbacUpdateCommand; use MyTube\API\Console\Command\RbacUpdateCommand;
@ -11,5 +12,6 @@ return [
RbacUpdateCommand::class, RbacUpdateCommand::class,
AnalyzeVideoTitlesCommand::class, AnalyzeVideoTitlesCommand::class,
ReadUntaggedVideosCommand::class, ReadUntaggedVideosCommand::class,
AnalyzeTagsCommand::class,
] ]
]; ];

View File

@ -1,5 +1,6 @@
<?php <?php
use MyTube\API\Console\Command\AnalyzeTagsCommand;
use MyTube\API\Console\Command\AnalyzeVideoTitlesCommand; use MyTube\API\Console\Command\AnalyzeVideoTitlesCommand;
use MyTube\API\Console\Command\InitializeDataCommand; use MyTube\API\Console\Command\InitializeDataCommand;
use MyTube\API\Console\Command\RbacUpdateCommand; use MyTube\API\Console\Command\RbacUpdateCommand;
@ -11,6 +12,7 @@ return [
InitializeDataCommand::class => AutoWiringFactory::class, InitializeDataCommand::class => AutoWiringFactory::class,
RbacUpdateCommand::class => AutoWiringFactory::class, RbacUpdateCommand::class => AutoWiringFactory::class,
AnalyzeVideoTitlesCommand::class => AutoWiringFactory::class, AnalyzeVideoTitlesCommand::class => AutoWiringFactory::class,
AnalyzeTagsCommand::class => AutoWiringFactory::class,
ReadUntaggedVideosCommand::class => AutoWiringFactory::class, ReadUntaggedVideosCommand::class => AutoWiringFactory::class,
], ],
]; ];

View File

@ -0,0 +1,133 @@
<?php
namespace MyTube\API\Console\Command;
use MyTube\Data\Business\Entity\Tag;
use MyTube\Data\Business\Entity\Video;
use MyTube\Data\Business\Manager\MyTubeEntityManager;
use MyTube\Data\Business\Repository\VideoRepository;
use MyTube\Handling\Tag\Rule\IsTagSubstringRule;
use MyTube\Handling\Video\Analyzer\VideoDurationAnalyzer;
use MyTube\Handling\Video\Analyzer\VideoTitleAnalyzer;
use MyTube\Infrastructure\Logging\Logger\Logger;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use function Webmozart\Assert\Tests\StaticAnalysis\length;
/**
 * Console command that counts how often each word occurs across all video
 * titles and prints the frequent words that no existing tag covers yet.
 *
 * Processing steps:
 *  1. load every video and collect its title,
 *  2. normalize each title (lower-case, ASCII transliteration, letters only),
 *  3. count the words of all normalized titles,
 *  4. map every word through {@see correct_typo()} and merge the counts,
 *  5. print "word: count" for words that occur more than
 *     MIN_COUNT_EXCLUSIVE times and for which IsTagSubstringRule::appliesTo()
 *     returns false.
 */
#[AsCommand(name: 'analyze:tags', description: 'Analyzes video titles and add tags')]
class AnalyzeTagsCommand extends Command
{
    /** A word is only reported when its count is strictly greater than this value. */
    private const MIN_COUNT_EXCLUSIVE = 3;

    /** Largest Levenshtein distance at which two words are treated as the same word. */
    private const MAX_TYPO_DISTANCE = 2;

    private readonly VideoRepository $videoRepository;

    public function __construct(
        private readonly MyTubeEntityManager $entityManager,
        private readonly IsTagSubstringRule $isTagSubstringRule,
        private readonly Logger $logger,
    ) {
        // The command name is taken from the #[AsCommand] attribute; calling
        // $this->getName() before the parent constructor ran only ever yielded null.
        parent::__construct();
        $this->videoRepository = $this->entityManager->getRepository(Video::class);
    }

    /**
     * Runs the analysis and writes the result to the console.
     *
     * @return int Command::SUCCESS, or Command::FAILURE when any Throwable was
     *             raised (the error is printed and logged).
     */
    protected function execute(
        InputInterface $input,
        OutputInterface $output
    ): int {
        $io = new SymfonyStyle($input, $output);

        try {
            // Start from an empty list so that a database without videos
            // produces an empty report instead of a TypeError in array_map().
            $titles = [];
            /** @var Video $video */
            foreach ($this->videoRepository->findAll() as $video) {
                $titles[] = $video->getTitle();
            }

            $wordCounts = $this->countWords(array_map($this->normalize(...), $titles));
            $correctedWordCounts = $this->consolidateTypos($wordCounts);

            // Sort descending, then reverse: the list is printed in ascending
            // order, so the most frequent words end up at the bottom of the
            // terminal output. array_reverse() keeps string keys intact.
            arsort($correctedWordCounts);
            $correctedWordCounts = array_reverse($correctedWordCounts);

            foreach ($correctedWordCounts as $word => $count) {
                if ($count > self::MIN_COUNT_EXCLUSIVE && !$this->isTagSubstringRule->appliesTo($word)) {
                    // Go through the console output instead of echo so that
                    // verbosity flags and output capturing are respected.
                    $io->writeln($word . ': ' . $count);
                }
            }

            $io->success('OK!');
        } catch (\Throwable $e) {
            $io->error($e->getMessage());
            $io->error($e->getTraceAsString());
            $this->logger->error($e->getMessage(), ['exception' => $e]);

            return Command::FAILURE;
        }

        return Command::SUCCESS;
    }

    /**
     * Reduces a text to lower-case ASCII letters and whitespace.
     *
     * @param mixed $text Text to normalize; it is cast to string, so null
     *                    becomes an empty string.
     *
     * @return string The trimmed text containing only the characters a-z and
     *                whitespace.
     */
    public function normalize($text): string
    {
        $lowered = mb_strtolower((string) $text);

        // Transliterate accented characters to their ASCII counterparts.
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $lowered);
        if ($ascii === false) {
            // Conversion failed: keep the lower-cased text; the pattern below
            // still strips every byte that is not a-z or whitespace.
            $ascii = $lowered;
        }

        // Drop punctuation, digits and everything else that is not a letter.
        return trim((string) preg_replace('/[^a-z\s]/', '', $ascii));
    }

    /**
     * Returns the dictionary entry that is closest to the given word.
     *
     * An exact match ends the search immediately and returns the word itself.
     * Otherwise the entry with the smallest Levenshtein distance is returned,
     * provided that distance is at most MAX_TYPO_DISTANCE; if no entry
     * qualifies the word is returned unchanged.
     *
     * NOTE(review): execute() builds the dictionary from the very words it
     * passes in, so every lookup reaches the exact-match branch and the word
     * is returned as-is. Confirm whether consolidation against a separate,
     * curated dictionary (or against the most frequent spelling) was intended.
     *
     * @param mixed $word       Word to look up.
     * @param mixed $dictionary Iterable list of known words.
     *
     * @return string The chosen word.
     */
    public function correct_typo($word, $dictionary): string
    {
        $closestWord = $word;
        $shortestDistance = -1; // -1 means "no candidate found yet"

        foreach ($dictionary as $candidate) {
            $distance = levenshtein($word, $candidate);

            if ($distance === 0) {
                $closestWord = $word;
                break;
            }

            $isCloser = $shortestDistance < 0 || $distance < $shortestDistance;
            if ($distance <= self::MAX_TYPO_DISTANCE && $isCloser) {
                $closestWord = $candidate;
                $shortestDistance = $distance;
            }
        }

        return (string) $closestWord;
    }

    /**
     * Counts the words of all given normalized texts.
     *
     * @param array $normalizedTexts Texts as produced by normalize().
     *
     * @return array Map of word => number of occurrences.
     */
    private function countWords(array $normalizedTexts): array
    {
        $counts = [];

        foreach ($normalizedTexts as $text) {
            // normalize() keeps every whitespace character, so split on any
            // whitespace run rather than on a single space only.
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            foreach ($words as $word) {
                $counts[$word] = ($counts[$word] ?? 0) + 1;
            }
        }

        return $counts;
    }

    /**
     * Maps every counted word through correct_typo() and sums the counts of
     * words that resolve to the same result.
     *
     * @param array $wordCounts Map of word => number of occurrences.
     *
     * @return array Map of resolved word => summed number of occurrences.
     */
    private function consolidateTypos(array $wordCounts): array
    {
        $dictionary = array_keys($wordCounts);
        $consolidated = [];

        foreach ($wordCounts as $word => $count) {
            $resolved = $this->correct_typo($word, $dictionary);
            $consolidated[$resolved] = ($consolidated[$resolved] ?? 0) + $count;
        }

        return $consolidated;
    }
}

View File

@ -10,17 +10,17 @@ use MyTube\Data\Business\Manager\MyTubeEntityManager;
class AnalyzeVideoRepository class AnalyzeVideoRepository
{ {
public function __construct(
private readonly MyTubeEntityManager $entityManager
) {
}
private const FIELD_MAP = [ private const FIELD_MAP = [
'duration' => 'v.duration', 'duration' => 'v.duration',
'title' => 'v.title', 'title' => 'v.title',
'createdAt' => 'v.createdAt' 'createdAt' => 'v.createdAt'
]; ];
public function __construct(
private readonly MyTubeEntityManager $entityManager
) {
}
public function findByFilter( public function findByFilter(
?string $query, ?string $query,
int $page, int $page,

View File

@ -12,6 +12,7 @@ use MyTube\Handling\Tag\Handler\Query\ReadThumbnail\ReadThumbnailQueryBuilder;
use MyTube\Handling\Tag\Handler\Query\ReadThumbnail\ReadThumbnailQueryHandler; use MyTube\Handling\Tag\Handler\Query\ReadThumbnail\ReadThumbnailQueryHandler;
use MyTube\Handling\Tag\Handler\Query\ReadVideoList\ReadVideoListQueryBuilder; use MyTube\Handling\Tag\Handler\Query\ReadVideoList\ReadVideoListQueryBuilder;
use MyTube\Handling\Tag\Handler\Query\ReadVideoList\ReadVideoListQueryHandler; use MyTube\Handling\Tag\Handler\Query\ReadVideoList\ReadVideoListQueryHandler;
use MyTube\Handling\Tag\Rule\IsTagSubstringRule;
use MyTube\Handling\Tag\Rule\TagAliasExistsRule; use MyTube\Handling\Tag\Rule\TagAliasExistsRule;
use MyTube\Handling\Tag\Rule\TagExistsRule; use MyTube\Handling\Tag\Rule\TagExistsRule;
use Reinfi\DependencyInjection\Factory\AutoWiringFactory; use Reinfi\DependencyInjection\Factory\AutoWiringFactory;
@ -24,6 +25,7 @@ return [
/// Rule /// Rule
TagExistsRule::class => InjectionFactory::class, TagExistsRule::class => InjectionFactory::class,
TagAliasExistsRule::class => InjectionFactory::class, TagAliasExistsRule::class => InjectionFactory::class,
IsTagSubstringRule::class => InjectionFactory::class,
/// Builder /// Builder
TagBuilder::class => AutoWiringFactory::class, TagBuilder::class => AutoWiringFactory::class,

View File

@ -0,0 +1,34 @@
<?php
namespace MyTube\Handling\Tag\Rule;
use MyTube\Data\Business\Repository\TagRepository;
use Reinfi\DependencyInjection\Annotation\InjectDoctrineRepository;
/**
 * Rule that tells whether a text fragment occurs inside the description of
 * at least one stored tag.
 */
class IsTagSubstringRule
{
    /**
     * @InjectDoctrineRepository(
     *     entityManager="MyTube\Data\Business\Manager\MyTubeEntityManager",
     *     entity="MyTube\Data\Business\Entity\Tag"
     * )
     */
    public function __construct(
        private readonly TagRepository $tagRepository,
    ) {
    }

    /**
     * Checks whether any tag has a description that matches "%<substring>%"
     * in a LIKE comparison.
     *
     * The value is bound as a query parameter, but LIKE wildcards ("%", "_")
     * inside $substring are not escaped and therefore act as wildcards; an
     * empty string yields the pattern "%%".
     *
     * @param string $substring Text fragment to search for.
     *
     * @return bool True when at least one tag matches, false otherwise.
     */
    public function appliesTo(
        string $substring,
    ): bool {
        $query = $this->tagRepository->createQueryBuilder('t')
            ->where('t.description like :substring')
            ->setParameter('substring', '%' . $substring . '%')
            // Only existence matters, so fetch at most one row instead of
            // hydrating every matching tag.
            ->setMaxResults(1)
            ->getQuery();

        return count($query->getResult()) !== 0;
    }
}